blob: e4539cd46a8d6f090bee4dc3f79ada5ec1820b72 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Limit for the Unicode object free list: unicode_dealloc() stops parking
   objects on the free list once it holds this many entries. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length (counted in Py_UNICODE
   elements, see unicode_dealloc()) less than this limit. This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 when the character is
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Same for linebreaks: indexed by code point 0..127, an entry is 1 when
   the character is a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of mask bits used: the largest of 128, 64 or
   32 that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be a modifiable lvalue).
   BLOOM yields non-zero when the bit for ch is set; a set bit means "ch
   may be in the set", a clear bit means "ch is definitely not". */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup below 128, otherwise the bloom mask is
   used as a cheap pre-filter before Py_UNICODE_ISLINEBREAK().
   Note: ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Membership test with bloom pre-filter: the exact (linear) check in
   unicode_member() only runs when the mask says chr may be present.
   The expansion is fully parenthesized so the macro can be negated or
   combined with other operators safely.  chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
309/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000310 Ux0000 terminated; some code (e.g. new_identifier)
311 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
/* Defined only when Py_UNICODE is 2 bytes wide while wchar_t is 4 bytes:
   wchar_t values above 0xFFFF must then be split into surrogate pairs. */
#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
# define CONVERT_WCHAR_TO_SURROGATES
#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
/* Write a printf-style conversion specification into fmt:

       '%' ['0'] [width] ['.' precision] [length modifier] c  '\0'

   width and precision are emitted only when non-zero, and the '0' flag
   only when a width is emitted.  The length modifier is "l" for longflag,
   PY_FORMAT_LONG_LONG for longlongflag, PY_FORMAT_SIZE_T for size_tflag
   (checked in that order; at most one is used).

   No bounds checking is done: the caller must supply a buffer large
   enough for the resulting string. */
static void
makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
        int zeropad, int width, int precision, char c)
{
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
    else if (longlongflag) {
        /* longlongflag should only ever be nonzero on machines with
           HAVE_LONG_LONG defined */
#ifdef HAVE_LONG_LONG
        const char *f = PY_FORMAT_LONG_LONG;
        while (*f)
            *fmt++ = *f++;
#else
        /* we shouldn't ever get here */
        assert(0);
        *fmt++ = 'l';
#endif
    }
    else if (size_tflag) {
        const char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
}
716
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows the '%'.

   `f` must point at the '%'.  The optional width, ".precision" and length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z") are parsed.
   A length modifier is only recognised when it is directly followed by
   'd', 'u' or 'i'.  Each output pointer may be NULL, in which case the
   corresponding value is not stored.

   Return a pointer to the character the caller should dispatch on
   (normally the conversion character). */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    ++f;    /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width*10) + *f - '0';
    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    /* Handle %ld, %lu, %li, %lld, %llu, %lli and the size_t variants
       %zd, %zu, %zi.  f is left on the conversion character. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        is_size_t = 1;
        f += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
781
Walter Dörwaldd2034312007-05-18 16:29:38 +0000782#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
783
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000784/* size of fixed-size buffer for formatting single arguments */
785#define ITEM_BUFFER_LEN 21
786/* maximum number of characters required for output of %ld. 21 characters
787 allows for 64-bit integers (in decimal) and an optional sign. */
788#define MAX_LONG_CHARS 21
789/* maximum number of characters required for output of %lld.
790 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
791 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
792#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 va_list count;
798 Py_ssize_t callcount = 0;
799 PyObject **callresults = NULL;
800 PyObject **callresult = NULL;
801 Py_ssize_t n = 0;
802 int width = 0;
803 int precision = 0;
804 int zeropad;
805 const char* f;
806 Py_UNICODE *s;
807 PyObject *string;
808 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000809 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* use abuffer instead of buffer, if we need more space
811 * (which can happen if there's a format specifier with width). */
812 char *abuffer = NULL;
813 char *realbuffer;
814 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000818 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000819 /* step 1: count the number of %S/%R/%A/%s format specifications
820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000824 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +0000825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000828 ++callcount;
829 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000830 else if (128 <= (unsigned char)*f) {
831 PyErr_Format(PyExc_ValueError,
832 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000833 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000834 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000835 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000837 }
838 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (callcount) {
841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842 if (!callresults) {
843 PyErr_NoMemory();
844 return NULL;
845 }
846 callresult = callresults;
847 }
848 /* step 3: figure out how large a buffer we need */
849 for (f = format; *f; f++) {
850 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000851#ifdef HAVE_LONG_LONG
Victor Stinner96865452011-03-01 23:44:09 +0000852 int longlongflag;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000853#endif
Victor Stinner96865452011-03-01 23:44:09 +0000854 const char* p;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855
Victor Stinner96865452011-03-01 23:44:09 +0000856 p = f;
857 f = parse_format_flags(f, &width, NULL,
858 NULL, &longlongflag, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859
Benjamin Peterson14339b62009-01-31 16:36:08 +0000860 switch (*f) {
861 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000862 {
863#ifndef Py_UNICODE_WIDE
864 int ordinal = va_arg(count, int);
865 if (ordinal > 0xffff)
866 n += 2;
867 else
868 n++;
869#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000870 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000871 n++;
872#endif
873 break;
874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000875 case '%':
876 n++;
877 break;
878 case 'd': case 'u': case 'i': case 'x':
879 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000880#ifdef HAVE_LONG_LONG
881 if (longlongflag) {
882 if (width < MAX_LONG_LONG_CHARS)
883 width = MAX_LONG_LONG_CHARS;
884 }
885 else
886#endif
887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888 including sign. Decimal takes the most space. This
889 isn't enough for octal. If a width is specified we
890 need more (which we allocate later). */
891 if (width < MAX_LONG_CHARS)
892 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000893 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000894 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000895 if (abuffersize < width)
896 abuffersize = width;
897 break;
898 case 's':
899 {
900 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000901 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903 if (!str)
904 goto fail;
905 n += PyUnicode_GET_SIZE(str);
906 /* Remember the str and switch to the next slot */
907 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000908 break;
909 }
910 case 'U':
911 {
912 PyObject *obj = va_arg(count, PyObject *);
913 assert(obj && PyUnicode_Check(obj));
914 n += PyUnicode_GET_SIZE(obj);
915 break;
916 }
917 case 'V':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000921 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000922 assert(obj || str);
923 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000924 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000925 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000926 *callresult++ = NULL;
927 }
928 else {
929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930 if (!str_obj)
931 goto fail;
932 n += PyUnicode_GET_SIZE(str_obj);
933 *callresult++ = str_obj;
934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000935 break;
936 }
937 case 'S':
938 {
939 PyObject *obj = va_arg(count, PyObject *);
940 PyObject *str;
941 assert(obj);
942 str = PyObject_Str(obj);
943 if (!str)
944 goto fail;
945 n += PyUnicode_GET_SIZE(str);
946 /* Remember the str and switch to the next slot */
947 *callresult++ = str;
948 break;
949 }
950 case 'R':
951 {
952 PyObject *obj = va_arg(count, PyObject *);
953 PyObject *repr;
954 assert(obj);
955 repr = PyObject_Repr(obj);
956 if (!repr)
957 goto fail;
958 n += PyUnicode_GET_SIZE(repr);
959 /* Remember the repr and switch to the next slot */
960 *callresult++ = repr;
961 break;
962 }
963 case 'A':
964 {
965 PyObject *obj = va_arg(count, PyObject *);
966 PyObject *ascii;
967 assert(obj);
968 ascii = PyObject_ASCII(obj);
969 if (!ascii)
970 goto fail;
971 n += PyUnicode_GET_SIZE(ascii);
972 /* Remember the repr and switch to the next slot */
973 *callresult++ = ascii;
974 break;
975 }
976 case 'p':
977 (void) va_arg(count, int);
978 /* maximum 64-bit pointer representation:
979 * 0xffffffffffffffff
980 * so 19 characters is enough.
981 * XXX I count 18 -- what's the extra for?
982 */
983 n += 19;
984 break;
985 default:
986 /* if we stumble upon an unknown
987 formatting code, copy the rest of
988 the format string to the output
989 string. (we cannot just skip the
990 code, since there's no way to know
991 what's in the argument list) */
992 n += strlen(p);
993 goto expand;
994 }
995 } else
996 n++;
997 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000998 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999 if (abuffersize > ITEM_BUFFER_LEN) {
1000 /* add 1 for sprintf's trailing null byte */
1001 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (!abuffer) {
1003 PyErr_NoMemory();
1004 goto fail;
1005 }
1006 realbuffer = abuffer;
1007 }
1008 else
1009 realbuffer = buffer;
1010 /* step 4: fill the buffer */
1011 /* Since we've analyzed how much space we need for the worst case,
1012 we don't have to resize the string.
1013 There can be no errors beyond this point. */
1014 string = PyUnicode_FromUnicode(NULL, n);
1015 if (!string)
1016 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001017
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 s = PyUnicode_AS_UNICODE(string);
1019 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 for (f = format; *f; f++) {
1022 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001023 const char* p;
1024 int longflag;
1025 int longlongflag;
1026 int size_tflag;
1027
1028 p = f;
1029 zeropad = (f[1] == '0');
1030 f = parse_format_flags(f, &width, &precision,
1031 &longflag, &longlongflag, &size_tflag);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001032
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 switch (*f) {
1034 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001035 {
1036 int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 *s++ = 0xD800 | (ordinal >> 10);
1041 *s++ = 0xDC00 | (ordinal & 0x3FF);
1042 } else
1043#endif
1044 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001046 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001047 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001048 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
Victor Stinner6d970f42011-03-02 00:04:25 +00001050 width, precision, *f);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 if (longflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001053#ifdef HAVE_LONG_LONG
1054 else if (longlongflag)
1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001057 else if (size_tflag)
1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059 else
1060 sprintf(realbuffer, fmt, va_arg(vargs, int));
1061 appendstring(realbuffer);
1062 break;
1063 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1065 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001066 if (longflag)
1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001068#ifdef HAVE_LONG_LONG
1069 else if (longlongflag)
1070 sprintf(realbuffer, fmt, va_arg(vargs,
1071 unsigned PY_LONG_LONG));
1072#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 else if (size_tflag)
1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075 else
1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077 appendstring(realbuffer);
1078 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001081 sprintf(realbuffer, fmt, va_arg(vargs, int));
1082 appendstring(realbuffer);
1083 break;
1084 case 's':
1085 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001086 /* unused, since we already have the result */
1087 (void) va_arg(vargs, char *);
1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089 PyUnicode_GET_SIZE(*callresult));
1090 s += PyUnicode_GET_SIZE(*callresult);
1091 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */
1094 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001095 break;
1096 }
1097 case 'U':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102 s += size;
1103 break;
1104 }
1105 case 'V':
1106 {
1107 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001108 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001109 if (obj) {
1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112 s += size;
1113 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1115 PyUnicode_GET_SIZE(*callresult));
1116 s += PyUnicode_GET_SIZE(*callresult);
1117 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001119 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001120 break;
1121 }
1122 case 'S':
1123 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001124 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 {
1126 Py_UNICODE *ucopy;
1127 Py_ssize_t usize;
1128 Py_ssize_t upos;
1129 /* unused, since we already have the result */
1130 (void) va_arg(vargs, PyObject *);
1131 ucopy = PyUnicode_AS_UNICODE(*callresult);
1132 usize = PyUnicode_GET_SIZE(*callresult);
1133 for (upos = 0; upos<usize;)
1134 *s++ = ucopy[upos++];
1135 /* We're done with the unicode()/repr() => forget it */
1136 Py_DECREF(*callresult);
1137 /* switch to next unicode()/repr() result */
1138 ++callresult;
1139 break;
1140 }
1141 case 'p':
1142 sprintf(buffer, "%p", va_arg(vargs, void*));
1143 /* %p is ill-defined: ensure leading 0x. */
1144 if (buffer[1] == 'X')
1145 buffer[1] = 'x';
1146 else if (buffer[1] != 'x') {
1147 memmove(buffer+2, buffer, strlen(buffer)+1);
1148 buffer[0] = '0';
1149 buffer[1] = 'x';
1150 }
1151 appendstring(buffer);
1152 break;
1153 case '%':
1154 *s++ = '%';
1155 break;
1156 default:
1157 appendstring(p);
1158 goto end;
1159 }
Victor Stinner1205f272010-09-11 00:54:47 +00001160 }
Victor Stinner1205f272010-09-11 00:54:47 +00001161 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 *s++ = *f;
1163 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
Benjamin Peterson29060642009-01-31 22:14:21 +00001165 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 if (callresults)
1167 PyObject_Free(callresults);
1168 if (abuffer)
1169 PyObject_Free(abuffer);
1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001173 if (callresults) {
1174 PyObject **callresult2 = callresults;
1175 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001176 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001177 ++callresult2;
1178 }
1179 PyObject_Free(callresults);
1180 }
1181 if (abuffer)
1182 PyObject_Free(abuffer);
1183 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001184}
1185
1186#undef appendstring
1187
1188PyObject *
1189PyUnicode_FromFormat(const char *format, ...)
1190{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 PyObject* ret;
1192 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001193
1194#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001195 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001196#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001197 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001198#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001199 ret = PyUnicode_FromFormatV(format, vargs);
1200 va_end(vargs);
1201 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001202}
1203
Victor Stinner5593d8a2010-10-02 11:11:27 +00001204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1205 convert a Unicode object to a wide character string.
1206
1207 - If w is NULL: return the number of wide characters (including the nul
1208 character) required to convert the unicode object. Ignore size argument.
1209
1210 - Otherwise: return the number of wide characters (excluding the nul
1211 character) written into w. Write at most size wide characters (including
1212 the nul character). */
1213static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001214unicode_aswidechar(PyUnicodeObject *unicode,
1215 wchar_t *w,
1216 Py_ssize_t size)
1217{
1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001219 Py_ssize_t res;
1220 if (w != NULL) {
1221 res = PyUnicode_GET_SIZE(unicode);
1222 if (size > res)
1223 size = res + 1;
1224 else
1225 res = size;
1226 memcpy(w, unicode->str, size * sizeof(wchar_t));
1227 return res;
1228 }
1229 else
1230 return PyUnicode_GET_SIZE(unicode) + 1;
1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232 register const Py_UNICODE *u;
1233 const Py_UNICODE *uend;
1234 const wchar_t *worig, *wend;
1235 Py_ssize_t nchar;
1236
Victor Stinner137c34c2010-09-29 10:25:54 +00001237 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001238 uend = u + PyUnicode_GET_SIZE(unicode);
1239 if (w != NULL) {
1240 worig = w;
1241 wend = w + size;
1242 while (u != uend && w != wend) {
1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245 {
1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247 u += 2;
1248 }
1249 else {
1250 *w = *u;
1251 u++;
1252 }
1253 w++;
1254 }
1255 if (w != wend)
1256 *w = L'\0';
1257 return w - worig;
1258 }
1259 else {
1260 nchar = 1; /* nul character at the end */
1261 while (u != uend) {
1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264 u += 2;
1265 else
1266 u++;
1267 nchar++;
1268 }
1269 }
1270 return nchar;
1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272 register Py_UNICODE *u, *uend, ordinal;
1273 register Py_ssize_t i;
1274 wchar_t *worig, *wend;
1275 Py_ssize_t nchar;
1276
1277 u = PyUnicode_AS_UNICODE(unicode);
1278 uend = u + PyUnicode_GET_SIZE(u);
1279 if (w != NULL) {
1280 worig = w;
1281 wend = w + size;
1282 while (u != uend && w != wend) {
1283 ordinal = *u;
1284 if (ordinal > 0xffff) {
1285 ordinal -= 0x10000;
1286 *w++ = 0xD800 | (ordinal >> 10);
1287 *w++ = 0xDC00 | (ordinal & 0x3FF);
1288 }
1289 else
1290 *w++ = ordinal;
1291 u++;
1292 }
1293 if (w != wend)
1294 *w = 0;
1295 return w - worig;
1296 }
1297 else {
1298 nchar = 1; /* nul character */
1299 while (u != uend) {
1300 if (*u > 0xffff)
1301 nchar += 2;
1302 else
1303 nchar++;
1304 u++;
1305 }
1306 return nchar;
1307 }
1308#else
1309# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001310#endif
1311}
1312
1313Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001315 wchar_t *w,
1316 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001319 PyErr_BadInternalCall();
1320 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323}
1324
Victor Stinner137c34c2010-09-29 10:25:54 +00001325wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001326PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001327 Py_ssize_t *size)
1328{
1329 wchar_t* buffer;
1330 Py_ssize_t buflen;
1331
1332 if (unicode == NULL) {
1333 PyErr_BadInternalCall();
1334 return NULL;
1335 }
1336
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001339 PyErr_NoMemory();
1340 return NULL;
1341 }
1342
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344 if (buffer == NULL) {
1345 PyErr_NoMemory();
1346 return NULL;
1347 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001349 if (size != NULL)
1350 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001351 return buffer;
1352}
1353
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354#endif
1355
Alexander Belopolsky40018472011-02-26 01:02:56 +00001356PyObject *
1357PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001358{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001359 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001361 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 PyErr_SetString(PyExc_ValueError,
1363 "chr() arg not in range(0x110000)");
1364 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001365 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001366
1367#ifndef Py_UNICODE_WIDE
1368 if (ordinal > 0xffff) {
1369 ordinal -= 0x10000;
1370 s[0] = 0xD800 | (ordinal >> 10);
1371 s[1] = 0xDC00 | (ordinal & 0x3FF);
1372 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001373 }
1374#endif
1375
Hye-Shik Chang40574832004-04-06 07:24:51 +00001376 s[0] = (Py_UNICODE)ordinal;
1377 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001378}
1379
Alexander Belopolsky40018472011-02-26 01:02:56 +00001380PyObject *
1381PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001383 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001384 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001385 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001386 Py_INCREF(obj);
1387 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001388 }
1389 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 /* For a Unicode subtype that's not a Unicode object,
1391 return a true Unicode object with the same data. */
1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001394 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001395 PyErr_Format(PyExc_TypeError,
1396 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001397 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001398 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001399}
1400
Alexander Belopolsky40018472011-02-26 01:02:56 +00001401PyObject *
1402PyUnicode_FromEncodedObject(register PyObject *obj,
1403 const char *encoding,
1404 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001405{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001406 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001407 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001408
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 PyErr_BadInternalCall();
1411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Decoding bytes objects is the most common case and should be fast */
1415 if (PyBytes_Check(obj)) {
1416 if (PyBytes_GET_SIZE(obj) == 0) {
1417 Py_INCREF(unicode_empty);
1418 v = (PyObject *) unicode_empty;
1419 }
1420 else {
1421 v = PyUnicode_Decode(
1422 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1423 encoding, errors);
1424 }
1425 return v;
1426 }
1427
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001428 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 PyErr_SetString(PyExc_TypeError,
1430 "decoding str is not supported");
1431 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001432 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001433
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1436 PyErr_Format(PyExc_TypeError,
1437 "coercing to str: need bytes, bytearray "
1438 "or buffer-like object, %.80s found",
1439 Py_TYPE(obj)->tp_name);
1440 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001441 }
Tim Petersced69f82003-09-16 20:30:58 +00001442
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001443 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001445 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 }
Tim Petersced69f82003-09-16 20:30:58 +00001447 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001449
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001450 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
/* Copy `encoding` into `lower`, converting upper case letters to lower
   case and replacing '_' with '-', in order to catch e.g. UTF_8.

   `lower_len` is the size of the `lower` buffer, including the room for
   the terminating nul character.

   Return 1 on success.  Return 0 if `encoding` is longer than
   lower_len-1; `lower` is not nul-terminated in that case. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *const dst_last = &lower[lower_len - 1];   /* slot kept for nul */

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == dst_last)
            return 0;

        ch = *src;
        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1486
Alexander Belopolsky40018472011-02-26 01:02:56 +00001487PyObject *
1488PyUnicode_Decode(const char *s,
1489 Py_ssize_t size,
1490 const char *encoding,
1491 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001492{
1493 PyObject *buffer = NULL, *unicode;
1494 Py_buffer info;
1495 char lower[11]; /* Enough for any encoding shortcut */
1496
1497 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001499
1500 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001501 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001502 if ((strcmp(lower, "utf-8") == 0) ||
1503 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001504 return PyUnicode_DecodeUTF8(s, size, errors);
1505 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001506 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001507 (strcmp(lower, "iso-8859-1") == 0))
1508 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001509#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001510 else if (strcmp(lower, "mbcs") == 0)
1511 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "ascii") == 0)
1514 return PyUnicode_DecodeASCII(s, size, errors);
1515 else if (strcmp(lower, "utf-16") == 0)
1516 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517 else if (strcmp(lower, "utf-32") == 0)
1518 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520
1521 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001522 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001524 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001525 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 if (buffer == NULL)
1527 goto onError;
1528 unicode = PyCodec_Decode(buffer, encoding, errors);
1529 if (unicode == NULL)
1530 goto onError;
1531 if (!PyUnicode_Check(unicode)) {
1532 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001533 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001534 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 Py_DECREF(unicode);
1536 goto onError;
1537 }
1538 Py_DECREF(buffer);
1539 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001540
Benjamin Peterson29060642009-01-31 22:14:21 +00001541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 Py_XDECREF(buffer);
1543 return NULL;
1544}
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546PyObject *
1547PyUnicode_AsDecodedObject(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 return v;
1566
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001568 return NULL;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571PyObject *
1572PyUnicode_AsDecodedUnicode(PyObject *unicode,
1573 const char *encoding,
1574 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575{
1576 PyObject *v;
1577
1578 if (!PyUnicode_Check(unicode)) {
1579 PyErr_BadArgument();
1580 goto onError;
1581 }
1582
1583 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001585
1586 /* Decode via the codec registry */
1587 v = PyCodec_Decode(unicode, encoding, errors);
1588 if (v == NULL)
1589 goto onError;
1590 if (!PyUnicode_Check(v)) {
1591 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001592 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001593 Py_TYPE(v)->tp_name);
1594 Py_DECREF(v);
1595 goto onError;
1596 }
1597 return v;
1598
Benjamin Peterson29060642009-01-31 22:14:21 +00001599 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001600 return NULL;
1601}
1602
Alexander Belopolsky40018472011-02-26 01:02:56 +00001603PyObject *
1604PyUnicode_Encode(const Py_UNICODE *s,
1605 Py_ssize_t size,
1606 const char *encoding,
1607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608{
1609 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001610
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 unicode = PyUnicode_FromUnicode(s, size);
1612 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615 Py_DECREF(unicode);
1616 return v;
1617}
1618
Alexander Belopolsky40018472011-02-26 01:02:56 +00001619PyObject *
1620PyUnicode_AsEncodedObject(PyObject *unicode,
1621 const char *encoding,
1622 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001623{
1624 PyObject *v;
1625
1626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
1628 goto onError;
1629 }
1630
1631 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001633
1634 /* Encode via the codec registry */
1635 v = PyCodec_Encode(unicode, encoding, errors);
1636 if (v == NULL)
1637 goto onError;
1638 return v;
1639
Benjamin Peterson29060642009-01-31 22:14:21 +00001640 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001641 return NULL;
1642}
1643
Victor Stinnerad158722010-10-27 00:25:46 +00001644PyObject *
1645PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001646{
Victor Stinner313a1202010-06-11 23:56:51 +00001647#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 NULL);
1651#elif defined(__APPLE__)
1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 "surrogateescape");
1655#else
1656 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001657 return PyUnicode_AsEncodedString(unicode,
1658 Py_FileSystemDefaultEncoding,
1659 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001660 }
1661 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001662 /* locale encoding with surrogateescape */
1663 wchar_t *wchar;
1664 char *bytes;
1665 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001666 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001667
1668 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1669 if (wchar == NULL)
1670 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001671 bytes = _Py_wchar2char(wchar, &error_pos);
1672 if (bytes == NULL) {
1673 if (error_pos != (size_t)-1) {
1674 char *errmsg = strerror(errno);
1675 PyObject *exc = NULL;
1676 if (errmsg == NULL)
1677 errmsg = "Py_wchar2char() failed";
1678 raise_encode_exception(&exc,
1679 "filesystemencoding",
1680 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1681 error_pos, error_pos+1,
1682 errmsg);
1683 Py_XDECREF(exc);
1684 }
1685 else
1686 PyErr_NoMemory();
1687 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001688 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001689 }
1690 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001691
1692 bytes_obj = PyBytes_FromString(bytes);
1693 PyMem_Free(bytes);
1694 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001695 }
Victor Stinnerad158722010-10-27 00:25:46 +00001696#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001697}
1698
Alexander Belopolsky40018472011-02-26 01:02:56 +00001699PyObject *
1700PyUnicode_AsEncodedString(PyObject *unicode,
1701 const char *encoding,
1702 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703{
1704 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001705 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001706
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 if (!PyUnicode_Check(unicode)) {
1708 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 }
Fred Drakee4315f52000-05-09 19:53:39 +00001711
Tim Petersced69f82003-09-16 20:30:58 +00001712 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001713 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1714 PyUnicode_GET_SIZE(unicode),
1715 errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001716
1717 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001718 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001719 if ((strcmp(lower, "utf-8") == 0) ||
1720 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001721 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1722 PyUnicode_GET_SIZE(unicode),
1723 errors);
1724 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001725 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001726 (strcmp(lower, "iso-8859-1") == 0))
1727 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1728 PyUnicode_GET_SIZE(unicode),
1729 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001730#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001731 else if (strcmp(lower, "mbcs") == 0)
1732 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1733 PyUnicode_GET_SIZE(unicode),
1734 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001735#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001736 else if (strcmp(lower, "ascii") == 0)
1737 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1738 PyUnicode_GET_SIZE(unicode),
1739 errors);
1740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741
1742 /* Encode via the codec registry */
1743 v = PyCodec_Encode(unicode, encoding, errors);
1744 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001745 return NULL;
1746
1747 /* The normal path */
1748 if (PyBytes_Check(v))
1749 return v;
1750
1751 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001752 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001753 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001754 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001755
1756 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1757 "encoder %s returned bytearray instead of bytes",
1758 encoding);
1759 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001760 Py_DECREF(v);
1761 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001762 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001763
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001764 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1765 Py_DECREF(v);
1766 return b;
1767 }
1768
1769 PyErr_Format(PyExc_TypeError,
1770 "encoder did not return a bytes object (type=%.400s)",
1771 Py_TYPE(v)->tp_name);
1772 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001773 return NULL;
1774}
1775
Alexander Belopolsky40018472011-02-26 01:02:56 +00001776PyObject *
1777PyUnicode_AsEncodedUnicode(PyObject *unicode,
1778 const char *encoding,
1779 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001780{
1781 PyObject *v;
1782
1783 if (!PyUnicode_Check(unicode)) {
1784 PyErr_BadArgument();
1785 goto onError;
1786 }
1787
1788 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001789 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001790
1791 /* Encode via the codec registry */
1792 v = PyCodec_Encode(unicode, encoding, errors);
1793 if (v == NULL)
1794 goto onError;
1795 if (!PyUnicode_Check(v)) {
1796 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001797 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001798 Py_TYPE(v)->tp_name);
1799 Py_DECREF(v);
1800 goto onError;
1801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001803
Benjamin Peterson29060642009-01-31 22:14:21 +00001804 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 return NULL;
1806}
1807
Alexander Belopolsky40018472011-02-26 01:02:56 +00001808PyObject *
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001809_PyUnicode_AsDefaultEncodedString(PyObject *unicode)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001810{
1811 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001812 if (v)
1813 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001814 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001815 PyUnicode_GET_SIZE(unicode),
1816 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001817 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001818 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001819 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001820 return v;
1821}
1822
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001823PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001824PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001825 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001826 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1827}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001828
Christian Heimes5894ba72007-11-04 11:43:14 +00001829PyObject*
1830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1831{
Victor Stinnerad158722010-10-27 00:25:46 +00001832#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1833 return PyUnicode_DecodeMBCS(s, size, NULL);
1834#elif defined(__APPLE__)
1835 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1836#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001837 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1838 can be undefined. If it is case, decode using UTF-8. The following assumes
1839 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1840 bootstrapping process where the codecs aren't ready yet.
1841 */
1842 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001843 return PyUnicode_Decode(s, size,
1844 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001845 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001846 }
1847 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001848 /* locale encoding with surrogateescape */
1849 wchar_t *wchar;
1850 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001851 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001852
1853 if (s[size] != '\0' || size != strlen(s)) {
1854 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1855 return NULL;
1856 }
1857
Victor Stinner168e1172010-10-16 23:16:16 +00001858 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001859 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001860 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001861
Victor Stinner168e1172010-10-16 23:16:16 +00001862 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001863 PyMem_Free(wchar);
1864 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001865 }
Victor Stinnerad158722010-10-27 00:25:46 +00001866#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001867}
1868
Martin v. Löwis011e8422009-05-05 04:43:17 +00001869
1870int
1871PyUnicode_FSConverter(PyObject* arg, void* addr)
1872{
1873 PyObject *output = NULL;
1874 Py_ssize_t size;
1875 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001876 if (arg == NULL) {
1877 Py_DECREF(*(PyObject**)addr);
1878 return 1;
1879 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001880 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001881 output = arg;
1882 Py_INCREF(output);
1883 }
1884 else {
1885 arg = PyUnicode_FromObject(arg);
1886 if (!arg)
1887 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001888 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001889 Py_DECREF(arg);
1890 if (!output)
1891 return 0;
1892 if (!PyBytes_Check(output)) {
1893 Py_DECREF(output);
1894 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1895 return 0;
1896 }
1897 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001898 size = PyBytes_GET_SIZE(output);
1899 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001900 if (size != strlen(data)) {
1901 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1902 Py_DECREF(output);
1903 return 0;
1904 }
1905 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001906 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001907}
1908
1909
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001910int
1911PyUnicode_FSDecoder(PyObject* arg, void* addr)
1912{
1913 PyObject *output = NULL;
1914 Py_ssize_t size;
1915 void *data;
1916 if (arg == NULL) {
1917 Py_DECREF(*(PyObject**)addr);
1918 return 1;
1919 }
1920 if (PyUnicode_Check(arg)) {
1921 output = arg;
1922 Py_INCREF(output);
1923 }
1924 else {
1925 arg = PyBytes_FromObject(arg);
1926 if (!arg)
1927 return 0;
1928 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1929 PyBytes_GET_SIZE(arg));
1930 Py_DECREF(arg);
1931 if (!output)
1932 return 0;
1933 if (!PyUnicode_Check(output)) {
1934 Py_DECREF(output);
1935 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1936 return 0;
1937 }
1938 }
1939 size = PyUnicode_GET_SIZE(output);
1940 data = PyUnicode_AS_UNICODE(output);
1941 if (size != Py_UNICODE_strlen(data)) {
1942 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1943 Py_DECREF(output);
1944 return 0;
1945 }
1946 *(PyObject**)addr = output;
1947 return Py_CLEANUP_SUPPORTED;
1948}
1949
1950
Martin v. Löwis5b222132007-06-10 09:51:05 +00001951char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001952_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001953{
Christian Heimesf3863112007-11-22 07:46:41 +00001954 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001955 if (!PyUnicode_Check(unicode)) {
1956 PyErr_BadArgument();
1957 return NULL;
1958 }
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001959 bytes = _PyUnicode_AsDefaultEncodedString(unicode);
Christian Heimesf3863112007-11-22 07:46:41 +00001960 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001961 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001962 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001963 *psize = PyBytes_GET_SIZE(bytes);
1964 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001965}
1966
1967char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001968_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001969{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001970 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001971}
1972
Alexander Belopolsky40018472011-02-26 01:02:56 +00001973Py_UNICODE *
1974PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975{
1976 if (!PyUnicode_Check(unicode)) {
1977 PyErr_BadArgument();
1978 goto onError;
1979 }
1980 return PyUnicode_AS_UNICODE(unicode);
1981
Benjamin Peterson29060642009-01-31 22:14:21 +00001982 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 return NULL;
1984}
1985
Alexander Belopolsky40018472011-02-26 01:02:56 +00001986Py_ssize_t
1987PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988{
1989 if (!PyUnicode_Check(unicode)) {
1990 PyErr_BadArgument();
1991 goto onError;
1992 }
1993 return PyUnicode_GET_SIZE(unicode);
1994
Benjamin Peterson29060642009-01-31 22:14:21 +00001995 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996 return -1;
1997}
1998
/* Return the name of the default encoding, which is always "utf-8".
   The string has static storage; callers must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2004
Victor Stinner554f3f02010-06-16 23:33:54 +00002005/* create or adjust a UnicodeDecodeError */
2006static void
2007make_decode_exception(PyObject **exceptionObject,
2008 const char *encoding,
2009 const char *input, Py_ssize_t length,
2010 Py_ssize_t startpos, Py_ssize_t endpos,
2011 const char *reason)
2012{
2013 if (*exceptionObject == NULL) {
2014 *exceptionObject = PyUnicodeDecodeError_Create(
2015 encoding, input, length, startpos, endpos, reason);
2016 }
2017 else {
2018 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2019 goto onError;
2020 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2021 goto onError;
2022 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2023 goto onError;
2024 }
2025 return;
2026
2027onError:
2028 Py_DECREF(*exceptionObject);
2029 *exceptionObject = NULL;
2030}
2031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032/* error handling callback helper:
2033 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002034 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 and adjust various state variables.
2036 return 0 on success, -1 on error
2037*/
2038
Alexander Belopolsky40018472011-02-26 01:02:56 +00002039static int
2040unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2041 const char *encoding, const char *reason,
2042 const char **input, const char **inend, Py_ssize_t *startinpos,
2043 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2044 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002046 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047
2048 PyObject *restuple = NULL;
2049 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002050 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002051 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002052 Py_ssize_t requiredsize;
2053 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002055 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002056 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 int res = -1;
2058
2059 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002060 *errorHandler = PyCodec_LookupError(errors);
2061 if (*errorHandler == NULL)
2062 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 }
2064
Victor Stinner554f3f02010-06-16 23:33:54 +00002065 make_decode_exception(exceptionObject,
2066 encoding,
2067 *input, *inend - *input,
2068 *startinpos, *endinpos,
2069 reason);
2070 if (*exceptionObject == NULL)
2071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072
2073 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2074 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002075 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002077 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002078 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 }
2080 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002081 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002082
2083 /* Copy back the bytes variables, which might have been modified by the
2084 callback */
2085 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2086 if (!inputobj)
2087 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002088 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002089 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002090 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002091 *input = PyBytes_AS_STRING(inputobj);
2092 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002093 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002094 /* we can DECREF safely, as the exception has another reference,
2095 so the object won't go away. */
2096 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002098 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002099 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002100 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002101 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2102 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002103 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002104
2105 /* need more space? (at least enough for what we
2106 have+the replacement+the rest of the string (starting
2107 at the new input position), so we won't have to check space
2108 when there are no errors in the rest of the string) */
2109 repptr = PyUnicode_AS_UNICODE(repunicode);
2110 repsize = PyUnicode_GET_SIZE(repunicode);
2111 requiredsize = *outpos + repsize + insize-newpos;
2112 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002113 if (requiredsize<2*outsize)
2114 requiredsize = 2*outsize;
2115 if (_PyUnicode_Resize(output, requiredsize) < 0)
2116 goto onError;
2117 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002118 }
2119 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002120 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 Py_UNICODE_COPY(*outptr, repptr, repsize);
2122 *outptr += repsize;
2123 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002124
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 /* we made it! */
2126 res = 0;
2127
Benjamin Peterson29060642009-01-31 22:14:21 +00002128 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002129 Py_XDECREF(restuple);
2130 return res;
2131}
2132
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that IS_BASE64 and
   FROM_BASE64 evaluate their argument more than once, so it must not
   have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is never indexed
 * with a value outside 1..127.  directO and directWS are parenthesized
 * so that compound expressions (e.g. "a || b") can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002213
Alexander Belopolsky40018472011-02-26 01:02:56 +00002214PyObject *
2215PyUnicode_DecodeUTF7(const char *s,
2216 Py_ssize_t size,
2217 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002218{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002219 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2220}
2221
Antoine Pitrou244651a2009-05-04 18:56:13 +00002222/* The decoder. The only state we preserve is our read position,
2223 * i.e. how many characters we have consumed. So if we end in the
2224 * middle of a shift sequence we have to back off the read position
2225 * and the output to the beginning of the sequence, otherwise we lose
2226 * all the shift state (seen bits, number of bits seen, high
2227 * surrogate). */
2228
Alexander Belopolsky40018472011-02-26 01:02:56 +00002229PyObject *
2230PyUnicode_DecodeUTF7Stateful(const char *s,
2231 Py_ssize_t size,
2232 const char *errors,
2233 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002234{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002235 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002236 Py_ssize_t startinpos;
2237 Py_ssize_t endinpos;
2238 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002239 const char *e;
2240 PyUnicodeObject *unicode;
2241 Py_UNICODE *p;
2242 const char *errmsg = "";
2243 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002244 Py_UNICODE *shiftOutStart;
2245 unsigned int base64bits = 0;
2246 unsigned long base64buffer = 0;
2247 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002248 PyObject *errorHandler = NULL;
2249 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002250
2251 unicode = _PyUnicode_New(size);
2252 if (!unicode)
2253 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002254 if (size == 0) {
2255 if (consumed)
2256 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002257 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002258 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002259
2260 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002261 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002262 e = s + size;
2263
2264 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002265 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002266 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002267 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268
Antoine Pitrou244651a2009-05-04 18:56:13 +00002269 if (inShift) { /* in a base-64 section */
2270 if (IS_BASE64(ch)) { /* consume a base-64 character */
2271 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2272 base64bits += 6;
2273 s++;
2274 if (base64bits >= 16) {
2275 /* we have enough bits for a UTF-16 value */
2276 Py_UNICODE outCh = (Py_UNICODE)
2277 (base64buffer >> (base64bits-16));
2278 base64bits -= 16;
2279 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2280 if (surrogate) {
2281 /* expecting a second surrogate */
2282 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2283#ifdef Py_UNICODE_WIDE
2284 *p++ = (((surrogate & 0x3FF)<<10)
2285 | (outCh & 0x3FF)) + 0x10000;
2286#else
2287 *p++ = surrogate;
2288 *p++ = outCh;
2289#endif
2290 surrogate = 0;
2291 }
2292 else {
2293 surrogate = 0;
2294 errmsg = "second surrogate missing";
2295 goto utf7Error;
2296 }
2297 }
2298 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2299 /* first surrogate */
2300 surrogate = outCh;
2301 }
2302 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2303 errmsg = "unexpected second surrogate";
2304 goto utf7Error;
2305 }
2306 else {
2307 *p++ = outCh;
2308 }
2309 }
2310 }
2311 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002312 inShift = 0;
2313 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002314 if (surrogate) {
2315 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002316 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002317 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002318 if (base64bits > 0) { /* left-over bits */
2319 if (base64bits >= 6) {
2320 /* We've seen at least one base-64 character */
2321 errmsg = "partial character in shift sequence";
2322 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002323 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002324 else {
2325 /* Some bits remain; they should be zero */
2326 if (base64buffer != 0) {
2327 errmsg = "non-zero padding bits in shift sequence";
2328 goto utf7Error;
2329 }
2330 }
2331 }
2332 if (ch != '-') {
2333 /* '-' is absorbed; other terminating
2334 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002335 *p++ = ch;
2336 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002337 }
2338 }
2339 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002340 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002341 s++; /* consume '+' */
2342 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002343 s++;
2344 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002345 }
2346 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002347 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 shiftOutStart = p;
2349 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002350 }
2351 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002352 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002353 *p++ = ch;
2354 s++;
2355 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002356 else {
2357 startinpos = s-starts;
2358 s++;
2359 errmsg = "unexpected special character";
2360 goto utf7Error;
2361 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002363utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002364 outpos = p-PyUnicode_AS_UNICODE(unicode);
2365 endinpos = s-starts;
2366 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002367 errors, &errorHandler,
2368 "utf7", errmsg,
2369 &starts, &e, &startinpos, &endinpos, &exc, &s,
2370 &unicode, &outpos, &p))
2371 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002372 }
2373
Antoine Pitrou244651a2009-05-04 18:56:13 +00002374 /* end of string */
2375
2376 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2377 /* if we're in an inconsistent state, that's an error */
2378 if (surrogate ||
2379 (base64bits >= 6) ||
2380 (base64bits > 0 && base64buffer != 0)) {
2381 outpos = p-PyUnicode_AS_UNICODE(unicode);
2382 endinpos = size;
2383 if (unicode_decode_call_errorhandler(
2384 errors, &errorHandler,
2385 "utf7", "unterminated shift sequence",
2386 &starts, &e, &startinpos, &endinpos, &exc, &s,
2387 &unicode, &outpos, &p))
2388 goto onError;
2389 if (s < e)
2390 goto restart;
2391 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002392 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002393
2394 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002395 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002396 if (inShift) {
2397 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002398 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002399 }
2400 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002401 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002402 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002403 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002404
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002405 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002406 goto onError;
2407
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002408 Py_XDECREF(errorHandler);
2409 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002410 return (PyObject *)unicode;
2411
Benjamin Peterson29060642009-01-31 22:14:21 +00002412 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002413 Py_XDECREF(errorHandler);
2414 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415 Py_DECREF(unicode);
2416 return NULL;
2417}
2418
2419
Alexander Belopolsky40018472011-02-26 01:02:56 +00002420PyObject *
2421PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2422 Py_ssize_t size,
2423 int base64SetO,
2424 int base64WhiteSpace,
2425 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002426{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002427 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002428 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002429 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002430 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002431 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002432 unsigned int base64bits = 0;
2433 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002434 char * out;
2435 char * start;
2436
2437 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002438 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002439
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002440 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002441 return PyErr_NoMemory();
2442
Antoine Pitrou244651a2009-05-04 18:56:13 +00002443 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002444 if (v == NULL)
2445 return NULL;
2446
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002447 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002448 for (;i < size; ++i) {
2449 Py_UNICODE ch = s[i];
2450
Antoine Pitrou244651a2009-05-04 18:56:13 +00002451 if (inShift) {
2452 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2453 /* shifting out */
2454 if (base64bits) { /* output remaining bits */
2455 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2456 base64buffer = 0;
2457 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002458 }
2459 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002460 /* Characters not in the BASE64 set implicitly unshift the sequence
2461 so no '-' is required, except if the character is itself a '-' */
2462 if (IS_BASE64(ch) || ch == '-') {
2463 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002464 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002465 *out++ = (char) ch;
2466 }
2467 else {
2468 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002469 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002471 else { /* not in a shift sequence */
2472 if (ch == '+') {
2473 *out++ = '+';
2474 *out++ = '-';
2475 }
2476 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2477 *out++ = (char) ch;
2478 }
2479 else {
2480 *out++ = '+';
2481 inShift = 1;
2482 goto encode_char;
2483 }
2484 }
2485 continue;
2486encode_char:
2487#ifdef Py_UNICODE_WIDE
2488 if (ch >= 0x10000) {
2489 /* code first surrogate */
2490 base64bits += 16;
2491 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2492 while (base64bits >= 6) {
2493 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2494 base64bits -= 6;
2495 }
2496 /* prepare second surrogate */
2497 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2498 }
2499#endif
2500 base64bits += 16;
2501 base64buffer = (base64buffer << 16) | ch;
2502 while (base64bits >= 6) {
2503 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2504 base64bits -= 6;
2505 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002507 if (base64bits)
2508 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2509 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002510 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002511 if (_PyBytes_Resize(&v, out - start) < 0)
2512 return NULL;
2513 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002514}
2515
/* The UTF-7 helper macros are not needed past this point. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002521
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522/* --- UTF-8 Codec -------------------------------------------------------- */
2523
/* Map a UTF-8 lead (prefix) byte to the length in bytes of the sequence
   it starts.  Zero means the byte is not a legal prefix.  See RFC 3629
   for details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2545
Alexander Belopolsky40018472011-02-26 01:02:56 +00002546PyObject *
2547PyUnicode_DecodeUTF8(const char *s,
2548 Py_ssize_t size,
2549 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550{
Walter Dörwald69652032004-09-07 20:24:22 +00002551 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2552}
2553
Antoine Pitrouab868312009-01-10 15:40:25 +00002554/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2555#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2556
2557/* Mask to quickly check whether a C 'long' contains a
2558 non-ASCII, UTF8-encoded char. */
2559#if (SIZEOF_LONG == 8)
2560# define ASCII_CHAR_MASK 0x8080808080808080L
2561#elif (SIZEOF_LONG == 4)
2562# define ASCII_CHAR_MASK 0x80808080L
2563#else
2564# error C 'long' size should be either 4 or 8!
2565#endif
2566
Alexander Belopolsky40018472011-02-26 01:02:56 +00002567PyObject *
2568PyUnicode_DecodeUTF8Stateful(const char *s,
2569 Py_ssize_t size,
2570 const char *errors,
2571 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002572{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002575 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002576 Py_ssize_t startinpos;
2577 Py_ssize_t endinpos;
2578 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002579 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 PyUnicodeObject *unicode;
2581 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002582 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 PyObject *errorHandler = NULL;
2584 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585
2586 /* Note: size will always be longer than the resulting Unicode
2587 character count */
2588 unicode = _PyUnicode_New(size);
2589 if (!unicode)
2590 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002591 if (size == 0) {
2592 if (consumed)
2593 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596
2597 /* Unpack UTF-8 encoded data */
2598 p = unicode->str;
2599 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002600 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601
2602 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002603 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604
2605 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002606 /* Fast path for runs of ASCII characters. Given that common UTF-8
2607 input will consist of an overwhelming majority of ASCII
2608 characters, we try to optimize for this case by checking
2609 as many characters as a C 'long' can contain.
2610 First, check if we can do an aligned read, as most CPUs have
2611 a penalty for unaligned reads.
2612 */
2613 if (!((size_t) s & LONG_PTR_MASK)) {
2614 /* Help register allocation */
2615 register const char *_s = s;
2616 register Py_UNICODE *_p = p;
2617 while (_s < aligned_end) {
2618 /* Read a whole long at a time (either 4 or 8 bytes),
2619 and do a fast unrolled copy if it only contains ASCII
2620 characters. */
2621 unsigned long data = *(unsigned long *) _s;
2622 if (data & ASCII_CHAR_MASK)
2623 break;
2624 _p[0] = (unsigned char) _s[0];
2625 _p[1] = (unsigned char) _s[1];
2626 _p[2] = (unsigned char) _s[2];
2627 _p[3] = (unsigned char) _s[3];
2628#if (SIZEOF_LONG == 8)
2629 _p[4] = (unsigned char) _s[4];
2630 _p[5] = (unsigned char) _s[5];
2631 _p[6] = (unsigned char) _s[6];
2632 _p[7] = (unsigned char) _s[7];
2633#endif
2634 _s += SIZEOF_LONG;
2635 _p += SIZEOF_LONG;
2636 }
2637 s = _s;
2638 p = _p;
2639 if (s == e)
2640 break;
2641 ch = (unsigned char)*s;
2642 }
2643 }
2644
2645 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002646 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647 s++;
2648 continue;
2649 }
2650
2651 n = utf8_code_length[ch];
2652
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002653 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002654 if (consumed)
2655 break;
2656 else {
2657 errmsg = "unexpected end of data";
2658 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002659 endinpos = startinpos+1;
2660 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2661 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002662 goto utf8Error;
2663 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665
2666 switch (n) {
2667
2668 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002669 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002670 startinpos = s-starts;
2671 endinpos = startinpos+1;
2672 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673
2674 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002675 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 startinpos = s-starts;
2677 endinpos = startinpos+1;
2678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679
2680 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002681 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002682 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002684 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 goto utf8Error;
2686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002688 assert ((ch > 0x007F) && (ch <= 0x07FF));
2689 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 break;
2691
2692 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002693 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2694 will result in surrogates in range d800-dfff. Surrogates are
2695 not valid UTF-8 so they are rejected.
2696 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2697 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002698 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002699 (s[2] & 0xc0) != 0x80 ||
2700 ((unsigned char)s[0] == 0xE0 &&
2701 (unsigned char)s[1] < 0xA0) ||
2702 ((unsigned char)s[0] == 0xED &&
2703 (unsigned char)s[1] > 0x9F)) {
2704 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002705 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002706 endinpos = startinpos + 1;
2707
2708 /* if s[1] first two bits are 1 and 0, then the invalid
2709 continuation byte is s[2], so increment endinpos by 1,
2710 if not, s[1] is invalid and endinpos doesn't need to
2711 be incremented. */
2712 if ((s[1] & 0xC0) == 0x80)
2713 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002714 goto utf8Error;
2715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002717 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2718 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002719 break;
2720
2721 case 4:
2722 if ((s[1] & 0xc0) != 0x80 ||
2723 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002724 (s[3] & 0xc0) != 0x80 ||
2725 ((unsigned char)s[0] == 0xF0 &&
2726 (unsigned char)s[1] < 0x90) ||
2727 ((unsigned char)s[0] == 0xF4 &&
2728 (unsigned char)s[1] > 0x8F)) {
2729 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002730 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002731 endinpos = startinpos + 1;
2732 if ((s[1] & 0xC0) == 0x80) {
2733 endinpos++;
2734 if ((s[2] & 0xC0) == 0x80)
2735 endinpos++;
2736 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 goto utf8Error;
2738 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002739 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002740 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2741 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2742
Fredrik Lundh8f455852001-06-27 18:59:43 +00002743#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002744 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002745#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002746 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002747
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002748 /* translate from 10000..10FFFF to 0..FFFF */
2749 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002750
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002751 /* high surrogate = top 10 bits added to D800 */
2752 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002753
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002754 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002755 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002756#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
2759 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002761
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 utf8Error:
2763 outpos = p-PyUnicode_AS_UNICODE(unicode);
2764 if (unicode_decode_call_errorhandler(
2765 errors, &errorHandler,
2766 "utf8", errmsg,
2767 &starts, &e, &startinpos, &endinpos, &exc, &s,
2768 &unicode, &outpos, &p))
2769 goto onError;
2770 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 }
Walter Dörwald69652032004-09-07 20:24:22 +00002772 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002773 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774
2775 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002776 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 goto onError;
2778
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 Py_XDECREF(errorHandler);
2780 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 return (PyObject *)unicode;
2782
Benjamin Peterson29060642009-01-31 22:14:21 +00002783 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002784 Py_XDECREF(errorHandler);
2785 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 Py_DECREF(unicode);
2787 return NULL;
2788}
2789
Antoine Pitrouab868312009-01-10 15:40:25 +00002790#undef ASCII_CHAR_MASK
2791
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes starting at `s` into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t buffer and returns it.  Every byte that cannot
   start or complete a valid UTF-8 sequence is stored as 0xDC00 + byte
   and decoding resumes at the following byte.

   Returns NULL if the buffer size would overflow (PyErr_NoMemory() is
   called in that case) or if the allocation itself fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* truncated sequence: escape the lead byte */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the output buffer holds wchar_t, not Py_UNICODE */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the raw lead byte here: every jump to this
           label happens before ch is overwritten with a decoded value */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002906
Tim Peters602f7402002-04-27 18:03:26 +00002907/* Allocation strategy: if the string is short, convert into a stack buffer
2908 and allocate exactly as much space needed at the end. Else allocate the
2909 maximum possible needed (4 result bytes per Unicode character), and return
2910 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002911*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002912PyObject *
2913PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002914 Py_ssize_t size,
2915 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916{
Tim Peters602f7402002-04-27 18:03:26 +00002917#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002918
Guido van Rossum98297ee2007-11-06 21:34:58 +00002919 Py_ssize_t i; /* index into s of next input byte */
2920 PyObject *result; /* result string object */
2921 char *p; /* next free byte in output buffer */
2922 Py_ssize_t nallocated; /* number of result bytes allocated */
2923 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002924 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002925 PyObject *errorHandler = NULL;
2926 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002927
Tim Peters602f7402002-04-27 18:03:26 +00002928 assert(s != NULL);
2929 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930
Tim Peters602f7402002-04-27 18:03:26 +00002931 if (size <= MAX_SHORT_UNICHARS) {
2932 /* Write into the stack buffer; nallocated can't overflow.
2933 * At the end, we'll allocate exactly as much heap space as it
2934 * turns out we need.
2935 */
2936 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002937 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002938 p = stackbuf;
2939 }
2940 else {
2941 /* Overallocate on the heap, and give the excess back at the end. */
2942 nallocated = size * 4;
2943 if (nallocated / 4 != size) /* overflow! */
2944 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002945 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002946 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002947 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002948 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002949 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002950
Tim Peters602f7402002-04-27 18:03:26 +00002951 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002952 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002953
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002954 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002955 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002957
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002959 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002960 *p++ = (char)(0xc0 | (ch >> 6));
2961 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002962 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002963#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002964 /* Special case: check for high and low surrogate */
2965 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2966 Py_UCS4 ch2 = s[i];
2967 /* Combine the two surrogates to form a UCS4 value */
2968 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2969 i++;
2970
2971 /* Encode UCS4 Unicode ordinals */
2972 *p++ = (char)(0xf0 | (ch >> 18));
2973 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002974 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2975 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002976 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002977#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002978 Py_ssize_t newpos;
2979 PyObject *rep;
2980 Py_ssize_t repsize, k;
2981 rep = unicode_encode_call_errorhandler
2982 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2983 s, size, &exc, i-1, i, &newpos);
2984 if (!rep)
2985 goto error;
2986
2987 if (PyBytes_Check(rep))
2988 repsize = PyBytes_GET_SIZE(rep);
2989 else
2990 repsize = PyUnicode_GET_SIZE(rep);
2991
2992 if (repsize > 4) {
2993 Py_ssize_t offset;
2994
2995 if (result == NULL)
2996 offset = p - stackbuf;
2997 else
2998 offset = p - PyBytes_AS_STRING(result);
2999
3000 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3001 /* integer overflow */
3002 PyErr_NoMemory();
3003 goto error;
3004 }
3005 nallocated += repsize - 4;
3006 if (result != NULL) {
3007 if (_PyBytes_Resize(&result, nallocated) < 0)
3008 goto error;
3009 } else {
3010 result = PyBytes_FromStringAndSize(NULL, nallocated);
3011 if (result == NULL)
3012 goto error;
3013 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3014 }
3015 p = PyBytes_AS_STRING(result) + offset;
3016 }
3017
3018 if (PyBytes_Check(rep)) {
3019 char *prep = PyBytes_AS_STRING(rep);
3020 for(k = repsize; k > 0; k--)
3021 *p++ = *prep++;
3022 } else /* rep is unicode */ {
3023 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3024 Py_UNICODE c;
3025
3026 for(k=0; k<repsize; k++) {
3027 c = prep[k];
3028 if (0x80 <= c) {
3029 raise_encode_exception(&exc, "utf-8", s, size,
3030 i-1, i, "surrogates not allowed");
3031 goto error;
3032 }
3033 *p++ = (char)prep[k];
3034 }
3035 }
3036 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003037#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003038 }
Victor Stinner445a6232010-04-22 20:01:57 +00003039#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003040 } else if (ch < 0x10000) {
3041 *p++ = (char)(0xe0 | (ch >> 12));
3042 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3043 *p++ = (char)(0x80 | (ch & 0x3f));
3044 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003045 /* Encode UCS4 Unicode ordinals */
3046 *p++ = (char)(0xf0 | (ch >> 18));
3047 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3048 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3049 *p++ = (char)(0x80 | (ch & 0x3f));
3050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003052
Guido van Rossum98297ee2007-11-06 21:34:58 +00003053 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003054 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003055 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003056 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003057 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003058 }
3059 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003060 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003061 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003062 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003063 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003064 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003065 Py_XDECREF(errorHandler);
3066 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003067 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003068 error:
3069 Py_XDECREF(errorHandler);
3070 Py_XDECREF(exc);
3071 Py_XDECREF(result);
3072 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003073
Tim Peters602f7402002-04-27 18:03:26 +00003074#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075}
3076
Alexander Belopolsky40018472011-02-26 01:02:56 +00003077PyObject *
3078PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 if (!PyUnicode_Check(unicode)) {
3081 PyErr_BadArgument();
3082 return NULL;
3083 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003084 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003085 PyUnicode_GET_SIZE(unicode),
3086 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087}
3088
Walter Dörwald41980ca2007-08-16 21:55:45 +00003089/* --- UTF-32 Codec ------------------------------------------------------- */
3090
3091PyObject *
3092PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003093 Py_ssize_t size,
3094 const char *errors,
3095 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003096{
3097 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3098}
3099
3100PyObject *
3101PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003102 Py_ssize_t size,
3103 const char *errors,
3104 int *byteorder,
3105 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003106{
3107 const char *starts = s;
3108 Py_ssize_t startinpos;
3109 Py_ssize_t endinpos;
3110 Py_ssize_t outpos;
3111 PyUnicodeObject *unicode;
3112 Py_UNICODE *p;
3113#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003114 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003115 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003116#else
3117 const int pairs = 0;
3118#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003119 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003120 int bo = 0; /* assume native ordering by default */
3121 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003122 /* Offsets from q for retrieving bytes in the right order. */
3123#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3124 int iorder[] = {0, 1, 2, 3};
3125#else
3126 int iorder[] = {3, 2, 1, 0};
3127#endif
3128 PyObject *errorHandler = NULL;
3129 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003130
Walter Dörwald41980ca2007-08-16 21:55:45 +00003131 q = (unsigned char *)s;
3132 e = q + size;
3133
3134 if (byteorder)
3135 bo = *byteorder;
3136
3137 /* Check for BOM marks (U+FEFF) in the input and adjust current
3138 byte order setting accordingly. In native mode, the leading BOM
3139 mark is skipped, in all other modes, it is copied to the output
3140 stream as-is (giving a ZWNBSP character). */
3141 if (bo == 0) {
3142 if (size >= 4) {
3143 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003145#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 if (bom == 0x0000FEFF) {
3147 q += 4;
3148 bo = -1;
3149 }
3150 else if (bom == 0xFFFE0000) {
3151 q += 4;
3152 bo = 1;
3153 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003154#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 if (bom == 0x0000FEFF) {
3156 q += 4;
3157 bo = 1;
3158 }
3159 else if (bom == 0xFFFE0000) {
3160 q += 4;
3161 bo = -1;
3162 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003163#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003165 }
3166
3167 if (bo == -1) {
3168 /* force LE */
3169 iorder[0] = 0;
3170 iorder[1] = 1;
3171 iorder[2] = 2;
3172 iorder[3] = 3;
3173 }
3174 else if (bo == 1) {
3175 /* force BE */
3176 iorder[0] = 3;
3177 iorder[1] = 2;
3178 iorder[2] = 1;
3179 iorder[3] = 0;
3180 }
3181
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003182 /* On narrow builds we split characters outside the BMP into two
3183 codepoints => count how much extra space we need. */
3184#ifndef Py_UNICODE_WIDE
3185 for (qq = q; qq < e; qq += 4)
3186 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3187 pairs++;
3188#endif
3189
3190 /* This might be one to much, because of a BOM */
3191 unicode = _PyUnicode_New((size+3)/4+pairs);
3192 if (!unicode)
3193 return NULL;
3194 if (size == 0)
3195 return (PyObject *)unicode;
3196
3197 /* Unpack UTF-32 encoded data */
3198 p = unicode->str;
3199
Walter Dörwald41980ca2007-08-16 21:55:45 +00003200 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003201 Py_UCS4 ch;
3202 /* remaining bytes at the end? (size should be divisible by 4) */
3203 if (e-q<4) {
3204 if (consumed)
3205 break;
3206 errmsg = "truncated data";
3207 startinpos = ((const char *)q)-starts;
3208 endinpos = ((const char *)e)-starts;
3209 goto utf32Error;
3210 /* The remaining input chars are ignored if the callback
3211 chooses to skip the input */
3212 }
3213 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3214 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003215
Benjamin Peterson29060642009-01-31 22:14:21 +00003216 if (ch >= 0x110000)
3217 {
3218 errmsg = "codepoint not in range(0x110000)";
3219 startinpos = ((const char *)q)-starts;
3220 endinpos = startinpos+4;
3221 goto utf32Error;
3222 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003223#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 if (ch >= 0x10000)
3225 {
3226 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3227 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3228 }
3229 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003230#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 *p++ = ch;
3232 q += 4;
3233 continue;
3234 utf32Error:
3235 outpos = p-PyUnicode_AS_UNICODE(unicode);
3236 if (unicode_decode_call_errorhandler(
3237 errors, &errorHandler,
3238 "utf32", errmsg,
3239 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3240 &unicode, &outpos, &p))
3241 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003242 }
3243
3244 if (byteorder)
3245 *byteorder = bo;
3246
3247 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003249
3250 /* Adjust length */
3251 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3252 goto onError;
3253
3254 Py_XDECREF(errorHandler);
3255 Py_XDECREF(exc);
3256 return (PyObject *)unicode;
3257
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003259 Py_DECREF(unicode);
3260 Py_XDECREF(errorHandler);
3261 Py_XDECREF(exc);
3262 return NULL;
3263}
3264
3265PyObject *
3266PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 Py_ssize_t size,
3268 const char *errors,
3269 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003270{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003271 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003272 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003273 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003274#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003275 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003276#else
3277 const int pairs = 0;
3278#endif
3279 /* Offsets from p for storing byte pairs in the right order. */
3280#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3281 int iorder[] = {0, 1, 2, 3};
3282#else
3283 int iorder[] = {3, 2, 1, 0};
3284#endif
3285
Benjamin Peterson29060642009-01-31 22:14:21 +00003286#define STORECHAR(CH) \
3287 do { \
3288 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3289 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3290 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3291 p[iorder[0]] = (CH) & 0xff; \
3292 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003293 } while(0)
3294
3295 /* In narrow builds we can output surrogate pairs as one codepoint,
3296 so we need less space. */
3297#ifndef Py_UNICODE_WIDE
3298 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003299 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3300 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3301 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003302#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003303 nsize = (size - pairs + (byteorder == 0));
3304 bytesize = nsize * 4;
3305 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003307 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003308 if (v == NULL)
3309 return NULL;
3310
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003311 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003312 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003314 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003315 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003316
3317 if (byteorder == -1) {
3318 /* force LE */
3319 iorder[0] = 0;
3320 iorder[1] = 1;
3321 iorder[2] = 2;
3322 iorder[3] = 3;
3323 }
3324 else if (byteorder == 1) {
3325 /* force BE */
3326 iorder[0] = 3;
3327 iorder[1] = 2;
3328 iorder[2] = 1;
3329 iorder[3] = 0;
3330 }
3331
3332 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003334#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003335 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3336 Py_UCS4 ch2 = *s;
3337 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3338 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3339 s++;
3340 size--;
3341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003342 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003343#endif
3344 STORECHAR(ch);
3345 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003346
3347 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003348 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003349#undef STORECHAR
3350}
3351
Alexander Belopolsky40018472011-02-26 01:02:56 +00003352PyObject *
3353PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003354{
3355 if (!PyUnicode_Check(unicode)) {
3356 PyErr_BadArgument();
3357 return NULL;
3358 }
3359 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003360 PyUnicode_GET_SIZE(unicode),
3361 NULL,
3362 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003363}
3364
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365/* --- UTF-16 Codec ------------------------------------------------------- */
3366
Tim Peters772747b2001-08-09 22:21:55 +00003367PyObject *
3368PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 Py_ssize_t size,
3370 const char *errors,
3371 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372{
Walter Dörwald69652032004-09-07 20:24:22 +00003373 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3374}
3375
Antoine Pitrouab868312009-01-10 15:40:25 +00003376/* Two masks for fast checking of whether a C 'long' may contain
3377 UTF16-encoded surrogate characters. This is an efficient heuristic,
3378 assuming that non-surrogate characters with a code point >= 0x8000 are
3379 rare in most input.
3380 FAST_CHAR_MASK is used when the input is in native byte ordering,
3381 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003382*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003383#if (SIZEOF_LONG == 8)
3384# define FAST_CHAR_MASK 0x8000800080008000L
3385# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3386#elif (SIZEOF_LONG == 4)
3387# define FAST_CHAR_MASK 0x80008000L
3388# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3389#else
3390# error C 'long' size should be either 4 or 8!
3391#endif
3392
Walter Dörwald69652032004-09-07 20:24:22 +00003393PyObject *
3394PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 Py_ssize_t size,
3396 const char *errors,
3397 int *byteorder,
3398 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003399{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003401 Py_ssize_t startinpos;
3402 Py_ssize_t endinpos;
3403 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 PyUnicodeObject *unicode;
3405 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003406 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003407 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003408 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003409 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003410 /* Offsets from q for retrieving byte pairs in the right order. */
3411#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3412 int ihi = 1, ilo = 0;
3413#else
3414 int ihi = 0, ilo = 1;
3415#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 PyObject *errorHandler = NULL;
3417 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418
3419 /* Note: size will always be longer than the resulting Unicode
3420 character count */
3421 unicode = _PyUnicode_New(size);
3422 if (!unicode)
3423 return NULL;
3424 if (size == 0)
3425 return (PyObject *)unicode;
3426
3427 /* Unpack UTF-16 encoded data */
3428 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003429 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003430 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431
3432 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003433 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003435 /* Check for BOM marks (U+FEFF) in the input and adjust current
3436 byte order setting accordingly. In native mode, the leading BOM
3437 mark is skipped, in all other modes, it is copied to the output
3438 stream as-is (giving a ZWNBSP character). */
3439 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003440 if (size >= 2) {
3441 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003442#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 if (bom == 0xFEFF) {
3444 q += 2;
3445 bo = -1;
3446 }
3447 else if (bom == 0xFFFE) {
3448 q += 2;
3449 bo = 1;
3450 }
Tim Petersced69f82003-09-16 20:30:58 +00003451#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 if (bom == 0xFEFF) {
3453 q += 2;
3454 bo = 1;
3455 }
3456 else if (bom == 0xFFFE) {
3457 q += 2;
3458 bo = -1;
3459 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003460#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003461 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463
Tim Peters772747b2001-08-09 22:21:55 +00003464 if (bo == -1) {
3465 /* force LE */
3466 ihi = 1;
3467 ilo = 0;
3468 }
3469 else if (bo == 1) {
3470 /* force BE */
3471 ihi = 0;
3472 ilo = 1;
3473 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003474#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3475 native_ordering = ilo < ihi;
3476#else
3477 native_ordering = ilo > ihi;
3478#endif
Tim Peters772747b2001-08-09 22:21:55 +00003479
Antoine Pitrouab868312009-01-10 15:40:25 +00003480 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003481 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003483 /* First check for possible aligned read of a C 'long'. Unaligned
3484 reads are more expensive, better to defer to another iteration. */
3485 if (!((size_t) q & LONG_PTR_MASK)) {
3486 /* Fast path for runs of non-surrogate chars. */
3487 register const unsigned char *_q = q;
3488 Py_UNICODE *_p = p;
3489 if (native_ordering) {
3490 /* Native ordering is simple: as long as the input cannot
3491 possibly contain a surrogate char, do an unrolled copy
3492 of several 16-bit code points to the target object.
3493 The non-surrogate check is done on several input bytes
3494 at a time (as many as a C 'long' can contain). */
3495 while (_q < aligned_end) {
3496 unsigned long data = * (unsigned long *) _q;
3497 if (data & FAST_CHAR_MASK)
3498 break;
3499 _p[0] = ((unsigned short *) _q)[0];
3500 _p[1] = ((unsigned short *) _q)[1];
3501#if (SIZEOF_LONG == 8)
3502 _p[2] = ((unsigned short *) _q)[2];
3503 _p[3] = ((unsigned short *) _q)[3];
3504#endif
3505 _q += SIZEOF_LONG;
3506 _p += SIZEOF_LONG / 2;
3507 }
3508 }
3509 else {
3510 /* Byteswapped ordering is similar, but we must decompose
3511 the copy bytewise, and take care of zero'ing out the
3512 upper bytes if the target object is in 32-bit units
3513 (that is, in UCS-4 builds). */
3514 while (_q < aligned_end) {
3515 unsigned long data = * (unsigned long *) _q;
3516 if (data & SWAPPED_FAST_CHAR_MASK)
3517 break;
3518 /* Zero upper bytes in UCS-4 builds */
3519#if (Py_UNICODE_SIZE > 2)
3520 _p[0] = 0;
3521 _p[1] = 0;
3522#if (SIZEOF_LONG == 8)
3523 _p[2] = 0;
3524 _p[3] = 0;
3525#endif
3526#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003527 /* Issue #4916; UCS-4 builds on big endian machines must
3528 fill the two last bytes of each 4-byte unit. */
3529#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3530# define OFF 2
3531#else
3532# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003533#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003534 ((unsigned char *) _p)[OFF + 1] = _q[0];
3535 ((unsigned char *) _p)[OFF + 0] = _q[1];
3536 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3537 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3538#if (SIZEOF_LONG == 8)
3539 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3540 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3541 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3542 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3543#endif
3544#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003545 _q += SIZEOF_LONG;
3546 _p += SIZEOF_LONG / 2;
3547 }
3548 }
3549 p = _p;
3550 q = _q;
3551 if (q >= e)
3552 break;
3553 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555
Benjamin Peterson14339b62009-01-31 16:36:08 +00003556 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003557
3558 if (ch < 0xD800 || ch > 0xDFFF) {
3559 *p++ = ch;
3560 continue;
3561 }
3562
3563 /* UTF-16 code pair: */
3564 if (q > e) {
3565 errmsg = "unexpected end of data";
3566 startinpos = (((const char *)q) - 2) - starts;
3567 endinpos = ((const char *)e) + 1 - starts;
3568 goto utf16Error;
3569 }
3570 if (0xD800 <= ch && ch <= 0xDBFF) {
3571 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3572 q += 2;
3573 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003574#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003575 *p++ = ch;
3576 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003577#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003578 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003579#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003580 continue;
3581 }
3582 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003583 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 startinpos = (((const char *)q)-4)-starts;
3585 endinpos = startinpos+2;
3586 goto utf16Error;
3587 }
3588
Benjamin Peterson14339b62009-01-31 16:36:08 +00003589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 errmsg = "illegal encoding";
3591 startinpos = (((const char *)q)-2)-starts;
3592 endinpos = startinpos+2;
3593 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003594
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 utf16Error:
3596 outpos = p - PyUnicode_AS_UNICODE(unicode);
3597 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003598 errors,
3599 &errorHandler,
3600 "utf16", errmsg,
3601 &starts,
3602 (const char **)&e,
3603 &startinpos,
3604 &endinpos,
3605 &exc,
3606 (const char **)&q,
3607 &unicode,
3608 &outpos,
3609 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003612 /* remaining byte at the end? (size should be even) */
3613 if (e == q) {
3614 if (!consumed) {
3615 errmsg = "truncated data";
3616 startinpos = ((const char *)q) - starts;
3617 endinpos = ((const char *)e) + 1 - starts;
3618 outpos = p - PyUnicode_AS_UNICODE(unicode);
3619 if (unicode_decode_call_errorhandler(
3620 errors,
3621 &errorHandler,
3622 "utf16", errmsg,
3623 &starts,
3624 (const char **)&e,
3625 &startinpos,
3626 &endinpos,
3627 &exc,
3628 (const char **)&q,
3629 &unicode,
3630 &outpos,
3631 &p))
3632 goto onError;
3633 /* The remaining input chars are ignored if the callback
3634 chooses to skip the input */
3635 }
3636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637
3638 if (byteorder)
3639 *byteorder = bo;
3640
Walter Dörwald69652032004-09-07 20:24:22 +00003641 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003642 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003643
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003645 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 goto onError;
3647
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 Py_XDECREF(errorHandler);
3649 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 return (PyObject *)unicode;
3651
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 Py_XDECREF(errorHandler);
3655 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 return NULL;
3657}
3658
Antoine Pitrouab868312009-01-10 15:40:25 +00003659#undef FAST_CHAR_MASK
3660#undef SWAPPED_FAST_CHAR_MASK
3661
Tim Peters772747b2001-08-09 22:21:55 +00003662PyObject *
3663PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 Py_ssize_t size,
3665 const char *errors,
3666 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003668 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003669 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003670 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003671#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003672 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003673#else
3674 const int pairs = 0;
3675#endif
Tim Peters772747b2001-08-09 22:21:55 +00003676 /* Offsets from p for storing byte pairs in the right order. */
3677#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3678 int ihi = 1, ilo = 0;
3679#else
3680 int ihi = 0, ilo = 1;
3681#endif
3682
Benjamin Peterson29060642009-01-31 22:14:21 +00003683#define STORECHAR(CH) \
3684 do { \
3685 p[ihi] = ((CH) >> 8) & 0xff; \
3686 p[ilo] = (CH) & 0xff; \
3687 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003688 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003690#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003691 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003692 if (s[i] >= 0x10000)
3693 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003694#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003695 /* 2 * (size + pairs + (byteorder == 0)) */
3696 if (size > PY_SSIZE_T_MAX ||
3697 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003699 nsize = size + pairs + (byteorder == 0);
3700 bytesize = nsize * 2;
3701 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003703 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 if (v == NULL)
3705 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003707 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003710 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003711 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003712
3713 if (byteorder == -1) {
3714 /* force LE */
3715 ihi = 1;
3716 ilo = 0;
3717 }
3718 else if (byteorder == 1) {
3719 /* force BE */
3720 ihi = 0;
3721 ilo = 1;
3722 }
3723
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003724 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003725 Py_UNICODE ch = *s++;
3726 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003727#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 if (ch >= 0x10000) {
3729 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3730 ch = 0xD800 | ((ch-0x10000) >> 10);
3731 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003732#endif
Tim Peters772747b2001-08-09 22:21:55 +00003733 STORECHAR(ch);
3734 if (ch2)
3735 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003736 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003737
3738 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003739 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003740#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741}
3742
Alexander Belopolsky40018472011-02-26 01:02:56 +00003743PyObject *
3744PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745{
3746 if (!PyUnicode_Check(unicode)) {
3747 PyErr_BadArgument();
3748 return NULL;
3749 }
3750 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003751 PyUnicode_GET_SIZE(unicode),
3752 NULL,
3753 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754}
3755
3756/* --- Unicode Escape Codec ----------------------------------------------- */
3757
Fredrik Lundh06d12682001-01-24 07:59:11 +00003758static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003759
Alexander Belopolsky40018472011-02-26 01:02:56 +00003760PyObject *
3761PyUnicode_DecodeUnicodeEscape(const char *s,
3762 Py_ssize_t size,
3763 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003766 Py_ssize_t startinpos;
3767 Py_ssize_t endinpos;
3768 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003773 char* message;
3774 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 PyObject *errorHandler = NULL;
3776 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003777
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 /* Escaped strings will always be longer than the resulting
3779 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780 length after conversion to the true value.
3781 (but if the error callback returns a long replacement string
3782 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 v = _PyUnicode_New(size);
3784 if (v == NULL)
3785 goto onError;
3786 if (size == 0)
3787 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003788
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003789 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 while (s < end) {
3793 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003794 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796
3797 /* Non-escape characters are interpreted as Unicode ordinals */
3798 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003799 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800 continue;
3801 }
3802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003803 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 /* \ - Escapes */
3805 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003806 c = *s++;
3807 if (s > end)
3808 c = '\0'; /* Invalid after \ */
3809 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810
Benjamin Peterson29060642009-01-31 22:14:21 +00003811 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 case '\n': break;
3813 case '\\': *p++ = '\\'; break;
3814 case '\'': *p++ = '\''; break;
3815 case '\"': *p++ = '\"'; break;
3816 case 'b': *p++ = '\b'; break;
3817 case 'f': *p++ = '\014'; break; /* FF */
3818 case 't': *p++ = '\t'; break;
3819 case 'n': *p++ = '\n'; break;
3820 case 'r': *p++ = '\r'; break;
3821 case 'v': *p++ = '\013'; break; /* VT */
3822 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3823
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 case '0': case '1': case '2': case '3':
3826 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003827 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003828 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003829 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003830 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003831 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003833 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 break;
3835
Benjamin Peterson29060642009-01-31 22:14:21 +00003836 /* hex escapes */
3837 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003839 digits = 2;
3840 message = "truncated \\xXX escape";
3841 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003845 digits = 4;
3846 message = "truncated \\uXXXX escape";
3847 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848
Benjamin Peterson29060642009-01-31 22:14:21 +00003849 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003850 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003851 digits = 8;
3852 message = "truncated \\UXXXXXXXX escape";
3853 hexescape:
3854 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003855 outpos = p-PyUnicode_AS_UNICODE(v);
3856 if (s+digits>end) {
3857 endinpos = size;
3858 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 errors, &errorHandler,
3860 "unicodeescape", "end of string in escape sequence",
3861 &starts, &end, &startinpos, &endinpos, &exc, &s,
3862 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003863 goto onError;
3864 goto nextByte;
3865 }
3866 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003867 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003868 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003869 endinpos = (s+i+1)-starts;
3870 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003871 errors, &errorHandler,
3872 "unicodeescape", message,
3873 &starts, &end, &startinpos, &endinpos, &exc, &s,
3874 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003875 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003877 }
3878 chr = (chr<<4) & ~0xF;
3879 if (c >= '0' && c <= '9')
3880 chr += c - '0';
3881 else if (c >= 'a' && c <= 'f')
3882 chr += 10 + c - 'a';
3883 else
3884 chr += 10 + c - 'A';
3885 }
3886 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003887 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 /* _decoding_error will have already written into the
3889 target buffer. */
3890 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003891 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003892 /* when we get here, chr is a 32-bit unicode character */
3893 if (chr <= 0xffff)
3894 /* UCS-2 character */
3895 *p++ = (Py_UNICODE) chr;
3896 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003897 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003898 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003899#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003900 *p++ = chr;
3901#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003902 chr -= 0x10000L;
3903 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003904 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003905#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003906 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 endinpos = s-starts;
3908 outpos = p-PyUnicode_AS_UNICODE(v);
3909 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003910 errors, &errorHandler,
3911 "unicodeescape", "illegal Unicode character",
3912 &starts, &end, &startinpos, &endinpos, &exc, &s,
3913 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003914 goto onError;
3915 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003916 break;
3917
Benjamin Peterson29060642009-01-31 22:14:21 +00003918 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003919 case 'N':
3920 message = "malformed \\N character escape";
3921 if (ucnhash_CAPI == NULL) {
3922 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003923 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003924 if (ucnhash_CAPI == NULL)
3925 goto ucnhashError;
3926 }
3927 if (*s == '{') {
3928 const char *start = s+1;
3929 /* look for the closing brace */
3930 while (*s != '}' && s < end)
3931 s++;
3932 if (s > start && s < end && *s == '}') {
3933 /* found a name. look it up in the unicode database */
3934 message = "unknown Unicode character name";
3935 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003936 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003937 goto store;
3938 }
3939 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003940 endinpos = s-starts;
3941 outpos = p-PyUnicode_AS_UNICODE(v);
3942 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003943 errors, &errorHandler,
3944 "unicodeescape", message,
3945 &starts, &end, &startinpos, &endinpos, &exc, &s,
3946 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003947 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003948 break;
3949
3950 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003951 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 message = "\\ at end of string";
3953 s--;
3954 endinpos = s-starts;
3955 outpos = p-PyUnicode_AS_UNICODE(v);
3956 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003957 errors, &errorHandler,
3958 "unicodeescape", message,
3959 &starts, &end, &startinpos, &endinpos, &exc, &s,
3960 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003961 goto onError;
3962 }
3963 else {
3964 *p++ = '\\';
3965 *p++ = (unsigned char)s[-1];
3966 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003967 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003970 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003972 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003974 Py_XDECREF(errorHandler);
3975 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003977
Benjamin Peterson29060642009-01-31 22:14:21 +00003978 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003979 PyErr_SetString(
3980 PyExc_UnicodeError,
3981 "\\N escapes not supported (can't load unicodedata module)"
3982 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003983 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 Py_XDECREF(errorHandler);
3985 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003986 return NULL;
3987
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 Py_XDECREF(errorHandler);
3991 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 return NULL;
3993}
3994
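/* Return a Unicode-Escape string version of the Unicode object.

   This note describes PyUnicode_EncodeUnicodeEscape() further below, not
   the findchar() helper that immediately follows.  The encoder takes no
   "quotes" argument and does not wrap its output in u"" or u'' quotes.

*/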
4001
Thomas Wouters477c8d52006-05-27 19:21:47 +00004002Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 Py_ssize_t size,
4004 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004005{
4006 /* like wcschr, but doesn't stop at NULL characters */
4007
4008 while (size-- > 0) {
4009 if (*s == ch)
4010 return s;
4011 s++;
4012 }
4013
4014 return NULL;
4015}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004016
Walter Dörwald79e913e2007-05-12 11:08:06 +00004017static const char *hexdigits = "0123456789abcdef";
4018
Alexander Belopolsky40018472011-02-26 01:02:56 +00004019PyObject *
4020PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4021 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004023 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004026#ifdef Py_UNICODE_WIDE
4027 const Py_ssize_t expandsize = 10;
4028#else
4029 const Py_ssize_t expandsize = 6;
4030#endif
4031
Thomas Wouters89f507f2006-12-13 04:49:30 +00004032 /* XXX(nnorwitz): rather than over-allocating, it would be
4033 better to choose a different scheme. Perhaps scan the
4034 first N-chars of the string and allocate based on that size.
4035 */
4036 /* Initial allocation is based on the longest-possible unichr
4037 escape.
4038
4039 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4040 unichr, so in this case it's the longest unichr escape. In
4041 narrow (UTF-16) builds this is five chars per source unichr
4042 since there are two unichrs in the surrogate pair, so in narrow
4043 (UTF-16) builds it's not the longest unichr escape.
4044
4045 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4046 so in the narrow (UTF-16) build case it's the longest unichr
4047 escape.
4048 */
4049
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004050 if (size == 0)
4051 return PyBytes_FromStringAndSize(NULL, 0);
4052
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004053 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004055
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004056 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004057 2
4058 + expandsize*size
4059 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 if (repr == NULL)
4061 return NULL;
4062
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004063 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 while (size-- > 0) {
4066 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004067
Walter Dörwald79e913e2007-05-12 11:08:06 +00004068 /* Escape backslashes */
4069 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 *p++ = '\\';
4071 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004072 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004073 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004074
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004075#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004076 /* Map 21-bit characters to '\U00xxxxxx' */
4077 else if (ch >= 0x10000) {
4078 *p++ = '\\';
4079 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004080 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4081 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4082 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4083 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4084 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4085 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4086 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4087 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004089 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004090#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4092 else if (ch >= 0xD800 && ch < 0xDC00) {
4093 Py_UNICODE ch2;
4094 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004095
Benjamin Peterson29060642009-01-31 22:14:21 +00004096 ch2 = *s++;
4097 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004098 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004099 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4100 *p++ = '\\';
4101 *p++ = 'U';
4102 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4103 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4104 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4105 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4106 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4107 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4108 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4109 *p++ = hexdigits[ucs & 0x0000000F];
4110 continue;
4111 }
4112 /* Fall through: isolated surrogates are copied as-is */
4113 s--;
4114 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004115 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004116#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004119 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 *p++ = '\\';
4121 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004122 *p++ = hexdigits[(ch >> 12) & 0x000F];
4123 *p++ = hexdigits[(ch >> 8) & 0x000F];
4124 *p++ = hexdigits[(ch >> 4) & 0x000F];
4125 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004127
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004128 /* Map special whitespace to '\t', \n', '\r' */
4129 else if (ch == '\t') {
4130 *p++ = '\\';
4131 *p++ = 't';
4132 }
4133 else if (ch == '\n') {
4134 *p++ = '\\';
4135 *p++ = 'n';
4136 }
4137 else if (ch == '\r') {
4138 *p++ = '\\';
4139 *p++ = 'r';
4140 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004141
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004142 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004143 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004145 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004146 *p++ = hexdigits[(ch >> 4) & 0x000F];
4147 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004148 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004149
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 /* Copy everything else as-is */
4151 else
4152 *p++ = (char) ch;
4153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004155 assert(p - PyBytes_AS_STRING(repr) > 0);
4156 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4157 return NULL;
4158 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159}
4160
Alexander Belopolsky40018472011-02-26 01:02:56 +00004161PyObject *
4162PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004164 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 if (!PyUnicode_Check(unicode)) {
4166 PyErr_BadArgument();
4167 return NULL;
4168 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004169 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4170 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004171 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172}
4173
4174/* --- Raw Unicode Escape Codec ------------------------------------------- */
4175
Alexander Belopolsky40018472011-02-26 01:02:56 +00004176PyObject *
4177PyUnicode_DecodeRawUnicodeEscape(const char *s,
4178 Py_ssize_t size,
4179 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004182 Py_ssize_t startinpos;
4183 Py_ssize_t endinpos;
4184 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187 const char *end;
4188 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 PyObject *errorHandler = NULL;
4190 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004191
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 /* Escaped strings will always be longer than the resulting
4193 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 length after conversion to the true value. (But decoding error
4195 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 v = _PyUnicode_New(size);
4197 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 end = s + size;
4203 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 unsigned char c;
4205 Py_UCS4 x;
4206 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004207 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208
Benjamin Peterson29060642009-01-31 22:14:21 +00004209 /* Non-escape characters are interpreted as Unicode ordinals */
4210 if (*s != '\\') {
4211 *p++ = (unsigned char)*s++;
4212 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004213 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004214 startinpos = s-starts;
4215
4216 /* \u-escapes are only interpreted iff the number of leading
4217 backslashes if odd */
4218 bs = s;
4219 for (;s < end;) {
4220 if (*s != '\\')
4221 break;
4222 *p++ = (unsigned char)*s++;
4223 }
4224 if (((s - bs) & 1) == 0 ||
4225 s >= end ||
4226 (*s != 'u' && *s != 'U')) {
4227 continue;
4228 }
4229 p--;
4230 count = *s=='u' ? 4 : 8;
4231 s++;
4232
4233 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4234 outpos = p-PyUnicode_AS_UNICODE(v);
4235 for (x = 0, i = 0; i < count; ++i, ++s) {
4236 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004237 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 endinpos = s-starts;
4239 if (unicode_decode_call_errorhandler(
4240 errors, &errorHandler,
4241 "rawunicodeescape", "truncated \\uXXXX",
4242 &starts, &end, &startinpos, &endinpos, &exc, &s,
4243 &v, &outpos, &p))
4244 goto onError;
4245 goto nextByte;
4246 }
4247 x = (x<<4) & ~0xF;
4248 if (c >= '0' && c <= '9')
4249 x += c - '0';
4250 else if (c >= 'a' && c <= 'f')
4251 x += 10 + c - 'a';
4252 else
4253 x += 10 + c - 'A';
4254 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004255 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 /* UCS-2 character */
4257 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004258 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 /* UCS-4 character. Either store directly, or as
4260 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004261#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004263#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004264 x -= 0x10000L;
4265 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4266 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004267#endif
4268 } else {
4269 endinpos = s-starts;
4270 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004271 if (unicode_decode_call_errorhandler(
4272 errors, &errorHandler,
4273 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 &starts, &end, &startinpos, &endinpos, &exc, &s,
4275 &v, &outpos, &p))
4276 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004277 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 nextByte:
4279 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004281 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 Py_XDECREF(errorHandler);
4284 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004286
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 Py_XDECREF(errorHandler);
4290 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 return NULL;
4292}
4293
Alexander Belopolsky40018472011-02-26 01:02:56 +00004294PyObject *
4295PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4296 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004298 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 char *p;
4300 char *q;
4301
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004302#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004303 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004304#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004305 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004306#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004307
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004308 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004310
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004311 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 if (repr == NULL)
4313 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004314 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004315 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004317 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 while (size-- > 0) {
4319 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004320#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004321 /* Map 32-bit characters to '\Uxxxxxxxx' */
4322 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004323 *p++ = '\\';
4324 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004325 *p++ = hexdigits[(ch >> 28) & 0xf];
4326 *p++ = hexdigits[(ch >> 24) & 0xf];
4327 *p++ = hexdigits[(ch >> 20) & 0xf];
4328 *p++ = hexdigits[(ch >> 16) & 0xf];
4329 *p++ = hexdigits[(ch >> 12) & 0xf];
4330 *p++ = hexdigits[(ch >> 8) & 0xf];
4331 *p++ = hexdigits[(ch >> 4) & 0xf];
4332 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004333 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004334 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004335#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4337 if (ch >= 0xD800 && ch < 0xDC00) {
4338 Py_UNICODE ch2;
4339 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004340
Benjamin Peterson29060642009-01-31 22:14:21 +00004341 ch2 = *s++;
4342 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004343 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4345 *p++ = '\\';
4346 *p++ = 'U';
4347 *p++ = hexdigits[(ucs >> 28) & 0xf];
4348 *p++ = hexdigits[(ucs >> 24) & 0xf];
4349 *p++ = hexdigits[(ucs >> 20) & 0xf];
4350 *p++ = hexdigits[(ucs >> 16) & 0xf];
4351 *p++ = hexdigits[(ucs >> 12) & 0xf];
4352 *p++ = hexdigits[(ucs >> 8) & 0xf];
4353 *p++ = hexdigits[(ucs >> 4) & 0xf];
4354 *p++ = hexdigits[ucs & 0xf];
4355 continue;
4356 }
4357 /* Fall through: isolated surrogates are copied as-is */
4358 s--;
4359 size++;
4360 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004361#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004362 /* Map 16-bit characters to '\uxxxx' */
4363 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 *p++ = '\\';
4365 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004366 *p++ = hexdigits[(ch >> 12) & 0xf];
4367 *p++ = hexdigits[(ch >> 8) & 0xf];
4368 *p++ = hexdigits[(ch >> 4) & 0xf];
4369 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 /* Copy everything else as-is */
4372 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373 *p++ = (char) ch;
4374 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004375 size = p - q;
4376
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004377 assert(size > 0);
4378 if (_PyBytes_Resize(&repr, size) < 0)
4379 return NULL;
4380 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381}
4382
Alexander Belopolsky40018472011-02-26 01:02:56 +00004383PyObject *
4384PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004386 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004388 PyErr_BadArgument();
4389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004391 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4392 PyUnicode_GET_SIZE(unicode));
4393
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004394 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395}
4396
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004397/* --- Unicode Internal Codec ------------------------------------------- */
4398
Alexander Belopolsky40018472011-02-26 01:02:56 +00004399PyObject *
4400_PyUnicode_DecodeUnicodeInternal(const char *s,
4401 Py_ssize_t size,
4402 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004403{
4404 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004405 Py_ssize_t startinpos;
4406 Py_ssize_t endinpos;
4407 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004408 PyUnicodeObject *v;
4409 Py_UNICODE *p;
4410 const char *end;
4411 const char *reason;
4412 PyObject *errorHandler = NULL;
4413 PyObject *exc = NULL;
4414
Neal Norwitzd43069c2006-01-08 01:12:10 +00004415#ifdef Py_UNICODE_WIDE
4416 Py_UNICODE unimax = PyUnicode_GetMax();
4417#endif
4418
Thomas Wouters89f507f2006-12-13 04:49:30 +00004419 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004420 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4421 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004423 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004424 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004425 p = PyUnicode_AS_UNICODE(v);
4426 end = s + size;
4427
4428 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004429 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004430 /* We have to sanity check the raw data, otherwise doom looms for
4431 some malformed UCS-4 data. */
4432 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004433#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004434 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004435#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004436 end-s < Py_UNICODE_SIZE
4437 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004439 startinpos = s - starts;
4440 if (end-s < Py_UNICODE_SIZE) {
4441 endinpos = end-starts;
4442 reason = "truncated input";
4443 }
4444 else {
4445 endinpos = s - starts + Py_UNICODE_SIZE;
4446 reason = "illegal code point (> 0x10FFFF)";
4447 }
4448 outpos = p - PyUnicode_AS_UNICODE(v);
4449 if (unicode_decode_call_errorhandler(
4450 errors, &errorHandler,
4451 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004452 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004453 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004454 goto onError;
4455 }
4456 }
4457 else {
4458 p++;
4459 s += Py_UNICODE_SIZE;
4460 }
4461 }
4462
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004463 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004464 goto onError;
4465 Py_XDECREF(errorHandler);
4466 Py_XDECREF(exc);
4467 return (PyObject *)v;
4468
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004470 Py_XDECREF(v);
4471 Py_XDECREF(errorHandler);
4472 Py_XDECREF(exc);
4473 return NULL;
4474}
4475
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476/* --- Latin-1 Codec ------------------------------------------------------ */
4477
Alexander Belopolsky40018472011-02-26 01:02:56 +00004478PyObject *
4479PyUnicode_DecodeLatin1(const char *s,
4480 Py_ssize_t size,
4481 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482{
4483 PyUnicodeObject *v;
4484 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004485 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004486
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004488 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 Py_UNICODE r = *(unsigned char*)s;
4490 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004491 }
4492
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 v = _PyUnicode_New(size);
4494 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004499 e = s + size;
4500 /* Unrolling the copy makes it much faster by reducing the looping
4501 overhead. This is similar to what many memcpy() implementations do. */
4502 unrolled_end = e - 4;
4503 while (s < unrolled_end) {
4504 p[0] = (unsigned char) s[0];
4505 p[1] = (unsigned char) s[1];
4506 p[2] = (unsigned char) s[2];
4507 p[3] = (unsigned char) s[3];
4508 s += 4;
4509 p += 4;
4510 }
4511 while (s < e)
4512 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004514
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 Py_XDECREF(v);
4517 return NULL;
4518}
4519
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004521static void
4522make_encode_exception(PyObject **exceptionObject,
4523 const char *encoding,
4524 const Py_UNICODE *unicode, Py_ssize_t size,
4525 Py_ssize_t startpos, Py_ssize_t endpos,
4526 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 *exceptionObject = PyUnicodeEncodeError_Create(
4530 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 }
4532 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4534 goto onError;
4535 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4536 goto onError;
4537 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4538 goto onError;
4539 return;
4540 onError:
4541 Py_DECREF(*exceptionObject);
4542 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 }
4544}
4545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004547static void
4548raise_encode_exception(PyObject **exceptionObject,
4549 const char *encoding,
4550 const Py_UNICODE *unicode, Py_ssize_t size,
4551 Py_ssize_t startpos, Py_ssize_t endpos,
4552 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553{
4554 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004557 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558}
4559
4560/* error handling callback helper:
4561 build arguments, call the callback and check the arguments,
4562 put the result into newpos and return the replacement string, which
4563 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004564static PyObject *
4565unicode_encode_call_errorhandler(const char *errors,
4566 PyObject **errorHandler,
4567 const char *encoding, const char *reason,
4568 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4569 Py_ssize_t startpos, Py_ssize_t endpos,
4570 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004572 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573
4574 PyObject *restuple;
4575 PyObject *resunicode;
4576
4577 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004578 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004580 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 }
4582
4583 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587
4588 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004593 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 Py_DECREF(restuple);
4595 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004597 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 &resunicode, newpos)) {
4599 Py_DECREF(restuple);
4600 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004602 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4603 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4604 Py_DECREF(restuple);
4605 return NULL;
4606 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004608 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004609 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4611 Py_DECREF(restuple);
4612 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 Py_INCREF(resunicode);
4615 Py_DECREF(restuple);
4616 return resunicode;
4617}
4618
Alexander Belopolsky40018472011-02-26 01:02:56 +00004619static PyObject *
4620unicode_encode_ucs1(const Py_UNICODE *p,
4621 Py_ssize_t size,
4622 const char *errors,
4623 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624{
4625 /* output object */
4626 PyObject *res;
4627 /* pointers to the beginning and end+1 of input */
4628 const Py_UNICODE *startp = p;
4629 const Py_UNICODE *endp = p + size;
4630 /* pointer to the beginning of the unencodable characters */
4631 /* const Py_UNICODE *badp = NULL; */
4632 /* pointer into the output */
4633 char *str;
4634 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004635 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004636 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4637 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 PyObject *errorHandler = NULL;
4639 PyObject *exc = NULL;
4640 /* the following variable is used for caching string comparisons
4641 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4642 int known_errorHandler = -1;
4643
4644 /* allocate enough for a simple encoding without
4645 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004646 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004647 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004648 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004650 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004651 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 ressize = size;
4653
4654 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 /* can we encode this? */
4658 if (c<limit) {
4659 /* no overflow check, because we know that the space is enough */
4660 *str++ = (char)c;
4661 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004662 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 else {
4664 Py_ssize_t unicodepos = p-startp;
4665 Py_ssize_t requiredsize;
4666 PyObject *repunicode;
4667 Py_ssize_t repsize;
4668 Py_ssize_t newpos;
4669 Py_ssize_t respos;
4670 Py_UNICODE *uni2;
4671 /* startpos for collecting unencodable chars */
4672 const Py_UNICODE *collstart = p;
4673 const Py_UNICODE *collend = p;
4674 /* find all unecodable characters */
4675 while ((collend < endp) && ((*collend)>=limit))
4676 ++collend;
4677 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4678 if (known_errorHandler==-1) {
4679 if ((errors==NULL) || (!strcmp(errors, "strict")))
4680 known_errorHandler = 1;
4681 else if (!strcmp(errors, "replace"))
4682 known_errorHandler = 2;
4683 else if (!strcmp(errors, "ignore"))
4684 known_errorHandler = 3;
4685 else if (!strcmp(errors, "xmlcharrefreplace"))
4686 known_errorHandler = 4;
4687 else
4688 known_errorHandler = 0;
4689 }
4690 switch (known_errorHandler) {
4691 case 1: /* strict */
4692 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4693 goto onError;
4694 case 2: /* replace */
4695 while (collstart++<collend)
4696 *str++ = '?'; /* fall through */
4697 case 3: /* ignore */
4698 p = collend;
4699 break;
4700 case 4: /* xmlcharrefreplace */
4701 respos = str - PyBytes_AS_STRING(res);
4702 /* determine replacement size (temporarily (mis)uses p) */
4703 for (p = collstart, repsize = 0; p < collend; ++p) {
4704 if (*p<10)
4705 repsize += 2+1+1;
4706 else if (*p<100)
4707 repsize += 2+2+1;
4708 else if (*p<1000)
4709 repsize += 2+3+1;
4710 else if (*p<10000)
4711 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004712#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004713 else
4714 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004715#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 else if (*p<100000)
4717 repsize += 2+5+1;
4718 else if (*p<1000000)
4719 repsize += 2+6+1;
4720 else
4721 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004722#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 }
4724 requiredsize = respos+repsize+(endp-collend);
4725 if (requiredsize > ressize) {
4726 if (requiredsize<2*ressize)
4727 requiredsize = 2*ressize;
4728 if (_PyBytes_Resize(&res, requiredsize))
4729 goto onError;
4730 str = PyBytes_AS_STRING(res) + respos;
4731 ressize = requiredsize;
4732 }
4733 /* generate replacement (temporarily (mis)uses p) */
4734 for (p = collstart; p < collend; ++p) {
4735 str += sprintf(str, "&#%d;", (int)*p);
4736 }
4737 p = collend;
4738 break;
4739 default:
4740 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4741 encoding, reason, startp, size, &exc,
4742 collstart-startp, collend-startp, &newpos);
4743 if (repunicode == NULL)
4744 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004745 if (PyBytes_Check(repunicode)) {
4746 /* Directly copy bytes result to output. */
4747 repsize = PyBytes_Size(repunicode);
4748 if (repsize > 1) {
4749 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004750 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004751 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4752 Py_DECREF(repunicode);
4753 goto onError;
4754 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004755 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004756 ressize += repsize-1;
4757 }
4758 memcpy(str, PyBytes_AsString(repunicode), repsize);
4759 str += repsize;
4760 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004761 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004762 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004763 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 /* need more space? (at least enough for what we
4765 have+the replacement+the rest of the string, so
4766 we won't have to check space for encodable characters) */
4767 respos = str - PyBytes_AS_STRING(res);
4768 repsize = PyUnicode_GET_SIZE(repunicode);
4769 requiredsize = respos+repsize+(endp-collend);
4770 if (requiredsize > ressize) {
4771 if (requiredsize<2*ressize)
4772 requiredsize = 2*ressize;
4773 if (_PyBytes_Resize(&res, requiredsize)) {
4774 Py_DECREF(repunicode);
4775 goto onError;
4776 }
4777 str = PyBytes_AS_STRING(res) + respos;
4778 ressize = requiredsize;
4779 }
4780 /* check if there is anything unencodable in the replacement
4781 and copy it to the output */
4782 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4783 c = *uni2;
4784 if (c >= limit) {
4785 raise_encode_exception(&exc, encoding, startp, size,
4786 unicodepos, unicodepos+1, reason);
4787 Py_DECREF(repunicode);
4788 goto onError;
4789 }
4790 *str = (char)c;
4791 }
4792 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004793 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004794 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004795 }
4796 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004797 /* Resize if we allocated to much */
4798 size = str - PyBytes_AS_STRING(res);
4799 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004800 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004801 if (_PyBytes_Resize(&res, size) < 0)
4802 goto onError;
4803 }
4804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 Py_XDECREF(errorHandler);
4806 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004807 return res;
4808
4809 onError:
4810 Py_XDECREF(res);
4811 Py_XDECREF(errorHandler);
4812 Py_XDECREF(exc);
4813 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814}
4815
Alexander Belopolsky40018472011-02-26 01:02:56 +00004816PyObject *
4817PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4818 Py_ssize_t size,
4819 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822}
4823
Alexander Belopolsky40018472011-02-26 01:02:56 +00004824PyObject *
4825PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826{
4827 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 PyErr_BadArgument();
4829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 }
4831 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 PyUnicode_GET_SIZE(unicode),
4833 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
4835
4836/* --- 7-bit ASCII Codec -------------------------------------------------- */
4837
Alexander Belopolsky40018472011-02-26 01:02:56 +00004838PyObject *
4839PyUnicode_DecodeASCII(const char *s,
4840 Py_ssize_t size,
4841 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 PyUnicodeObject *v;
4845 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004846 Py_ssize_t startinpos;
4847 Py_ssize_t endinpos;
4848 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 const char *e;
4850 PyObject *errorHandler = NULL;
4851 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004852
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004854 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 Py_UNICODE r = *(unsigned char*)s;
4856 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004857 }
Tim Petersced69f82003-09-16 20:30:58 +00004858
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 v = _PyUnicode_New(size);
4860 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004863 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004865 e = s + size;
4866 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 register unsigned char c = (unsigned char)*s;
4868 if (c < 128) {
4869 *p++ = c;
4870 ++s;
4871 }
4872 else {
4873 startinpos = s-starts;
4874 endinpos = startinpos + 1;
4875 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4876 if (unicode_decode_call_errorhandler(
4877 errors, &errorHandler,
4878 "ascii", "ordinal not in range(128)",
4879 &starts, &e, &startinpos, &endinpos, &exc, &s,
4880 &v, &outpos, &p))
4881 goto onError;
4882 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004884 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4886 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 Py_XDECREF(errorHandler);
4888 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004890
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 Py_XDECREF(errorHandler);
4894 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 return NULL;
4896}
4897
Alexander Belopolsky40018472011-02-26 01:02:56 +00004898PyObject *
4899PyUnicode_EncodeASCII(const Py_UNICODE *p,
4900 Py_ssize_t size,
4901 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904}
4905
Alexander Belopolsky40018472011-02-26 01:02:56 +00004906PyObject *
4907PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908{
4909 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 PyErr_BadArgument();
4911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 }
4913 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 PyUnicode_GET_SIZE(unicode),
4915 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916}
4917
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004918#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004919
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004920/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004921
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004922#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004923#define NEED_RETRY
4924#endif
4925
4926/* XXX This code is limited to "true" double-byte encodings, as
4927 a) it assumes an incomplete character consists of a single byte, and
4928 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004930
/* Return 1 if the byte at s[offset] is a DBCS lead byte that starts a
   character (rather than being the second byte of one), 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    /* curr looks like a lead byte; inspect the character before it */
    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4942
4943/*
4944 * Decode MBCS string into unicode object. If 'final' is set, converts
4945 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4946 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004947static int
4948decode_mbcs(PyUnicodeObject **v,
4949 const char *s, /* MBCS string */
4950 int size, /* sizeof MBCS string */
4951 int final,
4952 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004953{
4954 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004955 Py_ssize_t n;
4956 DWORD usize;
4957 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004958
4959 assert(size >= 0);
4960
Victor Stinner554f3f02010-06-16 23:33:54 +00004961 /* check and handle 'errors' arg */
4962 if (errors==NULL || strcmp(errors, "strict")==0)
4963 flags = MB_ERR_INVALID_CHARS;
4964 else if (strcmp(errors, "ignore")==0)
4965 flags = 0;
4966 else {
4967 PyErr_Format(PyExc_ValueError,
4968 "mbcs encoding does not support errors='%s'",
4969 errors);
4970 return -1;
4971 }
4972
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004973 /* Skip trailing lead-byte unless 'final' is set */
4974 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004976
4977 /* First get the size of the result */
4978 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004979 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4980 if (usize==0)
4981 goto mbcs_decode_error;
4982 } else
4983 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004984
4985 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 /* Create unicode object */
4987 *v = _PyUnicode_New(usize);
4988 if (*v == NULL)
4989 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004990 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004991 }
4992 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 /* Extend unicode object */
4994 n = PyUnicode_GET_SIZE(*v);
4995 if (_PyUnicode_Resize(v, n + usize) < 0)
4996 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004997 }
4998
4999 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00005000 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005002 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5003 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005005 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005006 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005007
5008mbcs_decode_error:
5009 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5010 we raise a UnicodeDecodeError - else it is a 'generic'
5011 windows error
5012 */
5013 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5014 /* Ideally, we should get reason from FormatMessage - this
5015 is the Windows 2000 English version of the message
5016 */
5017 PyObject *exc = NULL;
5018 const char *reason = "No mapping for the Unicode character exists "
5019 "in the target multi-byte code page.";
5020 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5021 if (exc != NULL) {
5022 PyCodec_StrictErrors(exc);
5023 Py_DECREF(exc);
5024 }
5025 } else {
5026 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5027 }
5028 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005029}
5030
Alexander Belopolsky40018472011-02-26 01:02:56 +00005031PyObject *
5032PyUnicode_DecodeMBCSStateful(const char *s,
5033 Py_ssize_t size,
5034 const char *errors,
5035 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005036{
5037 PyUnicodeObject *v = NULL;
5038 int done;
5039
5040 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005042
5043#ifdef NEED_RETRY
5044 retry:
5045 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005046 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005047 else
5048#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005049 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005050
5051 if (done < 0) {
5052 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005054 }
5055
5056 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005058
5059#ifdef NEED_RETRY
5060 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 s += done;
5062 size -= done;
5063 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005064 }
5065#endif
5066
5067 return (PyObject *)v;
5068}
5069
Alexander Belopolsky40018472011-02-26 01:02:56 +00005070PyObject *
5071PyUnicode_DecodeMBCS(const char *s,
5072 Py_ssize_t size,
5073 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005074{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005075 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5076}
5077
5078/*
5079 * Convert unicode into string object (MBCS).
5080 * Returns 0 if succeed, -1 otherwise.
5081 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005082static int
5083encode_mbcs(PyObject **repr,
5084 const Py_UNICODE *p, /* unicode */
5085 int size, /* size of unicode */
5086 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005087{
Victor Stinner554f3f02010-06-16 23:33:54 +00005088 BOOL usedDefaultChar = FALSE;
5089 BOOL *pusedDefaultChar;
5090 int mbcssize;
5091 Py_ssize_t n;
5092 PyObject *exc = NULL;
5093 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005094
5095 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005096
Victor Stinner554f3f02010-06-16 23:33:54 +00005097 /* check and handle 'errors' arg */
5098 if (errors==NULL || strcmp(errors, "strict")==0) {
5099 flags = WC_NO_BEST_FIT_CHARS;
5100 pusedDefaultChar = &usedDefaultChar;
5101 } else if (strcmp(errors, "replace")==0) {
5102 flags = 0;
5103 pusedDefaultChar = NULL;
5104 } else {
5105 PyErr_Format(PyExc_ValueError,
5106 "mbcs encoding does not support errors='%s'",
5107 errors);
5108 return -1;
5109 }
5110
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005111 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005112 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005113 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5114 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 if (mbcssize == 0) {
5116 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5117 return -1;
5118 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005119 /* If we used a default char, then we failed! */
5120 if (pusedDefaultChar && *pusedDefaultChar)
5121 goto mbcs_encode_error;
5122 } else {
5123 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005124 }
5125
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005126 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 /* Create string object */
5128 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5129 if (*repr == NULL)
5130 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005131 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005132 }
5133 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 /* Extend string object */
5135 n = PyBytes_Size(*repr);
5136 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5137 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005138 }
5139
5140 /* Do the conversion */
5141 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005143 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5144 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5146 return -1;
5147 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005148 if (pusedDefaultChar && *pusedDefaultChar)
5149 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005150 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005151 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005152
5153mbcs_encode_error:
5154 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5155 Py_XDECREF(exc);
5156 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005157}
5158
Alexander Belopolsky40018472011-02-26 01:02:56 +00005159PyObject *
5160PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5161 Py_ssize_t size,
5162 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005163{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005164 PyObject *repr = NULL;
5165 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005166
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005167#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005169 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005170 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005171 else
5172#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005173 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005174
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005175 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 Py_XDECREF(repr);
5177 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005178 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005179
5180#ifdef NEED_RETRY
5181 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 p += INT_MAX;
5183 size -= INT_MAX;
5184 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005185 }
5186#endif
5187
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005188 return repr;
5189}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005190
Alexander Belopolsky40018472011-02-26 01:02:56 +00005191PyObject *
5192PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005193{
5194 if (!PyUnicode_Check(unicode)) {
5195 PyErr_BadArgument();
5196 return NULL;
5197 }
5198 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 PyUnicode_GET_SIZE(unicode),
5200 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005201}
5202
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005203#undef NEED_RETRY
5204
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005205#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005206
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207/* --- Character Mapping Codec -------------------------------------------- */
5208
Alexander Belopolsky40018472011-02-26 01:02:56 +00005209PyObject *
5210PyUnicode_DecodeCharmap(const char *s,
5211 Py_ssize_t size,
5212 PyObject *mapping,
5213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005215 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005216 Py_ssize_t startinpos;
5217 Py_ssize_t endinpos;
5218 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005219 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 PyUnicodeObject *v;
5221 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005222 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005223 PyObject *errorHandler = NULL;
5224 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005225 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005226 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005227
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228 /* Default to Latin-1 */
5229 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
5232 v = _PyUnicode_New(size);
5233 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005239 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 mapstring = PyUnicode_AS_UNICODE(mapping);
5241 maplen = PyUnicode_GET_SIZE(mapping);
5242 while (s < e) {
5243 unsigned char ch = *s;
5244 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 if (ch < maplen)
5247 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 if (x == 0xfffe) {
5250 /* undefined mapping */
5251 outpos = p-PyUnicode_AS_UNICODE(v);
5252 startinpos = s-starts;
5253 endinpos = startinpos+1;
5254 if (unicode_decode_call_errorhandler(
5255 errors, &errorHandler,
5256 "charmap", "character maps to <undefined>",
5257 &starts, &e, &startinpos, &endinpos, &exc, &s,
5258 &v, &outpos, &p)) {
5259 goto onError;
5260 }
5261 continue;
5262 }
5263 *p++ = x;
5264 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005265 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005266 }
5267 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 while (s < e) {
5269 unsigned char ch = *s;
5270 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005271
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5273 w = PyLong_FromLong((long)ch);
5274 if (w == NULL)
5275 goto onError;
5276 x = PyObject_GetItem(mapping, w);
5277 Py_DECREF(w);
5278 if (x == NULL) {
5279 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5280 /* No mapping found means: mapping is undefined. */
5281 PyErr_Clear();
5282 x = Py_None;
5283 Py_INCREF(x);
5284 } else
5285 goto onError;
5286 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005287
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 /* Apply mapping */
5289 if (PyLong_Check(x)) {
5290 long value = PyLong_AS_LONG(x);
5291 if (value < 0 || value > 65535) {
5292 PyErr_SetString(PyExc_TypeError,
5293 "character mapping must be in range(65536)");
5294 Py_DECREF(x);
5295 goto onError;
5296 }
5297 *p++ = (Py_UNICODE)value;
5298 }
5299 else if (x == Py_None) {
5300 /* undefined mapping */
5301 outpos = p-PyUnicode_AS_UNICODE(v);
5302 startinpos = s-starts;
5303 endinpos = startinpos+1;
5304 if (unicode_decode_call_errorhandler(
5305 errors, &errorHandler,
5306 "charmap", "character maps to <undefined>",
5307 &starts, &e, &startinpos, &endinpos, &exc, &s,
5308 &v, &outpos, &p)) {
5309 Py_DECREF(x);
5310 goto onError;
5311 }
5312 Py_DECREF(x);
5313 continue;
5314 }
5315 else if (PyUnicode_Check(x)) {
5316 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005317
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 if (targetsize == 1)
5319 /* 1-1 mapping */
5320 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005321
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 else if (targetsize > 1) {
5323 /* 1-n mapping */
5324 if (targetsize > extrachars) {
5325 /* resize first */
5326 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5327 Py_ssize_t needed = (targetsize - extrachars) + \
5328 (targetsize << 2);
5329 extrachars += needed;
5330 /* XXX overflow detection missing */
5331 if (_PyUnicode_Resize(&v,
5332 PyUnicode_GET_SIZE(v) + needed) < 0) {
5333 Py_DECREF(x);
5334 goto onError;
5335 }
5336 p = PyUnicode_AS_UNICODE(v) + oldpos;
5337 }
5338 Py_UNICODE_COPY(p,
5339 PyUnicode_AS_UNICODE(x),
5340 targetsize);
5341 p += targetsize;
5342 extrachars -= targetsize;
5343 }
5344 /* 1-0 mapping: skip the character */
5345 }
5346 else {
5347 /* wrong return value */
5348 PyErr_SetString(PyExc_TypeError,
5349 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005350 Py_DECREF(x);
5351 goto onError;
5352 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 Py_DECREF(x);
5354 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005355 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 }
5357 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5359 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 Py_XDECREF(errorHandler);
5361 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005363
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365 Py_XDECREF(errorHandler);
5366 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 Py_XDECREF(v);
5368 return NULL;
5369}
5370
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005371/* Charmap encoding: the lookup table */
5372
Alexander Belopolsky40018472011-02-26 01:02:56 +00005373struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 PyObject_HEAD
5375 unsigned char level1[32];
5376 int count2, count3;
5377 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005378};
5379
5380static PyObject*
5381encoding_map_size(PyObject *obj, PyObject* args)
5382{
5383 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005384 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005386}
5387
5388static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005389 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 PyDoc_STR("Return the size (in bytes) of this object") },
5391 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005392};
5393
/* tp_dealloc slot of EncodingMapType.  struct encoding_map only contains
   byte arrays and ints (no references to other objects), so releasing the
   memory block is all that is done here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
5399
/* Type object for struct encoding_map instances.  Only the deallocator
   and the method table (encoding_map_methods) are filled in; every other
   slot is left at 0.  PyUnicode_BuildEncodingMap() attaches this type to
   the objects it allocates, and charmapencode_output() compares
   Py_TYPE(mapping) against it to select the table lookup. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
5443
5444PyObject*
5445PyUnicode_BuildEncodingMap(PyObject* string)
5446{
5447 Py_UNICODE *decode;
5448 PyObject *result;
5449 struct encoding_map *mresult;
5450 int i;
5451 int need_dict = 0;
5452 unsigned char level1[32];
5453 unsigned char level2[512];
5454 unsigned char *mlevel1, *mlevel2, *mlevel3;
5455 int count2 = 0, count3 = 0;
5456
5457 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5458 PyErr_BadArgument();
5459 return NULL;
5460 }
5461 decode = PyUnicode_AS_UNICODE(string);
5462 memset(level1, 0xFF, sizeof level1);
5463 memset(level2, 0xFF, sizeof level2);
5464
5465 /* If there isn't a one-to-one mapping of NULL to \0,
5466 or if there are non-BMP characters, we need to use
5467 a mapping dictionary. */
5468 if (decode[0] != 0)
5469 need_dict = 1;
5470 for (i = 1; i < 256; i++) {
5471 int l1, l2;
5472 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005473#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005474 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005475#endif
5476 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005477 need_dict = 1;
5478 break;
5479 }
5480 if (decode[i] == 0xFFFE)
5481 /* unmapped character */
5482 continue;
5483 l1 = decode[i] >> 11;
5484 l2 = decode[i] >> 7;
5485 if (level1[l1] == 0xFF)
5486 level1[l1] = count2++;
5487 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005488 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005489 }
5490
5491 if (count2 >= 0xFF || count3 >= 0xFF)
5492 need_dict = 1;
5493
5494 if (need_dict) {
5495 PyObject *result = PyDict_New();
5496 PyObject *key, *value;
5497 if (!result)
5498 return NULL;
5499 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005500 key = PyLong_FromLong(decode[i]);
5501 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005502 if (!key || !value)
5503 goto failed1;
5504 if (PyDict_SetItem(result, key, value) == -1)
5505 goto failed1;
5506 Py_DECREF(key);
5507 Py_DECREF(value);
5508 }
5509 return result;
5510 failed1:
5511 Py_XDECREF(key);
5512 Py_XDECREF(value);
5513 Py_DECREF(result);
5514 return NULL;
5515 }
5516
5517 /* Create a three-level trie */
5518 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5519 16*count2 + 128*count3 - 1);
5520 if (!result)
5521 return PyErr_NoMemory();
5522 PyObject_Init(result, &EncodingMapType);
5523 mresult = (struct encoding_map*)result;
5524 mresult->count2 = count2;
5525 mresult->count3 = count3;
5526 mlevel1 = mresult->level1;
5527 mlevel2 = mresult->level23;
5528 mlevel3 = mresult->level23 + 16*count2;
5529 memcpy(mlevel1, level1, 32);
5530 memset(mlevel2, 0xFF, 16*count2);
5531 memset(mlevel3, 0, 128*count3);
5532 count3 = 0;
5533 for (i = 1; i < 256; i++) {
5534 int o1, o2, o3, i2, i3;
5535 if (decode[i] == 0xFFFE)
5536 /* unmapped character */
5537 continue;
5538 o1 = decode[i]>>11;
5539 o2 = (decode[i]>>7) & 0xF;
5540 i2 = 16*mlevel1[o1] + o2;
5541 if (mlevel2[i2] == 0xFF)
5542 mlevel2[i2] = count3++;
5543 o3 = decode[i] & 0x7F;
5544 i3 = 128*mlevel2[i2] + o3;
5545 mlevel3[i3] = i;
5546 }
5547 return result;
5548}
5549
5550static int
5551encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5552{
5553 struct encoding_map *map = (struct encoding_map*)mapping;
5554 int l1 = c>>11;
5555 int l2 = (c>>7) & 0xF;
5556 int l3 = c & 0x7F;
5557 int i;
5558
5559#ifdef Py_UNICODE_WIDE
5560 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005562 }
5563#endif
5564 if (c == 0)
5565 return 0;
5566 /* level 1*/
5567 i = map->level1[l1];
5568 if (i == 0xFF) {
5569 return -1;
5570 }
5571 /* level 2*/
5572 i = map->level23[16*i+l2];
5573 if (i == 0xFF) {
5574 return -1;
5575 }
5576 /* level 3 */
5577 i = map->level23[16*map->count2 + 128*i + l3];
5578 if (i == 0) {
5579 return -1;
5580 }
5581 return i;
5582}
5583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005584/* Lookup the character ch in the mapping. If the character
5585 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005586 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005587static PyObject *
5588charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589{
Christian Heimes217cfd12007-12-02 14:31:20 +00005590 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 PyObject *x;
5592
5593 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005595 x = PyObject_GetItem(mapping, w);
5596 Py_DECREF(w);
5597 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5599 /* No mapping found means: mapping is undefined. */
5600 PyErr_Clear();
5601 x = Py_None;
5602 Py_INCREF(x);
5603 return x;
5604 } else
5605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005607 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005609 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 long value = PyLong_AS_LONG(x);
5611 if (value < 0 || value > 255) {
5612 PyErr_SetString(PyExc_TypeError,
5613 "character mapping must be in range(256)");
5614 Py_DECREF(x);
5615 return NULL;
5616 }
5617 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005619 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 /* wrong return value */
5623 PyErr_Format(PyExc_TypeError,
5624 "character mapping must return integer, bytes or None, not %.400s",
5625 x->ob_type->tp_name);
5626 Py_DECREF(x);
5627 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 }
5629}
5630
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005631static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005632charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005633{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005634 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5635 /* exponentially overallocate to minimize reallocations */
5636 if (requiredsize < 2*outsize)
5637 requiredsize = 2*outsize;
5638 if (_PyBytes_Resize(outobj, requiredsize))
5639 return -1;
5640 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005641}
5642
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no encoding for the character */
    enc_EXCEPTION       /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005647 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 space is available. Return a new reference to the object that
5649 was put in the output buffer, or Py_None, if the mapping was undefined
5650 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005651 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005652static charmapencode_result
5653charmapencode_output(Py_UNICODE c, PyObject *mapping,
5654 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005656 PyObject *rep;
5657 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005658 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659
Christian Heimes90aa7642007-12-19 02:45:37 +00005660 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005661 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005663 if (res == -1)
5664 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 if (outsize<requiredsize)
5666 if (charmapencode_resize(outobj, outpos, requiredsize))
5667 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005668 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 outstart[(*outpos)++] = (char)res;
5670 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005671 }
5672
5673 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005676 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 Py_DECREF(rep);
5678 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005679 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 if (PyLong_Check(rep)) {
5681 Py_ssize_t requiredsize = *outpos+1;
5682 if (outsize<requiredsize)
5683 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5684 Py_DECREF(rep);
5685 return enc_EXCEPTION;
5686 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005687 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005689 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 else {
5691 const char *repchars = PyBytes_AS_STRING(rep);
5692 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5693 Py_ssize_t requiredsize = *outpos+repsize;
5694 if (outsize<requiredsize)
5695 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5696 Py_DECREF(rep);
5697 return enc_EXCEPTION;
5698 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005699 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 memcpy(outstart + *outpos, repchars, repsize);
5701 *outpos += repsize;
5702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005704 Py_DECREF(rep);
5705 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706}
5707
5708/* handle an error in PyUnicode_EncodeCharmap
5709 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005710static int
5711charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005712 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005714 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005715 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716{
5717 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005718 Py_ssize_t repsize;
5719 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 Py_UNICODE *uni2;
5721 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005722 Py_ssize_t collstartpos = *inpos;
5723 Py_ssize_t collendpos = *inpos+1;
5724 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 char *encoding = "charmap";
5726 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005727 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729 /* find all unencodable characters */
5730 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005731 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005732 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 int res = encoding_map_lookup(p[collendpos], mapping);
5734 if (res != -1)
5735 break;
5736 ++collendpos;
5737 continue;
5738 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005739
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 rep = charmapencode_lookup(p[collendpos], mapping);
5741 if (rep==NULL)
5742 return -1;
5743 else if (rep!=Py_None) {
5744 Py_DECREF(rep);
5745 break;
5746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005747 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 }
5750 /* cache callback name lookup
5751 * (if not done yet, i.e. it's the first error) */
5752 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 if ((errors==NULL) || (!strcmp(errors, "strict")))
5754 *known_errorHandler = 1;
5755 else if (!strcmp(errors, "replace"))
5756 *known_errorHandler = 2;
5757 else if (!strcmp(errors, "ignore"))
5758 *known_errorHandler = 3;
5759 else if (!strcmp(errors, "xmlcharrefreplace"))
5760 *known_errorHandler = 4;
5761 else
5762 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763 }
5764 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005765 case 1: /* strict */
5766 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5767 return -1;
5768 case 2: /* replace */
5769 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 x = charmapencode_output('?', mapping, res, respos);
5771 if (x==enc_EXCEPTION) {
5772 return -1;
5773 }
5774 else if (x==enc_FAILED) {
5775 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5776 return -1;
5777 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005778 }
5779 /* fall through */
5780 case 3: /* ignore */
5781 *inpos = collendpos;
5782 break;
5783 case 4: /* xmlcharrefreplace */
5784 /* generate replacement (temporarily (mis)uses p) */
5785 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 char buffer[2+29+1+1];
5787 char *cp;
5788 sprintf(buffer, "&#%d;", (int)p[collpos]);
5789 for (cp = buffer; *cp; ++cp) {
5790 x = charmapencode_output(*cp, mapping, res, respos);
5791 if (x==enc_EXCEPTION)
5792 return -1;
5793 else if (x==enc_FAILED) {
5794 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5795 return -1;
5796 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005797 }
5798 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005799 *inpos = collendpos;
5800 break;
5801 default:
5802 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 encoding, reason, p, size, exceptionObject,
5804 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005805 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005807 if (PyBytes_Check(repunicode)) {
5808 /* Directly copy bytes result to output. */
5809 Py_ssize_t outsize = PyBytes_Size(*res);
5810 Py_ssize_t requiredsize;
5811 repsize = PyBytes_Size(repunicode);
5812 requiredsize = *respos + repsize;
5813 if (requiredsize > outsize)
5814 /* Make room for all additional bytes. */
5815 if (charmapencode_resize(res, respos, requiredsize)) {
5816 Py_DECREF(repunicode);
5817 return -1;
5818 }
5819 memcpy(PyBytes_AsString(*res) + *respos,
5820 PyBytes_AsString(repunicode), repsize);
5821 *respos += repsize;
5822 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005823 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005824 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005826 /* generate replacement */
5827 repsize = PyUnicode_GET_SIZE(repunicode);
5828 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 x = charmapencode_output(*uni2, mapping, res, respos);
5830 if (x==enc_EXCEPTION) {
5831 return -1;
5832 }
5833 else if (x==enc_FAILED) {
5834 Py_DECREF(repunicode);
5835 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5836 return -1;
5837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005838 }
5839 *inpos = newpos;
5840 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005841 }
5842 return 0;
5843}
5844
Alexander Belopolsky40018472011-02-26 01:02:56 +00005845PyObject *
5846PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5847 Py_ssize_t size,
5848 PyObject *mapping,
5849 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 /* output object */
5852 PyObject *res = NULL;
5853 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005854 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005856 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 PyObject *errorHandler = NULL;
5858 PyObject *exc = NULL;
5859 /* the following variable is used for caching string comparisons
5860 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5861 * 3=ignore, 4=xmlcharrefreplace */
5862 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863
5864 /* Default to Latin-1 */
5865 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 /* allocate enough for a simple encoding without
5869 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005870 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 if (res == NULL)
5872 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005873 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 /* try to encode it */
5878 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5879 if (x==enc_EXCEPTION) /* error */
5880 goto onError;
5881 if (x==enc_FAILED) { /* unencodable character */
5882 if (charmap_encoding_error(p, size, &inpos, mapping,
5883 &exc,
5884 &known_errorHandler, &errorHandler, errors,
5885 &res, &respos)) {
5886 goto onError;
5887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005888 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 else
5890 /* done with this character => adjust input position */
5891 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005895 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005896 if (_PyBytes_Resize(&res, respos) < 0)
5897 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005898
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899 Py_XDECREF(exc);
5900 Py_XDECREF(errorHandler);
5901 return res;
5902
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 Py_XDECREF(res);
5905 Py_XDECREF(exc);
5906 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 return NULL;
5908}
5909
Alexander Belopolsky40018472011-02-26 01:02:56 +00005910PyObject *
5911PyUnicode_AsCharmapString(PyObject *unicode,
5912 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913{
5914 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 PyErr_BadArgument();
5916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 }
5918 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 PyUnicode_GET_SIZE(unicode),
5920 mapping,
5921 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922}
5923
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005925static void
5926make_translate_exception(PyObject **exceptionObject,
5927 const Py_UNICODE *unicode, Py_ssize_t size,
5928 Py_ssize_t startpos, Py_ssize_t endpos,
5929 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005932 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 }
5935 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5937 goto onError;
5938 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5939 goto onError;
5940 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5941 goto onError;
5942 return;
5943 onError:
5944 Py_DECREF(*exceptionObject);
5945 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 }
5947}
5948
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005950static void
5951raise_translate_exception(PyObject **exceptionObject,
5952 const Py_UNICODE *unicode, Py_ssize_t size,
5953 Py_ssize_t startpos, Py_ssize_t endpos,
5954 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005955{
5956 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960}
5961
5962/* error handling callback helper:
5963 build arguments, call the callback and check the arguments,
5964 put the result into newpos and return the replacement string, which
5965 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005966static PyObject *
5967unicode_translate_call_errorhandler(const char *errors,
5968 PyObject **errorHandler,
5969 const char *reason,
5970 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5971 Py_ssize_t startpos, Py_ssize_t endpos,
5972 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005973{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005974 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005976 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005977 PyObject *restuple;
5978 PyObject *resunicode;
5979
5980 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005982 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984 }
5985
5986 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005988 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005990
5991 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005993 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005995 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005996 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 Py_DECREF(restuple);
5998 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005999 }
6000 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 &resunicode, &i_newpos)) {
6002 Py_DECREF(restuple);
6003 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006005 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006007 else
6008 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006009 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6011 Py_DECREF(restuple);
6012 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006013 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 Py_INCREF(resunicode);
6015 Py_DECREF(restuple);
6016 return resunicode;
6017}
6018
6019/* Lookup the character ch in the mapping and put the result in result,
6020 which must be decrefed by the caller.
6021 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006022static int
6023charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024{
Christian Heimes217cfd12007-12-02 14:31:20 +00006025 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 PyObject *x;
6027
6028 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006030 x = PyObject_GetItem(mapping, w);
6031 Py_DECREF(w);
6032 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6034 /* No mapping found means: use 1:1 mapping. */
6035 PyErr_Clear();
6036 *result = NULL;
6037 return 0;
6038 } else
6039 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040 }
6041 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 *result = x;
6043 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006045 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 long value = PyLong_AS_LONG(x);
6047 long max = PyUnicode_GetMax();
6048 if (value < 0 || value > max) {
6049 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006050 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 Py_DECREF(x);
6052 return -1;
6053 }
6054 *result = x;
6055 return 0;
6056 }
6057 else if (PyUnicode_Check(x)) {
6058 *result = x;
6059 return 0;
6060 }
6061 else {
6062 /* wrong return value */
6063 PyErr_SetString(PyExc_TypeError,
6064 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006065 Py_DECREF(x);
6066 return -1;
6067 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068}
6069/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 if not reallocate and adjust various state variables.
6071 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006072static int
6073charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006076 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006077 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 /* remember old output position */
6079 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6080 /* exponentially overallocate to minimize reallocations */
6081 if (requiredsize < 2 * oldsize)
6082 requiredsize = 2 * oldsize;
6083 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6084 return -1;
6085 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 }
6087 return 0;
6088}
6089/* lookup the character, put the result in the output string and adjust
6090 various state variables. Return a new reference to the object that
6091 was put in the output buffer in *result, or Py_None, if the mapping was
6092 undefined (in which case no character was written).
6093 The called must decref result.
6094 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006095static int
6096charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6097 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6098 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099{
Walter Dörwald4894c302003-10-24 14:25:28 +00006100 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 /* not found => default to 1:1 mapping */
6104 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006105 }
6106 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006108 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 /* no overflow check, because we know that the space is enough */
6110 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 }
6112 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6114 if (repsize==1) {
6115 /* no overflow check, because we know that the space is enough */
6116 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6117 }
6118 else if (repsize!=0) {
6119 /* more than one character */
6120 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6121 (insize - (curinp-startinp)) +
6122 repsize - 1;
6123 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6124 return -1;
6125 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6126 *outp += repsize;
6127 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 }
6129 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 return 0;
6132}
6133
Alexander Belopolsky40018472011-02-26 01:02:56 +00006134PyObject *
6135PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6136 Py_ssize_t size,
6137 PyObject *mapping,
6138 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 /* output object */
6141 PyObject *res = NULL;
6142 /* pointers to the beginning and end+1 of input */
6143 const Py_UNICODE *startp = p;
6144 const Py_UNICODE *endp = p + size;
6145 /* pointer into the output */
6146 Py_UNICODE *str;
6147 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006148 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149 char *reason = "character maps to <undefined>";
6150 PyObject *errorHandler = NULL;
6151 PyObject *exc = NULL;
6152 /* the following variable is used for caching string comparisons
6153 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6154 * 3=ignore, 4=xmlcharrefreplace */
6155 int known_errorHandler = -1;
6156
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 PyErr_BadArgument();
6159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161
6162 /* allocate enough for a simple 1:1 translation without
6163 replacements, if we need more, we'll resize */
6164 res = PyUnicode_FromUnicode(NULL, size);
6165 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006169 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 /* try to encode it */
6173 PyObject *x = NULL;
6174 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6175 Py_XDECREF(x);
6176 goto onError;
6177 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006178 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 if (x!=Py_None) /* it worked => adjust input pointer */
6180 ++p;
6181 else { /* untranslatable character */
6182 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6183 Py_ssize_t repsize;
6184 Py_ssize_t newpos;
6185 Py_UNICODE *uni2;
6186 /* startpos for collecting untranslatable chars */
6187 const Py_UNICODE *collstart = p;
6188 const Py_UNICODE *collend = p+1;
6189 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* find all untranslatable characters */
6192 while (collend < endp) {
6193 if (charmaptranslate_lookup(*collend, mapping, &x))
6194 goto onError;
6195 Py_XDECREF(x);
6196 if (x!=Py_None)
6197 break;
6198 ++collend;
6199 }
6200 /* cache callback name lookup
6201 * (if not done yet, i.e. it's the first error) */
6202 if (known_errorHandler==-1) {
6203 if ((errors==NULL) || (!strcmp(errors, "strict")))
6204 known_errorHandler = 1;
6205 else if (!strcmp(errors, "replace"))
6206 known_errorHandler = 2;
6207 else if (!strcmp(errors, "ignore"))
6208 known_errorHandler = 3;
6209 else if (!strcmp(errors, "xmlcharrefreplace"))
6210 known_errorHandler = 4;
6211 else
6212 known_errorHandler = 0;
6213 }
6214 switch (known_errorHandler) {
6215 case 1: /* strict */
6216 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006217 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 case 2: /* replace */
6219 /* No need to check for space, this is a 1:1 replacement */
6220 for (coll = collstart; coll<collend; ++coll)
6221 *str++ = '?';
6222 /* fall through */
6223 case 3: /* ignore */
6224 p = collend;
6225 break;
6226 case 4: /* xmlcharrefreplace */
6227 /* generate replacement (temporarily (mis)uses p) */
6228 for (p = collstart; p < collend; ++p) {
6229 char buffer[2+29+1+1];
6230 char *cp;
6231 sprintf(buffer, "&#%d;", (int)*p);
6232 if (charmaptranslate_makespace(&res, &str,
6233 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6234 goto onError;
6235 for (cp = buffer; *cp; ++cp)
6236 *str++ = *cp;
6237 }
6238 p = collend;
6239 break;
6240 default:
6241 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6242 reason, startp, size, &exc,
6243 collstart-startp, collend-startp, &newpos);
6244 if (repunicode == NULL)
6245 goto onError;
6246 /* generate replacement */
6247 repsize = PyUnicode_GET_SIZE(repunicode);
6248 if (charmaptranslate_makespace(&res, &str,
6249 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6250 Py_DECREF(repunicode);
6251 goto onError;
6252 }
6253 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6254 *str++ = *uni2;
6255 p = startp + newpos;
6256 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006257 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006258 }
6259 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006260 /* Resize if we allocated to much */
6261 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006262 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 if (PyUnicode_Resize(&res, respos) < 0)
6264 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006265 }
6266 Py_XDECREF(exc);
6267 Py_XDECREF(errorHandler);
6268 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271 Py_XDECREF(res);
6272 Py_XDECREF(exc);
6273 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 return NULL;
6275}
6276
Alexander Belopolsky40018472011-02-26 01:02:56 +00006277PyObject *
6278PyUnicode_Translate(PyObject *str,
6279 PyObject *mapping,
6280 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281{
6282 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006283
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 str = PyUnicode_FromObject(str);
6285 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 PyUnicode_GET_SIZE(str),
6289 mapping,
6290 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 Py_DECREF(str);
6292 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006293
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 Py_XDECREF(str);
6296 return NULL;
6297}
Tim Petersced69f82003-09-16 20:30:58 +00006298
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006299PyObject *
6300PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6301 Py_ssize_t length)
6302{
6303 PyObject *result;
6304 Py_UNICODE *p; /* write pointer into result */
6305 Py_ssize_t i;
6306 /* Copy to a new string */
6307 result = (PyObject *)_PyUnicode_New(length);
6308 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6309 if (result == NULL)
6310 return result;
6311 p = PyUnicode_AS_UNICODE(result);
6312 /* Iterate over code points */
6313 for (i = 0; i < length; i++) {
6314 Py_UNICODE ch =s[i];
6315 if (ch > 127) {
6316 int decimal = Py_UNICODE_TODECIMAL(ch);
6317 if (decimal >= 0)
6318 p[i] = '0' + decimal;
6319 }
6320 }
6321 return result;
6322}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006323/* --- Decimal Encoder ---------------------------------------------------- */
6324
Alexander Belopolsky40018472011-02-26 01:02:56 +00006325int
6326PyUnicode_EncodeDecimal(Py_UNICODE *s,
6327 Py_ssize_t length,
6328 char *output,
6329 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006330{
6331 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006332 PyObject *errorHandler = NULL;
6333 PyObject *exc = NULL;
6334 const char *encoding = "decimal";
6335 const char *reason = "invalid decimal Unicode string";
6336 /* the following variable is used for caching string comparisons
6337 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6338 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006339
6340 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 PyErr_BadArgument();
6342 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006343 }
6344
6345 p = s;
6346 end = s + length;
6347 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 register Py_UNICODE ch = *p;
6349 int decimal;
6350 PyObject *repunicode;
6351 Py_ssize_t repsize;
6352 Py_ssize_t newpos;
6353 Py_UNICODE *uni2;
6354 Py_UNICODE *collstart;
6355 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006356
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006358 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 ++p;
6360 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006361 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 decimal = Py_UNICODE_TODECIMAL(ch);
6363 if (decimal >= 0) {
6364 *output++ = '0' + decimal;
6365 ++p;
6366 continue;
6367 }
6368 if (0 < ch && ch < 256) {
6369 *output++ = (char)ch;
6370 ++p;
6371 continue;
6372 }
6373 /* All other characters are considered unencodable */
6374 collstart = p;
6375 collend = p+1;
6376 while (collend < end) {
6377 if ((0 < *collend && *collend < 256) ||
6378 !Py_UNICODE_ISSPACE(*collend) ||
6379 Py_UNICODE_TODECIMAL(*collend))
6380 break;
6381 }
6382 /* cache callback name lookup
6383 * (if not done yet, i.e. it's the first error) */
6384 if (known_errorHandler==-1) {
6385 if ((errors==NULL) || (!strcmp(errors, "strict")))
6386 known_errorHandler = 1;
6387 else if (!strcmp(errors, "replace"))
6388 known_errorHandler = 2;
6389 else if (!strcmp(errors, "ignore"))
6390 known_errorHandler = 3;
6391 else if (!strcmp(errors, "xmlcharrefreplace"))
6392 known_errorHandler = 4;
6393 else
6394 known_errorHandler = 0;
6395 }
6396 switch (known_errorHandler) {
6397 case 1: /* strict */
6398 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6399 goto onError;
6400 case 2: /* replace */
6401 for (p = collstart; p < collend; ++p)
6402 *output++ = '?';
6403 /* fall through */
6404 case 3: /* ignore */
6405 p = collend;
6406 break;
6407 case 4: /* xmlcharrefreplace */
6408 /* generate replacement (temporarily (mis)uses p) */
6409 for (p = collstart; p < collend; ++p)
6410 output += sprintf(output, "&#%d;", (int)*p);
6411 p = collend;
6412 break;
6413 default:
6414 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6415 encoding, reason, s, length, &exc,
6416 collstart-s, collend-s, &newpos);
6417 if (repunicode == NULL)
6418 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006419 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006420 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006421 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6422 Py_DECREF(repunicode);
6423 goto onError;
6424 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 /* generate replacement */
6426 repsize = PyUnicode_GET_SIZE(repunicode);
6427 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6428 Py_UNICODE ch = *uni2;
6429 if (Py_UNICODE_ISSPACE(ch))
6430 *output++ = ' ';
6431 else {
6432 decimal = Py_UNICODE_TODECIMAL(ch);
6433 if (decimal >= 0)
6434 *output++ = '0' + decimal;
6435 else if (0 < ch && ch < 256)
6436 *output++ = (char)ch;
6437 else {
6438 Py_DECREF(repunicode);
6439 raise_encode_exception(&exc, encoding,
6440 s, length, collstart-s, collend-s, reason);
6441 goto onError;
6442 }
6443 }
6444 }
6445 p = s + newpos;
6446 Py_DECREF(repunicode);
6447 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006448 }
6449 /* 0-terminate the output string */
6450 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 Py_XDECREF(exc);
6452 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006453 return 0;
6454
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456 Py_XDECREF(exc);
6457 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006458 return -1;
6459}
6460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461/* --- Helpers ------------------------------------------------------------ */
6462
Eric Smith8c663262007-08-25 02:26:07 +00006463#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006464#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006465
Thomas Wouters477c8d52006-05-27 19:21:47 +00006466#include "stringlib/count.h"
6467#include "stringlib/find.h"
6468#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006469#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006470
Eric Smith5807c412008-05-11 21:00:57 +00006471#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006472#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006473#include "stringlib/localeutil.h"
6474
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` has `len` added
   to it and is then clamped to 0.  `start` and `end` must be modifiable
   lvalues and every argument may be evaluated more than once, so pass
   expressions without side effects.  The expansion is a sequence of `if`
   statements rather than a single statement: do not use it as the
   unbraced body of another `if`/`else`. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006489
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490Py_ssize_t
6491PyUnicode_Count(PyObject *str,
6492 PyObject *substr,
6493 Py_ssize_t start,
6494 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006496 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006497 PyUnicodeObject* str_obj;
6498 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006499
Thomas Wouters477c8d52006-05-27 19:21:47 +00006500 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6501 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006503 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6504 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 Py_DECREF(str_obj);
6506 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 }
Tim Petersced69f82003-09-16 20:30:58 +00006508
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006509 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006510 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006511 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6512 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006513 );
6514
6515 Py_DECREF(sub_obj);
6516 Py_DECREF(str_obj);
6517
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 return result;
6519}
6520
Alexander Belopolsky40018472011-02-26 01:02:56 +00006521Py_ssize_t
6522PyUnicode_Find(PyObject *str,
6523 PyObject *sub,
6524 Py_ssize_t start,
6525 Py_ssize_t end,
6526 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006528 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006529
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006531 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006533 sub = PyUnicode_FromObject(sub);
6534 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 Py_DECREF(str);
6536 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 }
Tim Petersced69f82003-09-16 20:30:58 +00006538
Thomas Wouters477c8d52006-05-27 19:21:47 +00006539 if (direction > 0)
6540 result = stringlib_find_slice(
6541 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6542 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6543 start, end
6544 );
6545 else
6546 result = stringlib_rfind_slice(
6547 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6548 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6549 start, end
6550 );
6551
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006553 Py_DECREF(sub);
6554
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 return result;
6556}
6557
Alexander Belopolsky40018472011-02-26 01:02:56 +00006558static int
6559tailmatch(PyUnicodeObject *self,
6560 PyUnicodeObject *substring,
6561 Py_ssize_t start,
6562 Py_ssize_t end,
6563 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 if (substring->length == 0)
6566 return 1;
6567
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006568 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 end -= substring->length;
6570 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
6573 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 if (Py_UNICODE_MATCH(self, end, substring))
6575 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 } else {
6577 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 }
6580
6581 return 0;
6582}
6583
Alexander Belopolsky40018472011-02-26 01:02:56 +00006584Py_ssize_t
6585PyUnicode_Tailmatch(PyObject *str,
6586 PyObject *substr,
6587 Py_ssize_t start,
6588 Py_ssize_t end,
6589 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006591 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006592
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 str = PyUnicode_FromObject(str);
6594 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 substr = PyUnicode_FromObject(substr);
6597 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 Py_DECREF(str);
6599 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 }
Tim Petersced69f82003-09-16 20:30:58 +00006601
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 (PyUnicodeObject *)substr,
6604 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 Py_DECREF(str);
6606 Py_DECREF(substr);
6607 return result;
6608}
6609
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610/* Apply fixfct filter to the Unicode object self and return a
6611 reference to the modified object */
6612
Alexander Belopolsky40018472011-02-26 01:02:56 +00006613static PyObject *
6614fixup(PyUnicodeObject *self,
6615 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616{
6617
6618 PyUnicodeObject *u;
6619
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006620 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006623
6624 Py_UNICODE_COPY(u->str, self->str, self->length);
6625
Tim Peters7a29bd52001-09-12 03:03:31 +00006626 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 /* fixfct should return TRUE if it modified the buffer. If
6628 FALSE, return a reference to the original buffer instead
6629 (to save space, not time) */
6630 Py_INCREF(self);
6631 Py_DECREF(u);
6632 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 }
6634 return (PyObject*) u;
6635}
6636
Alexander Belopolsky40018472011-02-26 01:02:56 +00006637static int
6638fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006640 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 Py_UNICODE *s = self->str;
6642 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006643
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006646
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 ch = Py_UNICODE_TOUPPER(*s);
6648 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 *s = ch;
6651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 s++;
6653 }
6654
6655 return status;
6656}
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658static int
6659fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006661 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 Py_UNICODE *s = self->str;
6663 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006664
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006667
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 ch = Py_UNICODE_TOLOWER(*s);
6669 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 *s = ch;
6672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 s++;
6674 }
6675
6676 return status;
6677}
6678
Alexander Belopolsky40018472011-02-26 01:02:56 +00006679static int
6680fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006682 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 Py_UNICODE *s = self->str;
6684 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006685
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 while (len-- > 0) {
6687 if (Py_UNICODE_ISUPPER(*s)) {
6688 *s = Py_UNICODE_TOLOWER(*s);
6689 status = 1;
6690 } else if (Py_UNICODE_ISLOWER(*s)) {
6691 *s = Py_UNICODE_TOUPPER(*s);
6692 status = 1;
6693 }
6694 s++;
6695 }
6696
6697 return status;
6698}
6699
Alexander Belopolsky40018472011-02-26 01:02:56 +00006700static int
6701fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006703 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006704 Py_UNICODE *s = self->str;
6705 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006706
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006707 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006709 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 *s = Py_UNICODE_TOUPPER(*s);
6711 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006713 s++;
6714 while (--len > 0) {
6715 if (Py_UNICODE_ISUPPER(*s)) {
6716 *s = Py_UNICODE_TOLOWER(*s);
6717 status = 1;
6718 }
6719 s++;
6720 }
6721 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722}
6723
Alexander Belopolsky40018472011-02-26 01:02:56 +00006724static int
6725fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726{
6727 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6728 register Py_UNICODE *e;
6729 int previous_is_cased;
6730
6731 /* Shortcut for single character strings */
6732 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6734 if (*p != ch) {
6735 *p = ch;
6736 return 1;
6737 }
6738 else
6739 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 }
Tim Petersced69f82003-09-16 20:30:58 +00006741
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 e = p + PyUnicode_GET_SIZE(self);
6743 previous_is_cased = 0;
6744 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006746
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 if (previous_is_cased)
6748 *p = Py_UNICODE_TOLOWER(ch);
6749 else
6750 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006751
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 if (Py_UNICODE_ISLOWER(ch) ||
6753 Py_UNICODE_ISUPPER(ch) ||
6754 Py_UNICODE_ISTITLE(ch))
6755 previous_is_cased = 1;
6756 else
6757 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 }
6759 return 1;
6760}
6761
Tim Peters8ce9f162004-08-27 01:49:32 +00006762PyObject *
6763PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764{
Skip Montanaro6543b452004-09-16 03:28:13 +00006765 const Py_UNICODE blank = ' ';
6766 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006767 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006768 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006769 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6770 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006771 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6772 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006773 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006774 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775
Tim Peters05eba1f2004-08-27 21:32:02 +00006776 fseq = PySequence_Fast(seq, "");
6777 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006778 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006779 }
6780
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006781 /* NOTE: the following code can't call back into Python code,
6782 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006783 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006784
Tim Peters05eba1f2004-08-27 21:32:02 +00006785 seqlen = PySequence_Fast_GET_SIZE(fseq);
6786 /* If empty sequence, return u"". */
6787 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006788 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6789 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006790 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006791 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006792 /* If singleton sequence with an exact Unicode, return that. */
6793 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 item = items[0];
6795 if (PyUnicode_CheckExact(item)) {
6796 Py_INCREF(item);
6797 res = (PyUnicodeObject *)item;
6798 goto Done;
6799 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006800 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006801 else {
6802 /* Set up sep and seplen */
6803 if (separator == NULL) {
6804 sep = &blank;
6805 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006806 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006807 else {
6808 if (!PyUnicode_Check(separator)) {
6809 PyErr_Format(PyExc_TypeError,
6810 "separator: expected str instance,"
6811 " %.80s found",
6812 Py_TYPE(separator)->tp_name);
6813 goto onError;
6814 }
6815 sep = PyUnicode_AS_UNICODE(separator);
6816 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006817 }
6818 }
6819
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006820 /* There are at least two things to join, or else we have a subclass
6821 * of str in the sequence.
6822 * Do a pre-pass to figure out the total amount of space we'll
6823 * need (sz), and see whether all argument are strings.
6824 */
6825 sz = 0;
6826 for (i = 0; i < seqlen; i++) {
6827 const Py_ssize_t old_sz = sz;
6828 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 if (!PyUnicode_Check(item)) {
6830 PyErr_Format(PyExc_TypeError,
6831 "sequence item %zd: expected str instance,"
6832 " %.80s found",
6833 i, Py_TYPE(item)->tp_name);
6834 goto onError;
6835 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006836 sz += PyUnicode_GET_SIZE(item);
6837 if (i != 0)
6838 sz += seplen;
6839 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6840 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006842 goto onError;
6843 }
6844 }
Tim Petersced69f82003-09-16 20:30:58 +00006845
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006846 res = _PyUnicode_New(sz);
6847 if (res == NULL)
6848 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006849
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006850 /* Catenate everything. */
6851 res_p = PyUnicode_AS_UNICODE(res);
6852 for (i = 0; i < seqlen; ++i) {
6853 Py_ssize_t itemlen;
6854 item = items[i];
6855 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 /* Copy item, and maybe the separator. */
6857 if (i) {
6858 Py_UNICODE_COPY(res_p, sep, seplen);
6859 res_p += seplen;
6860 }
6861 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6862 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006863 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006864
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006866 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 return (PyObject *)res;
6868
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006870 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006871 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 return NULL;
6873}
6874
Alexander Belopolsky40018472011-02-26 01:02:56 +00006875static PyUnicodeObject *
6876pad(PyUnicodeObject *self,
6877 Py_ssize_t left,
6878 Py_ssize_t right,
6879 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880{
6881 PyUnicodeObject *u;
6882
6883 if (left < 0)
6884 left = 0;
6885 if (right < 0)
6886 right = 0;
6887
Tim Peters7a29bd52001-09-12 03:03:31 +00006888 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 Py_INCREF(self);
6890 return self;
6891 }
6892
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006893 if (left > PY_SSIZE_T_MAX - self->length ||
6894 right > PY_SSIZE_T_MAX - (left + self->length)) {
6895 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6896 return NULL;
6897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 u = _PyUnicode_New(left + self->length + right);
6899 if (u) {
6900 if (left)
6901 Py_UNICODE_FILL(u->str, fill, left);
6902 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6903 if (right)
6904 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6905 }
6906
6907 return u;
6908}
6909
Alexander Belopolsky40018472011-02-26 01:02:56 +00006910PyObject *
6911PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914
6915 string = PyUnicode_FromObject(string);
6916 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006919 list = stringlib_splitlines(
6920 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6921 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922
6923 Py_DECREF(string);
6924 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925}
6926
Alexander Belopolsky40018472011-02-26 01:02:56 +00006927static PyObject *
6928split(PyUnicodeObject *self,
6929 PyUnicodeObject *substring,
6930 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006933 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006936 return stringlib_split_whitespace(
6937 (PyObject*) self, self->str, self->length, maxcount
6938 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006940 return stringlib_split(
6941 (PyObject*) self, self->str, self->length,
6942 substring->str, substring->length,
6943 maxcount
6944 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945}
6946
Alexander Belopolsky40018472011-02-26 01:02:56 +00006947static PyObject *
6948rsplit(PyUnicodeObject *self,
6949 PyUnicodeObject *substring,
6950 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006951{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006952 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006953 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006954
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006955 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006956 return stringlib_rsplit_whitespace(
6957 (PyObject*) self, self->str, self->length, maxcount
6958 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006959
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006960 return stringlib_rsplit(
6961 (PyObject*) self, self->str, self->length,
6962 substring->str, substring->length,
6963 maxcount
6964 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006965}
6966
Alexander Belopolsky40018472011-02-26 01:02:56 +00006967static PyObject *
6968replace(PyUnicodeObject *self,
6969 PyUnicodeObject *str1,
6970 PyUnicodeObject *str2,
6971 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972{
6973 PyUnicodeObject *u;
6974
6975 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006977 else if (maxcount == 0 || self->length == 0)
6978 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979
Thomas Wouters477c8d52006-05-27 19:21:47 +00006980 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006981 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006982 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006983 if (str1->length == 0)
6984 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006985 if (str1->length == 1) {
6986 /* replace characters */
6987 Py_UNICODE u1, u2;
6988 if (!findchar(self->str, self->length, str1->str[0]))
6989 goto nothing;
6990 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6991 if (!u)
6992 return NULL;
6993 Py_UNICODE_COPY(u->str, self->str, self->length);
6994 u1 = str1->str[0];
6995 u2 = str2->str[0];
6996 for (i = 0; i < u->length; i++)
6997 if (u->str[i] == u1) {
6998 if (--maxcount < 0)
6999 break;
7000 u->str[i] = u2;
7001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007003 i = stringlib_find(
7004 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007006 if (i < 0)
7007 goto nothing;
7008 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7009 if (!u)
7010 return NULL;
7011 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007012
7013 /* change everything in-place, starting with this one */
7014 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7015 i += str1->length;
7016
7017 while ( --maxcount > 0) {
7018 i = stringlib_find(self->str+i, self->length-i,
7019 str1->str, str1->length,
7020 i);
7021 if (i == -1)
7022 break;
7023 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7024 i += str1->length;
7025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007028
Brett Cannonb94767f2011-02-22 20:15:44 +00007029 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007030 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 Py_UNICODE *p;
7032
7033 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007034 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7035 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007036 if (n == 0)
7037 goto nothing;
7038 /* new_size = self->length + n * (str2->length - str1->length)); */
7039 delta = (str2->length - str1->length);
7040 if (delta == 0) {
7041 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007043 product = n * (str2->length - str1->length);
7044 if ((product / (str2->length - str1->length)) != n) {
7045 PyErr_SetString(PyExc_OverflowError,
7046 "replace string is too long");
7047 return NULL;
7048 }
7049 new_size = self->length + product;
7050 if (new_size < 0) {
7051 PyErr_SetString(PyExc_OverflowError,
7052 "replace string is too long");
7053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 }
7055 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007056 u = _PyUnicode_New(new_size);
7057 if (!u)
7058 return NULL;
7059 i = 0;
7060 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007061 if (str1->length > 0) {
7062 while (n-- > 0) {
7063 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007064 j = stringlib_find(self->str+i, self->length-i,
7065 str1->str, str1->length,
7066 i);
7067 if (j == -1)
7068 break;
7069 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007070 /* copy unchanged part [i:j] */
7071 Py_UNICODE_COPY(p, self->str+i, j-i);
7072 p += j - i;
7073 }
7074 /* copy substitution string */
7075 if (str2->length > 0) {
7076 Py_UNICODE_COPY(p, str2->str, str2->length);
7077 p += str2->length;
7078 }
7079 i = j + str1->length;
7080 }
7081 if (i < self->length)
7082 /* copy tail [i:] */
7083 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7084 } else {
7085 /* interleave */
7086 while (n > 0) {
7087 Py_UNICODE_COPY(p, str2->str, str2->length);
7088 p += str2->length;
7089 if (--n <= 0)
7090 break;
7091 *p++ = self->str[i++];
7092 }
7093 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007097
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007099 /* nothing to replace; return original string (when possible) */
7100 if (PyUnicode_CheckExact(self)) {
7101 Py_INCREF(self);
7102 return (PyObject *) self;
7103 }
7104 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105}
7106
7107/* --- Unicode Object Methods --------------------------------------------- */
7108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007109PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111\n\
7112Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007113characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114
7115static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007116unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 return fixup(self, fixtitle);
7119}
7120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007121PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123\n\
7124Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007125have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126
7127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007128unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 return fixup(self, fixcapitalize);
7131}
7132
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out): split self on whitespace, capitalize each
   word through fixup()/fixcapitalize and join the words with the default
   separator.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capped == NULL)
            goto finish;
        /* Release the old word before storing its replacement. */
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  finish:
    Py_DECREF(words);
    return joined;
}
#endif
7170
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007171/* Argument converter. Coerces to a single unicode character */
7172
7173static int
7174convert_uc(PyObject *obj, void *addr)
7175{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007176 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7177 PyObject *uniobj;
7178 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007179
Benjamin Peterson14339b62009-01-31 16:36:08 +00007180 uniobj = PyUnicode_FromObject(obj);
7181 if (uniobj == NULL) {
7182 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007183 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007184 return 0;
7185 }
7186 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7187 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007189 Py_DECREF(uniobj);
7190 return 0;
7191 }
7192 unistr = PyUnicode_AS_UNICODE(uniobj);
7193 *fillcharloc = unistr[0];
7194 Py_DECREF(uniobj);
7195 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007196}
7197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007198PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007201Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007202done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203
7204static PyObject *
7205unicode_center(PyUnicodeObject *self, PyObject *args)
7206{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007207 Py_ssize_t marg, left;
7208 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007209 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210
Thomas Woutersde017742006-02-16 19:34:37 +00007211 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 return NULL;
7213
Tim Peters7a29bd52001-09-12 03:03:31 +00007214 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 Py_INCREF(self);
7216 return (PyObject*) self;
7217 }
7218
7219 marg = width - self->length;
7220 left = marg / 2 + (marg & width & 1);
7221
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007222 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223}
7224
Marc-André Lemburge5034372000-08-08 08:04:29 +00007225#if 0
7226
7227/* This code should go into some future Unicode collation support
7228 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007229 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007230
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007231/* speedy UTF-16 code point order comparison */
7232/* gleaned from: */
7233/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7234
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007235static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007236{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007237 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007238 0, 0, 0, 0, 0, 0, 0, 0,
7239 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007240 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007241};
7242
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243static int
7244unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7245{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007246 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007247
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 Py_UNICODE *s1 = str1->str;
7249 Py_UNICODE *s2 = str2->str;
7250
7251 len1 = str1->length;
7252 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007253
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007255 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007256
7257 c1 = *s1++;
7258 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007259
Benjamin Peterson29060642009-01-31 22:14:21 +00007260 if (c1 > (1<<11) * 26)
7261 c1 += utf16Fixup[c1>>11];
7262 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007263 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007264 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007265
7266 if (c1 != c2)
7267 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007268
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007269 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 }
7271
7272 return (len1 < len2) ? -1 : (len1 != len2);
7273}
7274
Marc-André Lemburge5034372000-08-08 08:04:29 +00007275#else
7276
7277static int
7278unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7279{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007280 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007281
7282 Py_UNICODE *s1 = str1->str;
7283 Py_UNICODE *s2 = str2->str;
7284
7285 len1 = str1->length;
7286 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007287
Marc-André Lemburge5034372000-08-08 08:04:29 +00007288 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007289 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007290
Fredrik Lundh45714e92001-06-26 16:39:36 +00007291 c1 = *s1++;
7292 c2 = *s2++;
7293
7294 if (c1 != c2)
7295 return (c1 < c2) ? -1 : 1;
7296
Marc-André Lemburge5034372000-08-08 08:04:29 +00007297 len1--; len2--;
7298 }
7299
7300 return (len1 < len2) ? -1 : (len1 != len2);
7301}
7302
7303#endif
7304
Alexander Belopolsky40018472011-02-26 01:02:56 +00007305int
7306PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007307{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007308 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7309 return unicode_compare((PyUnicodeObject *)left,
7310 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007311 PyErr_Format(PyExc_TypeError,
7312 "Can't compare %.100s and %.100s",
7313 left->ob_type->tp_name,
7314 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315 return -1;
7316}
7317
Martin v. Löwis5b222132007-06-10 09:51:05 +00007318int
7319PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7320{
7321 int i;
7322 Py_UNICODE *id;
7323 assert(PyUnicode_Check(uni));
7324 id = PyUnicode_AS_UNICODE(uni);
7325 /* Compare Unicode string and source character set string */
7326 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 if (id[i] != str[i])
7328 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007329 /* This check keeps Python strings that end in '\0' from comparing equal
7330 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007331 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007333 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007335 return 0;
7336}
7337
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007338
Benjamin Peterson29060642009-01-31 22:14:21 +00007339#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007340 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007341
Alexander Belopolsky40018472011-02-26 01:02:56 +00007342PyObject *
7343PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007344{
7345 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007346
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007347 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7348 PyObject *v;
7349 if (((PyUnicodeObject *) left)->length !=
7350 ((PyUnicodeObject *) right)->length) {
7351 if (op == Py_EQ) {
7352 Py_INCREF(Py_False);
7353 return Py_False;
7354 }
7355 if (op == Py_NE) {
7356 Py_INCREF(Py_True);
7357 return Py_True;
7358 }
7359 }
7360 if (left == right)
7361 result = 0;
7362 else
7363 result = unicode_compare((PyUnicodeObject *)left,
7364 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007365
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007366 /* Convert the return value to a Boolean */
7367 switch (op) {
7368 case Py_EQ:
7369 v = TEST_COND(result == 0);
7370 break;
7371 case Py_NE:
7372 v = TEST_COND(result != 0);
7373 break;
7374 case Py_LE:
7375 v = TEST_COND(result <= 0);
7376 break;
7377 case Py_GE:
7378 v = TEST_COND(result >= 0);
7379 break;
7380 case Py_LT:
7381 v = TEST_COND(result == -1);
7382 break;
7383 case Py_GT:
7384 v = TEST_COND(result == 1);
7385 break;
7386 default:
7387 PyErr_BadArgument();
7388 return NULL;
7389 }
7390 Py_INCREF(v);
7391 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007393
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007394 Py_INCREF(Py_NotImplemented);
7395 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007396}
7397
Alexander Belopolsky40018472011-02-26 01:02:56 +00007398int
7399PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007400{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007401 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007402 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007403
7404 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007405 sub = PyUnicode_FromObject(element);
7406 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 PyErr_Format(PyExc_TypeError,
7408 "'in <string>' requires string as left operand, not %s",
7409 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007410 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007411 }
7412
Thomas Wouters477c8d52006-05-27 19:21:47 +00007413 str = PyUnicode_FromObject(container);
7414 if (!str) {
7415 Py_DECREF(sub);
7416 return -1;
7417 }
7418
7419 result = stringlib_contains_obj(str, sub);
7420
7421 Py_DECREF(str);
7422 Py_DECREF(sub);
7423
Guido van Rossum403d68b2000-03-13 15:55:09 +00007424 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007425}
7426
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427/* Concat to string or Unicode object giving a new Unicode object. */
7428
Alexander Belopolsky40018472011-02-26 01:02:56 +00007429PyObject *
7430PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431{
7432 PyUnicodeObject *u = NULL, *v = NULL, *w;
7433
7434 /* Coerce the two arguments */
7435 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7436 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7439 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441
7442 /* Shortcuts */
7443 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 Py_DECREF(v);
7445 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 }
7447 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 Py_DECREF(u);
7449 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 }
7451
7452 /* Concat the two Unicode strings */
7453 w = _PyUnicode_New(u->length + v->length);
7454 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 Py_UNICODE_COPY(w->str, u->str, u->length);
7457 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7458
7459 Py_DECREF(u);
7460 Py_DECREF(v);
7461 return (PyObject *)w;
7462
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 Py_XDECREF(u);
7465 Py_XDECREF(v);
7466 return NULL;
7467}
7468
Walter Dörwald1ab83302007-05-18 17:15:44 +00007469void
7470PyUnicode_Append(PyObject **pleft, PyObject *right)
7471{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007472 PyObject *new;
7473 if (*pleft == NULL)
7474 return;
7475 if (right == NULL || !PyUnicode_Check(*pleft)) {
7476 Py_DECREF(*pleft);
7477 *pleft = NULL;
7478 return;
7479 }
7480 new = PyUnicode_Concat(*pleft, right);
7481 Py_DECREF(*pleft);
7482 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007483}
7484
7485void
7486PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7487{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007488 PyUnicode_Append(pleft, right);
7489 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007490}
7491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007495Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007496string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007497interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498
7499static PyObject *
7500unicode_count(PyUnicodeObject *self, PyObject *args)
7501{
7502 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007503 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007504 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 PyObject *result;
7506
Guido van Rossumb8872e62000-05-09 14:14:27 +00007507 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 return NULL;
7510
7511 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007512 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007515
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007516 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007517 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007518 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007519 substring->str, substring->length,
7520 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007521 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522
7523 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007524
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525 return result;
7526}
7527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007528PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007529 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007531Encode S using the codec registered for encoding. Default encoding\n\
7532is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007533handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7535'xmlcharrefreplace' as well as any other name registered with\n\
7536codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537
7538static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007539unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007541 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542 char *encoding = NULL;
7543 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007544
Benjamin Peterson308d6372009-09-18 21:42:35 +00007545 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7546 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007548 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007549}
7550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007551PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553\n\
7554Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007555If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556
7557static PyObject*
7558unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7559{
7560 Py_UNICODE *e;
7561 Py_UNICODE *p;
7562 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007563 Py_UNICODE *qe;
7564 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 PyUnicodeObject *u;
7566 int tabsize = 8;
7567
7568 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007569 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570
Thomas Wouters7e474022000-07-16 12:04:32 +00007571 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007572 i = 0; /* chars up to and including most recent \n or \r */
7573 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7574 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575 for (p = self->str; p < e; p++)
7576 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 if (tabsize > 0) {
7578 incr = tabsize - (j % tabsize); /* cannot overflow */
7579 if (j > PY_SSIZE_T_MAX - incr)
7580 goto overflow1;
7581 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007582 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 if (j > PY_SSIZE_T_MAX - 1)
7586 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587 j++;
7588 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 if (i > PY_SSIZE_T_MAX - j)
7590 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007592 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 }
7594 }
7595
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007596 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007598
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 /* Second pass: create output string and fill it */
7600 u = _PyUnicode_New(i + j);
7601 if (!u)
7602 return NULL;
7603
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007604 j = 0; /* same as in first pass */
7605 q = u->str; /* next output char */
7606 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607
7608 for (p = self->str; p < e; p++)
7609 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 if (tabsize > 0) {
7611 i = tabsize - (j % tabsize);
7612 j += i;
7613 while (i--) {
7614 if (q >= qe)
7615 goto overflow2;
7616 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007617 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007619 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 else {
7621 if (q >= qe)
7622 goto overflow2;
7623 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007624 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625 if (*p == '\n' || *p == '\r')
7626 j = 0;
7627 }
7628
7629 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007630
7631 overflow2:
7632 Py_DECREF(u);
7633 overflow1:
7634 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7635 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636}
7637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007638PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640\n\
7641Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007642such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643arguments start and end are interpreted as in slice notation.\n\
7644\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007645Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646
7647static PyObject *
7648unicode_find(PyUnicodeObject *self, PyObject *args)
7649{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007650 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007651 Py_ssize_t start;
7652 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007653 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654
Christian Heimes9cd17752007-11-18 19:35:23 +00007655 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
Thomas Wouters477c8d52006-05-27 19:21:47 +00007658 result = stringlib_find_slice(
7659 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7660 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7661 start, end
7662 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663
7664 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007665
Christian Heimes217cfd12007-12-02 14:31:20 +00007666 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667}
7668
7669static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007670unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671{
7672 if (index < 0 || index >= self->length) {
7673 PyErr_SetString(PyExc_IndexError, "string index out of range");
7674 return NULL;
7675 }
7676
7677 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7678}
7679
Guido van Rossumc2504932007-09-18 19:42:40 +00007680/* Believe it or not, this produces the same value for ASCII strings
7681 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007682static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007683unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684{
Guido van Rossumc2504932007-09-18 19:42:40 +00007685 Py_ssize_t len;
7686 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007687 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007688
7689 if (self->hash != -1)
7690 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007691 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007692 p = self->str;
7693 x = *p << 7;
7694 while (--len >= 0)
7695 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007696 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007697 if (x == -1)
7698 x = -2;
7699 self->hash = x;
7700 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701}
7702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007703PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007706Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707
7708static PyObject *
7709unicode_index(PyUnicodeObject *self, PyObject *args)
7710{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007711 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007712 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007713 Py_ssize_t start;
7714 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715
Christian Heimes9cd17752007-11-18 19:35:23 +00007716 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Thomas Wouters477c8d52006-05-27 19:21:47 +00007719 result = stringlib_find_slice(
7720 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7721 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7722 start, end
7723 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724
7725 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007726
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 if (result < 0) {
7728 PyErr_SetString(PyExc_ValueError, "substring not found");
7729 return NULL;
7730 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007731
Christian Heimes217cfd12007-12-02 14:31:20 +00007732 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733}
7734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007735PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007738Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007739at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
7741static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007742unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743{
7744 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7745 register const Py_UNICODE *e;
7746 int cased;
7747
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 /* Shortcut for single character strings */
7749 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007752 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007753 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007755
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 e = p + PyUnicode_GET_SIZE(self);
7757 cased = 0;
7758 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007760
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7762 return PyBool_FromLong(0);
7763 else if (!cased && Py_UNICODE_ISLOWER(ch))
7764 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007766 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767}
7768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007769PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007772Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007773at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774
7775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007776unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777{
7778 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7779 register const Py_UNICODE *e;
7780 int cased;
7781
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 /* Shortcut for single character strings */
7783 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007786 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007787 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007789
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790 e = p + PyUnicode_GET_SIZE(self);
7791 cased = 0;
7792 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007794
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7796 return PyBool_FromLong(0);
7797 else if (!cased && Py_UNICODE_ISUPPER(ch))
7798 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007800 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801}
7802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007803PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007806Return True if S is a titlecased string and there is at least one\n\
7807character in S, i.e. upper- and titlecase characters may only\n\
7808follow uncased characters and lowercase characters only cased ones.\n\
7809Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810
7811static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007812unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813{
7814 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7815 register const Py_UNICODE *e;
7816 int cased, previous_is_cased;
7817
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818 /* Shortcut for single character strings */
7819 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7821 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007823 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007824 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007826
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827 e = p + PyUnicode_GET_SIZE(self);
7828 cased = 0;
7829 previous_is_cased = 0;
7830 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007832
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7834 if (previous_is_cased)
7835 return PyBool_FromLong(0);
7836 previous_is_cased = 1;
7837 cased = 1;
7838 }
7839 else if (Py_UNICODE_ISLOWER(ch)) {
7840 if (!previous_is_cased)
7841 return PyBool_FromLong(0);
7842 previous_is_cased = 1;
7843 cased = 1;
7844 }
7845 else
7846 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007848 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849}
7850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007851PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007854Return True if all characters in S are whitespace\n\
7855and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856
7857static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007858unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859{
7860 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7861 register const Py_UNICODE *e;
7862
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863 /* Shortcut for single character strings */
7864 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 Py_UNICODE_ISSPACE(*p))
7866 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007868 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007869 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007871
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 e = p + PyUnicode_GET_SIZE(self);
7873 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 if (!Py_UNICODE_ISSPACE(*p))
7875 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007877 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878}
7879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007880PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007882\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007883Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007884and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007885
7886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007887unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007888{
7889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7890 register const Py_UNICODE *e;
7891
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007892 /* Shortcut for single character strings */
7893 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 Py_UNICODE_ISALPHA(*p))
7895 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007896
7897 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007898 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007900
7901 e = p + PyUnicode_GET_SIZE(self);
7902 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 if (!Py_UNICODE_ISALPHA(*p))
7904 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007906 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007907}
7908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007909PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007911\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007912Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007913and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007914
7915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007916unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007917{
7918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7919 register const Py_UNICODE *e;
7920
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007921 /* Shortcut for single character strings */
7922 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 Py_UNICODE_ISALNUM(*p))
7924 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007925
7926 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007927 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007929
7930 e = p + PyUnicode_GET_SIZE(self);
7931 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 if (!Py_UNICODE_ISALNUM(*p))
7933 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007935 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007936}
7937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007938PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007941Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007942False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943
7944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007945unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946{
7947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7948 register const Py_UNICODE *e;
7949
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950 /* Shortcut for single character strings */
7951 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 Py_UNICODE_ISDECIMAL(*p))
7953 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007956 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007958
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 e = p + PyUnicode_GET_SIZE(self);
7960 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 if (!Py_UNICODE_ISDECIMAL(*p))
7962 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007964 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965}
7966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007967PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007970Return True if all characters in S are digits\n\
7971and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972
7973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007974unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975{
7976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7977 register const Py_UNICODE *e;
7978
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 /* Shortcut for single character strings */
7980 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 Py_UNICODE_ISDIGIT(*p))
7982 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007984 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007985 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007987
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 e = p + PyUnicode_GET_SIZE(self);
7989 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 if (!Py_UNICODE_ISDIGIT(*p))
7991 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007993 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994}
7995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007996PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007999Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008000False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001
8002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008003unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004{
8005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8006 register const Py_UNICODE *e;
8007
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 /* Shortcut for single character strings */
8009 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008010 Py_UNICODE_ISNUMERIC(*p))
8011 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008013 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008014 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008016
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 e = p + PyUnicode_GET_SIZE(self);
8018 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 if (!Py_UNICODE_ISNUMERIC(*p))
8020 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008022 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023}
8024
Martin v. Löwis47383402007-08-15 07:32:56 +00008025int
8026PyUnicode_IsIdentifier(PyObject *self)
8027{
8028 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8029 register const Py_UNICODE *e;
8030
8031 /* Special case for empty strings */
8032 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008034
8035 /* PEP 3131 says that the first character must be in
8036 XID_Start and subsequent characters in XID_Continue,
8037 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008039 letters, digits, underscore). However, given the current
8040 definition of XID_Start and XID_Continue, it is sufficient
8041 to check just for these, except that _ must be allowed
8042 as starting an identifier. */
8043 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8044 return 0;
8045
8046 e = p + PyUnicode_GET_SIZE(self);
8047 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 if (!_PyUnicode_IsXidContinue(*p))
8049 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008050 }
8051 return 1;
8052}
8053
8054PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008056\n\
8057Return True if S is a valid identifier according\n\
8058to the language definition.");
8059
8060static PyObject*
8061unicode_isidentifier(PyObject *self)
8062{
8063 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8064}
8065
Georg Brandl559e5d72008-06-11 18:37:52 +00008066PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008068\n\
8069Return True if all characters in S are considered\n\
8070printable in repr() or S is empty, False otherwise.");
8071
8072static PyObject*
8073unicode_isprintable(PyObject *self)
8074{
8075 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8076 register const Py_UNICODE *e;
8077
8078 /* Shortcut for single character strings */
8079 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8080 Py_RETURN_TRUE;
8081 }
8082
8083 e = p + PyUnicode_GET_SIZE(self);
8084 for (; p < e; p++) {
8085 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8086 Py_RETURN_FALSE;
8087 }
8088 }
8089 Py_RETURN_TRUE;
8090}
8091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008092PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008093 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094\n\
8095Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008096iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097
8098static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008099unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008101 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102}
8103
Martin v. Löwis18e16552006-02-15 17:27:45 +00008104static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105unicode_length(PyUnicodeObject *self)
8106{
8107 return self->length;
8108}
8109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008110PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008113Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008114done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115
8116static PyObject *
8117unicode_ljust(PyUnicodeObject *self, PyObject *args)
8118{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008119 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008120 Py_UNICODE fillchar = ' ';
8121
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008122 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 return NULL;
8124
Tim Peters7a29bd52001-09-12 03:03:31 +00008125 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 Py_INCREF(self);
8127 return (PyObject*) self;
8128 }
8129
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008130 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131}
8132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008133PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008136Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137
8138static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008139unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141 return fixup(self, fixlower);
8142}
8143
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8152
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008153/* externally visible for str.strip(unicode) */
8154PyObject *
8155_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8156{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008157 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8158 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8159 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8160 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8161 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008162
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008164
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 i = 0;
8166 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8168 i++;
8169 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008171
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 j = len;
8173 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 do {
8175 j--;
8176 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8177 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008179
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 Py_INCREF(self);
8182 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 }
8184 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008186}
8187
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188
8189static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008190do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8193 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008194
Benjamin Peterson14339b62009-01-31 16:36:08 +00008195 i = 0;
8196 if (striptype != RIGHTSTRIP) {
8197 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8198 i++;
8199 }
8200 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008201
Benjamin Peterson14339b62009-01-31 16:36:08 +00008202 j = len;
8203 if (striptype != LEFTSTRIP) {
8204 do {
8205 j--;
8206 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8207 j++;
8208 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008209
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8211 Py_INCREF(self);
8212 return (PyObject*)self;
8213 }
8214 else
8215 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216}
8217
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008218
8219static PyObject *
8220do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8221{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008223
Benjamin Peterson14339b62009-01-31 16:36:08 +00008224 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8225 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008226
Benjamin Peterson14339b62009-01-31 16:36:08 +00008227 if (sep != NULL && sep != Py_None) {
8228 if (PyUnicode_Check(sep))
8229 return _PyUnicode_XStrip(self, striptype, sep);
8230 else {
8231 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 "%s arg must be None or str",
8233 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008234 return NULL;
8235 }
8236 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008237
Benjamin Peterson14339b62009-01-31 16:36:08 +00008238 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008239}
8240
8241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008242PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008244\n\
8245Return a copy of the string S with leading and trailing\n\
8246whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008247If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008248
8249static PyObject *
8250unicode_strip(PyUnicodeObject *self, PyObject *args)
8251{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008252 if (PyTuple_GET_SIZE(args) == 0)
8253 return do_strip(self, BOTHSTRIP); /* Common case */
8254 else
8255 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008256}
8257
8258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008259PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008261\n\
8262Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008263If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008264
8265static PyObject *
8266unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8267{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008268 if (PyTuple_GET_SIZE(args) == 0)
8269 return do_strip(self, LEFTSTRIP); /* Common case */
8270 else
8271 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008272}
8273
8274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008275PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008277\n\
8278Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008279If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008280
8281static PyObject *
8282unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8283{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008284 if (PyTuple_GET_SIZE(args) == 0)
8285 return do_strip(self, RIGHTSTRIP); /* Common case */
8286 else
8287 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008288}
8289
8290
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008292unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293{
8294 PyUnicodeObject *u;
8295 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008296 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008297 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298
Georg Brandl222de0f2009-04-12 12:01:50 +00008299 if (len < 1) {
8300 Py_INCREF(unicode_empty);
8301 return (PyObject *)unicode_empty;
8302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303
Tim Peters7a29bd52001-09-12 03:03:31 +00008304 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 /* no repeat, return original string */
8306 Py_INCREF(str);
8307 return (PyObject*) str;
8308 }
Tim Peters8f422462000-09-09 06:13:41 +00008309
8310 /* ensure # of chars needed doesn't overflow int and # of bytes
8311 * needed doesn't overflow size_t
8312 */
8313 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008314 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008315 PyErr_SetString(PyExc_OverflowError,
8316 "repeated string is too long");
8317 return NULL;
8318 }
8319 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8320 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8321 PyErr_SetString(PyExc_OverflowError,
8322 "repeated string is too long");
8323 return NULL;
8324 }
8325 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 if (!u)
8327 return NULL;
8328
8329 p = u->str;
8330
Georg Brandl222de0f2009-04-12 12:01:50 +00008331 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008332 Py_UNICODE_FILL(p, str->str[0], len);
8333 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008334 Py_ssize_t done = str->length; /* number of characters copied this far */
8335 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008337 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008338 Py_UNICODE_COPY(p+done, p, n);
8339 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 }
8342
8343 return (PyObject*) u;
8344}
8345
Alexander Belopolsky40018472011-02-26 01:02:56 +00008346PyObject *
8347PyUnicode_Replace(PyObject *obj,
8348 PyObject *subobj,
8349 PyObject *replobj,
8350 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351{
8352 PyObject *self;
8353 PyObject *str1;
8354 PyObject *str2;
8355 PyObject *result;
8356
8357 self = PyUnicode_FromObject(obj);
8358 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360 str1 = PyUnicode_FromObject(subobj);
8361 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 Py_DECREF(self);
8363 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 }
8365 str2 = PyUnicode_FromObject(replobj);
8366 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 Py_DECREF(self);
8368 Py_DECREF(str1);
8369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 }
Tim Petersced69f82003-09-16 20:30:58 +00008371 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 (PyUnicodeObject *)str1,
8373 (PyUnicodeObject *)str2,
8374 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 Py_DECREF(self);
8376 Py_DECREF(str1);
8377 Py_DECREF(str2);
8378 return result;
8379}
8380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008381PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008382 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383\n\
8384Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008385old replaced by new. If the optional argument count is\n\
8386given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387
8388static PyObject*
8389unicode_replace(PyUnicodeObject *self, PyObject *args)
8390{
8391 PyUnicodeObject *str1;
8392 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008393 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 PyObject *result;
8395
Martin v. Löwis18e16552006-02-15 17:27:45 +00008396 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 return NULL;
8398 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8399 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008402 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 Py_DECREF(str1);
8404 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406
8407 result = replace(self, str1, str2, maxcount);
8408
8409 Py_DECREF(str1);
8410 Py_DECREF(str2);
8411 return result;
8412}
8413
Alexander Belopolsky40018472011-02-26 01:02:56 +00008414static PyObject *
8415unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008417 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008418 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008419 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8420 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8421
8422 /* XXX(nnorwitz): rather than over-allocating, it would be
8423 better to choose a different scheme. Perhaps scan the
8424 first N-chars of the string and allocate based on that size.
8425 */
8426 /* Initial allocation is based on the longest-possible unichr
8427 escape.
8428
8429 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8430 unichr, so in this case it's the longest unichr escape. In
8431 narrow (UTF-16) builds this is five chars per source unichr
8432 since there are two unichrs in the surrogate pair, so in narrow
8433 (UTF-16) builds it's not the longest unichr escape.
8434
8435 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8436 so in the narrow (UTF-16) build case it's the longest unichr
8437 escape.
8438 */
8439
Walter Dörwald1ab83302007-05-18 17:15:44 +00008440 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008442#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008444#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008446#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008448 if (repr == NULL)
8449 return NULL;
8450
Walter Dörwald1ab83302007-05-18 17:15:44 +00008451 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008452
8453 /* Add quote */
8454 *p++ = (findchar(s, size, '\'') &&
8455 !findchar(s, size, '"')) ? '"' : '\'';
8456 while (size-- > 0) {
8457 Py_UNICODE ch = *s++;
8458
8459 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008460 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008461 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008462 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008463 continue;
8464 }
8465
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008467 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008468 *p++ = '\\';
8469 *p++ = 't';
8470 }
8471 else if (ch == '\n') {
8472 *p++ = '\\';
8473 *p++ = 'n';
8474 }
8475 else if (ch == '\r') {
8476 *p++ = '\\';
8477 *p++ = 'r';
8478 }
8479
8480 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008481 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008482 *p++ = '\\';
8483 *p++ = 'x';
8484 *p++ = hexdigits[(ch >> 4) & 0x000F];
8485 *p++ = hexdigits[ch & 0x000F];
8486 }
8487
Georg Brandl559e5d72008-06-11 18:37:52 +00008488 /* Copy ASCII characters as-is */
8489 else if (ch < 0x7F) {
8490 *p++ = ch;
8491 }
8492
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008494 else {
8495 Py_UCS4 ucs = ch;
8496
8497#ifndef Py_UNICODE_WIDE
8498 Py_UNICODE ch2 = 0;
8499 /* Get code point from surrogate pair */
8500 if (size > 0) {
8501 ch2 = *s;
8502 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008504 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008507 size--;
8508 }
8509 }
8510#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008512 (categories Z* and C* except ASCII space)
8513 */
8514 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8515 /* Map 8-bit characters to '\xhh' */
8516 if (ucs <= 0xff) {
8517 *p++ = '\\';
8518 *p++ = 'x';
8519 *p++ = hexdigits[(ch >> 4) & 0x000F];
8520 *p++ = hexdigits[ch & 0x000F];
8521 }
8522 /* Map 21-bit characters to '\U00xxxxxx' */
8523 else if (ucs >= 0x10000) {
8524 *p++ = '\\';
8525 *p++ = 'U';
8526 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8527 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8528 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8529 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8530 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8531 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8532 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8533 *p++ = hexdigits[ucs & 0x0000000F];
8534 }
8535 /* Map 16-bit characters to '\uxxxx' */
8536 else {
8537 *p++ = '\\';
8538 *p++ = 'u';
8539 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8540 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8541 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8542 *p++ = hexdigits[ucs & 0x000F];
8543 }
8544 }
8545 /* Copy characters as-is */
8546 else {
8547 *p++ = ch;
8548#ifndef Py_UNICODE_WIDE
8549 if (ucs >= 0x10000)
8550 *p++ = ch2;
8551#endif
8552 }
8553 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008554 }
8555 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008556 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008557
8558 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008559 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008560 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561}
8562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008563PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565\n\
8566Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008567such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568arguments start and end are interpreted as in slice notation.\n\
8569\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008570Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571
8572static PyObject *
8573unicode_rfind(PyUnicodeObject *self, PyObject *args)
8574{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008575 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008576 Py_ssize_t start;
8577 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008578 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579
Christian Heimes9cd17752007-11-18 19:35:23 +00008580 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582
Thomas Wouters477c8d52006-05-27 19:21:47 +00008583 result = stringlib_rfind_slice(
8584 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8585 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8586 start, end
8587 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
8589 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008590
Christian Heimes217cfd12007-12-02 14:31:20 +00008591 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592}
8593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008594PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008597Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598
8599static PyObject *
8600unicode_rindex(PyUnicodeObject *self, PyObject *args)
8601{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008602 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008603 Py_ssize_t start;
8604 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008605 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606
Christian Heimes9cd17752007-11-18 19:35:23 +00008607 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
Thomas Wouters477c8d52006-05-27 19:21:47 +00008610 result = stringlib_rfind_slice(
8611 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8612 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8613 start, end
8614 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615
8616 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008617
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 if (result < 0) {
8619 PyErr_SetString(PyExc_ValueError, "substring not found");
8620 return NULL;
8621 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008622 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623}
8624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008625PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008628Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008629done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630
8631static PyObject *
8632unicode_rjust(PyUnicodeObject *self, PyObject *args)
8633{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008634 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008635 Py_UNICODE fillchar = ' ';
8636
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008637 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 return NULL;
8639
Tim Peters7a29bd52001-09-12 03:03:31 +00008640 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 Py_INCREF(self);
8642 return (PyObject*) self;
8643 }
8644
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008645 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646}
8647
Alexander Belopolsky40018472011-02-26 01:02:56 +00008648PyObject *
8649PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650{
8651 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008652
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 s = PyUnicode_FromObject(s);
8654 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008655 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 if (sep != NULL) {
8657 sep = PyUnicode_FromObject(sep);
8658 if (sep == NULL) {
8659 Py_DECREF(s);
8660 return NULL;
8661 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 }
8663
8664 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8665
8666 Py_DECREF(s);
8667 Py_XDECREF(sep);
8668 return result;
8669}
8670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008671PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673\n\
8674Return a list of the words in S, using sep as the\n\
8675delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008676splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008677whitespace string is a separator and empty strings are\n\
8678removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679
8680static PyObject*
8681unicode_split(PyUnicodeObject *self, PyObject *args)
8682{
8683 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008684 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685
Martin v. Löwis18e16552006-02-15 17:27:45 +00008686 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 return NULL;
8688
8689 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695}
8696
Thomas Wouters477c8d52006-05-27 19:21:47 +00008697PyObject *
8698PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8699{
8700 PyObject* str_obj;
8701 PyObject* sep_obj;
8702 PyObject* out;
8703
8704 str_obj = PyUnicode_FromObject(str_in);
8705 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008707 sep_obj = PyUnicode_FromObject(sep_in);
8708 if (!sep_obj) {
8709 Py_DECREF(str_obj);
8710 return NULL;
8711 }
8712
8713 out = stringlib_partition(
8714 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8715 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8716 );
8717
8718 Py_DECREF(sep_obj);
8719 Py_DECREF(str_obj);
8720
8721 return out;
8722}
8723
8724
8725PyObject *
8726PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8727{
8728 PyObject* str_obj;
8729 PyObject* sep_obj;
8730 PyObject* out;
8731
8732 str_obj = PyUnicode_FromObject(str_in);
8733 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008735 sep_obj = PyUnicode_FromObject(sep_in);
8736 if (!sep_obj) {
8737 Py_DECREF(str_obj);
8738 return NULL;
8739 }
8740
8741 out = stringlib_rpartition(
8742 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8743 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8744 );
8745
8746 Py_DECREF(sep_obj);
8747 Py_DECREF(str_obj);
8748
8749 return out;
8750}
8751
8752PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008754\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008755Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008756the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008757found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008758
8759static PyObject*
8760unicode_partition(PyUnicodeObject *self, PyObject *separator)
8761{
8762 return PyUnicode_Partition((PyObject *)self, separator);
8763}
8764
8765PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008766 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008767\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008768Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008769the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008770separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008771
8772static PyObject*
8773unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8774{
8775 return PyUnicode_RPartition((PyObject *)self, separator);
8776}
8777
Alexander Belopolsky40018472011-02-26 01:02:56 +00008778PyObject *
8779PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008780{
8781 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008782
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008783 s = PyUnicode_FromObject(s);
8784 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008785 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 if (sep != NULL) {
8787 sep = PyUnicode_FromObject(sep);
8788 if (sep == NULL) {
8789 Py_DECREF(s);
8790 return NULL;
8791 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008792 }
8793
8794 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8795
8796 Py_DECREF(s);
8797 Py_XDECREF(sep);
8798 return result;
8799}
8800
8801PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008803\n\
8804Return a list of the words in S, using sep as the\n\
8805delimiter string, starting at the end of the string and\n\
8806working to the front. If maxsplit is given, at most maxsplit\n\
8807splits are done. If sep is not specified, any whitespace string\n\
8808is a separator.");
8809
8810static PyObject*
8811unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8812{
8813 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008814 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008815
Martin v. Löwis18e16552006-02-15 17:27:45 +00008816 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008817 return NULL;
8818
8819 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008821 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008823 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008825}
8826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008827PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829\n\
8830Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008831Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008832is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833
8834static PyObject*
8835unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8836{
Guido van Rossum86662912000-04-11 15:38:46 +00008837 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838
Guido van Rossum86662912000-04-11 15:38:46 +00008839 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 return NULL;
8841
Guido van Rossum86662912000-04-11 15:38:46 +00008842 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843}
8844
8845static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008846PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847{
Walter Dörwald346737f2007-05-31 10:44:43 +00008848 if (PyUnicode_CheckExact(self)) {
8849 Py_INCREF(self);
8850 return self;
8851 } else
8852 /* Subtype -- return genuine unicode string with the same value. */
8853 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8854 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855}
8856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008857PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859\n\
8860Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008861and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862
8863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008864unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866 return fixup(self, fixswapcase);
8867}
8868
Georg Brandlceee0772007-11-27 23:48:05 +00008869PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008871\n\
8872Return a translation table usable for str.translate().\n\
8873If there is only one argument, it must be a dictionary mapping Unicode\n\
8874ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008875Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008876If there are two arguments, they must be strings of equal length, and\n\
8877in the resulting dictionary, each character in x will be mapped to the\n\
8878character at the same position in y. If there is a third argument, it\n\
8879must be a string, whose characters will be mapped to None in the result.");
8880
8881static PyObject*
8882unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8883{
8884 PyObject *x, *y = NULL, *z = NULL;
8885 PyObject *new = NULL, *key, *value;
8886 Py_ssize_t i = 0;
8887 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008888
Georg Brandlceee0772007-11-27 23:48:05 +00008889 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8890 return NULL;
8891 new = PyDict_New();
8892 if (!new)
8893 return NULL;
8894 if (y != NULL) {
8895 /* x must be a string too, of equal length */
8896 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8897 if (!PyUnicode_Check(x)) {
8898 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8899 "be a string if there is a second argument");
8900 goto err;
8901 }
8902 if (PyUnicode_GET_SIZE(x) != ylen) {
8903 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8904 "arguments must have equal length");
8905 goto err;
8906 }
8907 /* create entries for translating chars in x to those in y */
8908 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008909 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8910 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008911 if (!key || !value)
8912 goto err;
8913 res = PyDict_SetItem(new, key, value);
8914 Py_DECREF(key);
8915 Py_DECREF(value);
8916 if (res < 0)
8917 goto err;
8918 }
8919 /* create entries for deleting chars in z */
8920 if (z != NULL) {
8921 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008922 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008923 if (!key)
8924 goto err;
8925 res = PyDict_SetItem(new, key, Py_None);
8926 Py_DECREF(key);
8927 if (res < 0)
8928 goto err;
8929 }
8930 }
8931 } else {
8932 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008933 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008934 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8935 "to maketrans it must be a dict");
8936 goto err;
8937 }
8938 /* copy entries into the new dict, converting string keys to int keys */
8939 while (PyDict_Next(x, &i, &key, &value)) {
8940 if (PyUnicode_Check(key)) {
8941 /* convert string keys to integer keys */
8942 PyObject *newkey;
8943 if (PyUnicode_GET_SIZE(key) != 1) {
8944 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8945 "table must be of length 1");
8946 goto err;
8947 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008948 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008949 if (!newkey)
8950 goto err;
8951 res = PyDict_SetItem(new, newkey, value);
8952 Py_DECREF(newkey);
8953 if (res < 0)
8954 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008955 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008956 /* just keep integer keys */
8957 if (PyDict_SetItem(new, key, value) < 0)
8958 goto err;
8959 } else {
8960 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8961 "be strings or integers");
8962 goto err;
8963 }
8964 }
8965 }
8966 return new;
8967 err:
8968 Py_DECREF(new);
8969 return NULL;
8970}
8971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008972PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974\n\
8975Return a copy of the string S, where all characters have been mapped\n\
8976through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008977Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008978Unmapped characters are left untouched. Characters mapped to None\n\
8979are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980
8981static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008982unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983{
Georg Brandlceee0772007-11-27 23:48:05 +00008984 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985}
8986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008987PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008990Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991
8992static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008993unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 return fixup(self, fixupper);
8996}
8997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008998PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009001Pad a numeric string S with zeros on the left, to fill a field\n\
9002of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003
9004static PyObject *
9005unicode_zfill(PyUnicodeObject *self, PyObject *args)
9006{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009007 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 PyUnicodeObject *u;
9009
Martin v. Löwis18e16552006-02-15 17:27:45 +00009010 Py_ssize_t width;
9011 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 return NULL;
9013
9014 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009015 if (PyUnicode_CheckExact(self)) {
9016 Py_INCREF(self);
9017 return (PyObject*) self;
9018 }
9019 else
9020 return PyUnicode_FromUnicode(
9021 PyUnicode_AS_UNICODE(self),
9022 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024 }
9025
9026 fill = width - self->length;
9027
9028 u = pad(self, fill, 0, '0');
9029
Walter Dörwald068325e2002-04-15 13:36:47 +00009030 if (u == NULL)
9031 return NULL;
9032
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 if (u->str[fill] == '+' || u->str[fill] == '-') {
9034 /* move sign to beginning of string */
9035 u->str[0] = u->str[fill];
9036 u->str[fill] = '0';
9037 }
9038
9039 return (PyObject*) u;
9040}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041
9042#if 0
9043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009044unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045{
Christian Heimes2202f872008-02-06 14:31:34 +00009046 return PyLong_FromLong(numfree);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047}
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009048
9049static PyObject *
9050unicode__decimal2ascii(PyObject *self)
9051{
9052 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
9053 PyUnicode_GET_SIZE(self));
9054}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055#endif
9056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009057PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009058 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009060Return True if S starts with the specified prefix, False otherwise.\n\
9061With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009062With optional end, stop comparing S at that position.\n\
9063prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064
9065static PyObject *
9066unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009067 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009069 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009071 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009072 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009073 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009075 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9077 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009078 if (PyTuple_Check(subobj)) {
9079 Py_ssize_t i;
9080 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9081 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009083 if (substring == NULL)
9084 return NULL;
9085 result = tailmatch(self, substring, start, end, -1);
9086 Py_DECREF(substring);
9087 if (result) {
9088 Py_RETURN_TRUE;
9089 }
9090 }
9091 /* nothing matched */
9092 Py_RETURN_FALSE;
9093 }
9094 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009097 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009099 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100}
9101
9102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009103PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009104 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009106Return True if S ends with the specified suffix, False otherwise.\n\
9107With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009108With optional end, stop comparing S at that position.\n\
9109suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110
9111static PyObject *
9112unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009115 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009117 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009118 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009119 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009121 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9123 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009124 if (PyTuple_Check(subobj)) {
9125 Py_ssize_t i;
9126 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9127 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009129 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009131 result = tailmatch(self, substring, start, end, +1);
9132 Py_DECREF(substring);
9133 if (result) {
9134 Py_RETURN_TRUE;
9135 }
9136 }
9137 Py_RETURN_FALSE;
9138 }
9139 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009143 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009145 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146}
9147
Eric Smith8c663262007-08-25 02:26:07 +00009148#include "stringlib/string_format.h"
9149
9150PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009152\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009153Return a formatted version of S, using substitutions from args and kwargs.\n\
9154The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009155
Eric Smith27bbca62010-11-04 17:06:58 +00009156PyDoc_STRVAR(format_map__doc__,
9157 "S.format_map(mapping) -> str\n\
9158\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009159Return a formatted version of S, using substitutions from mapping.\n\
9160The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009161
Eric Smith4a7d76d2008-05-30 18:10:19 +00009162static PyObject *
9163unicode__format__(PyObject* self, PyObject* args)
9164{
9165 PyObject *format_spec;
9166
9167 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9168 return NULL;
9169
9170 return _PyUnicode_FormatAdvanced(self,
9171 PyUnicode_AS_UNICODE(format_spec),
9172 PyUnicode_GET_SIZE(format_spec));
9173}
9174
Eric Smith8c663262007-08-25 02:26:07 +00009175PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009176 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009177\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009178Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009179
9180static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009181unicode__sizeof__(PyUnicodeObject *v)
9182{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009183 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9184 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009185}
9186
9187PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009189
9190static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009191unicode_getnewargs(PyUnicodeObject *v)
9192{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009193 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009194}
9195
/* Method table for the str type.
   Each entry is {python name, C implementation, calling-convention flags,
   docstring}; the entries that omit the docstring field rely on default
   initialization of the trailing member.  The table ends with the
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially.  */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry registered as a static method */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
9260
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009261static PyObject *
9262unicode_mod(PyObject *v, PyObject *w)
9263{
Benjamin Peterson29060642009-01-31 22:14:21 +00009264 if (!PyUnicode_Check(v)) {
9265 Py_INCREF(Py_NotImplemented);
9266 return Py_NotImplemented;
9267 }
9268 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009269}
9270
/* Number-protocol slot table for str.  Only nb_remainder is filled in
   (with unicode_mod); the three slots before it are explicitly 0 and the
   slots after it are not listed in this initializer. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
9277
/* Sequence-protocol slot table for str: length, concatenation, repetition,
   item access and containment are provided; the slice and assignment
   slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
9288
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009289static PyObject*
9290unicode_subscript(PyUnicodeObject* self, PyObject* item)
9291{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009292 if (PyIndex_Check(item)) {
9293 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009294 if (i == -1 && PyErr_Occurred())
9295 return NULL;
9296 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009297 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009298 return unicode_getitem(self, i);
9299 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009300 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009301 Py_UNICODE* source_buf;
9302 Py_UNICODE* result_buf;
9303 PyObject* result;
9304
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009305 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009306 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009307 return NULL;
9308 }
9309
9310 if (slicelength <= 0) {
9311 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009312 } else if (start == 0 && step == 1 && slicelength == self->length &&
9313 PyUnicode_CheckExact(self)) {
9314 Py_INCREF(self);
9315 return (PyObject *)self;
9316 } else if (step == 1) {
9317 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009318 } else {
9319 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009320 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9321 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009322
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 if (result_buf == NULL)
9324 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009325
9326 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9327 result_buf[i] = source_buf[cur];
9328 }
Tim Petersced69f82003-09-16 20:30:58 +00009329
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009330 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009331 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009332 return result;
9333 }
9334 } else {
9335 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9336 return NULL;
9337 }
9338}
9339
/* Mapping-protocol slot table for str: length and subscripting are
   provided; the assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
9345
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347/* Helpers for PyUnicode_Format() */
9348
9349static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009350getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009352 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 (*p_argidx)++;
9355 if (arglen < 0)
9356 return args;
9357 else
9358 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 }
9360 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 return NULL;
9363}
9364
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009365/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009367static PyObject *
9368formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009370 char *p;
9371 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009373
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374 x = PyFloat_AsDouble(v);
9375 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009376 return NULL;
9377
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009380
Eric Smith0923d1d2009-04-16 20:16:10 +00009381 p = PyOS_double_to_string(x, type, prec,
9382 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009383 if (p == NULL)
9384 return NULL;
9385 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009386 PyMem_Free(p);
9387 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388}
9389
Tim Peters38fd5b62000-09-21 05:43:11 +00009390static PyObject*
9391formatlong(PyObject *val, int flags, int prec, int type)
9392{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009393 char *buf;
9394 int len;
9395 PyObject *str; /* temporary string object. */
9396 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009397
Benjamin Peterson14339b62009-01-31 16:36:08 +00009398 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9399 if (!str)
9400 return NULL;
9401 result = PyUnicode_FromStringAndSize(buf, len);
9402 Py_DECREF(str);
9403 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009404}
9405
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406static int
9407formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009408 size_t buflen,
9409 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009411 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009412 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009413 if (PyUnicode_GET_SIZE(v) == 1) {
9414 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9415 buf[1] = '\0';
9416 return 1;
9417 }
9418#ifndef Py_UNICODE_WIDE
9419 if (PyUnicode_GET_SIZE(v) == 2) {
9420 /* Decode a valid surrogate pair */
9421 int c0 = PyUnicode_AS_UNICODE(v)[0];
9422 int c1 = PyUnicode_AS_UNICODE(v)[1];
9423 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9424 0xDC00 <= c1 && c1 <= 0xDFFF) {
9425 buf[0] = c0;
9426 buf[1] = c1;
9427 buf[2] = '\0';
9428 return 2;
9429 }
9430 }
9431#endif
9432 goto onError;
9433 }
9434 else {
9435 /* Integer input truncated to a character */
9436 long x;
9437 x = PyLong_AsLong(v);
9438 if (x == -1 && PyErr_Occurred())
9439 goto onError;
9440
9441 if (x < 0 || x > 0x10ffff) {
9442 PyErr_SetString(PyExc_OverflowError,
9443 "%c arg not in range(0x110000)");
9444 return -1;
9445 }
9446
9447#ifndef Py_UNICODE_WIDE
9448 if (x > 0xffff) {
9449 x -= 0x10000;
9450 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9451 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9452 return 2;
9453 }
9454#endif
9455 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009456 buf[1] = '\0';
9457 return 1;
9458 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009459
Benjamin Peterson29060642009-01-31 22:14:21 +00009460 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009461 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009462 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009463 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464}
9465
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009466/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009467 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009468*/
/* Parenthesized so the cast expression expands safely in any context. */
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009470
Alexander Belopolsky40018472011-02-26 01:02:56 +00009471PyObject *
9472PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473{
9474 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009475 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 int args_owned = 0;
9477 PyUnicodeObject *result = NULL;
9478 PyObject *dict = NULL;
9479 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009480
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 PyErr_BadInternalCall();
9483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484 }
9485 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009486 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 fmt = PyUnicode_AS_UNICODE(uformat);
9489 fmtcnt = PyUnicode_GET_SIZE(uformat);
9490
9491 reslen = rescnt = fmtcnt + 100;
9492 result = _PyUnicode_New(reslen);
9493 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 res = PyUnicode_AS_UNICODE(result);
9496
9497 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 arglen = PyTuple_Size(args);
9499 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 }
9501 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009502 arglen = -1;
9503 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009505 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009506 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508
9509 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 if (*fmt != '%') {
9511 if (--rescnt < 0) {
9512 rescnt = fmtcnt + 100;
9513 reslen += rescnt;
9514 if (_PyUnicode_Resize(&result, reslen) < 0)
9515 goto onError;
9516 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9517 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009518 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009520 }
9521 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 /* Got a format specifier */
9523 int flags = 0;
9524 Py_ssize_t width = -1;
9525 int prec = -1;
9526 Py_UNICODE c = '\0';
9527 Py_UNICODE fill;
9528 int isnumok;
9529 PyObject *v = NULL;
9530 PyObject *temp = NULL;
9531 Py_UNICODE *pbuf;
9532 Py_UNICODE sign;
9533 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009534 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535
Benjamin Peterson29060642009-01-31 22:14:21 +00009536 fmt++;
9537 if (*fmt == '(') {
9538 Py_UNICODE *keystart;
9539 Py_ssize_t keylen;
9540 PyObject *key;
9541 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009542
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 if (dict == NULL) {
9544 PyErr_SetString(PyExc_TypeError,
9545 "format requires a mapping");
9546 goto onError;
9547 }
9548 ++fmt;
9549 --fmtcnt;
9550 keystart = fmt;
9551 /* Skip over balanced parentheses */
9552 while (pcount > 0 && --fmtcnt >= 0) {
9553 if (*fmt == ')')
9554 --pcount;
9555 else if (*fmt == '(')
9556 ++pcount;
9557 fmt++;
9558 }
9559 keylen = fmt - keystart - 1;
9560 if (fmtcnt < 0 || pcount > 0) {
9561 PyErr_SetString(PyExc_ValueError,
9562 "incomplete format key");
9563 goto onError;
9564 }
9565#if 0
9566 /* keys are converted to strings using UTF-8 and
9567 then looked up since Python uses strings to hold
9568 variables names etc. in its namespaces and we
9569 wouldn't want to break common idioms. */
9570 key = PyUnicode_EncodeUTF8(keystart,
9571 keylen,
9572 NULL);
9573#else
9574 key = PyUnicode_FromUnicode(keystart, keylen);
9575#endif
9576 if (key == NULL)
9577 goto onError;
9578 if (args_owned) {
9579 Py_DECREF(args);
9580 args_owned = 0;
9581 }
9582 args = PyObject_GetItem(dict, key);
9583 Py_DECREF(key);
9584 if (args == NULL) {
9585 goto onError;
9586 }
9587 args_owned = 1;
9588 arglen = -1;
9589 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 while (--fmtcnt >= 0) {
9592 switch (c = *fmt++) {
9593 case '-': flags |= F_LJUST; continue;
9594 case '+': flags |= F_SIGN; continue;
9595 case ' ': flags |= F_BLANK; continue;
9596 case '#': flags |= F_ALT; continue;
9597 case '0': flags |= F_ZERO; continue;
9598 }
9599 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009601 if (c == '*') {
9602 v = getnextarg(args, arglen, &argidx);
9603 if (v == NULL)
9604 goto onError;
9605 if (!PyLong_Check(v)) {
9606 PyErr_SetString(PyExc_TypeError,
9607 "* wants int");
9608 goto onError;
9609 }
9610 width = PyLong_AsLong(v);
9611 if (width == -1 && PyErr_Occurred())
9612 goto onError;
9613 if (width < 0) {
9614 flags |= F_LJUST;
9615 width = -width;
9616 }
9617 if (--fmtcnt >= 0)
9618 c = *fmt++;
9619 }
9620 else if (c >= '0' && c <= '9') {
9621 width = c - '0';
9622 while (--fmtcnt >= 0) {
9623 c = *fmt++;
9624 if (c < '0' || c > '9')
9625 break;
9626 if ((width*10) / 10 != width) {
9627 PyErr_SetString(PyExc_ValueError,
9628 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009629 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009630 }
9631 width = width*10 + (c - '0');
9632 }
9633 }
9634 if (c == '.') {
9635 prec = 0;
9636 if (--fmtcnt >= 0)
9637 c = *fmt++;
9638 if (c == '*') {
9639 v = getnextarg(args, arglen, &argidx);
9640 if (v == NULL)
9641 goto onError;
9642 if (!PyLong_Check(v)) {
9643 PyErr_SetString(PyExc_TypeError,
9644 "* wants int");
9645 goto onError;
9646 }
9647 prec = PyLong_AsLong(v);
9648 if (prec == -1 && PyErr_Occurred())
9649 goto onError;
9650 if (prec < 0)
9651 prec = 0;
9652 if (--fmtcnt >= 0)
9653 c = *fmt++;
9654 }
9655 else if (c >= '0' && c <= '9') {
9656 prec = c - '0';
9657 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009658 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 if (c < '0' || c > '9')
9660 break;
9661 if ((prec*10) / 10 != prec) {
9662 PyErr_SetString(PyExc_ValueError,
9663 "prec too big");
9664 goto onError;
9665 }
9666 prec = prec*10 + (c - '0');
9667 }
9668 }
9669 } /* prec */
9670 if (fmtcnt >= 0) {
9671 if (c == 'h' || c == 'l' || c == 'L') {
9672 if (--fmtcnt >= 0)
9673 c = *fmt++;
9674 }
9675 }
9676 if (fmtcnt < 0) {
9677 PyErr_SetString(PyExc_ValueError,
9678 "incomplete format");
9679 goto onError;
9680 }
9681 if (c != '%') {
9682 v = getnextarg(args, arglen, &argidx);
9683 if (v == NULL)
9684 goto onError;
9685 }
9686 sign = 0;
9687 fill = ' ';
9688 switch (c) {
9689
9690 case '%':
9691 pbuf = formatbuf;
9692 /* presume that buffer length is at least 1 */
9693 pbuf[0] = '%';
9694 len = 1;
9695 break;
9696
9697 case 's':
9698 case 'r':
9699 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009700 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009701 temp = v;
9702 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009703 }
9704 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009705 if (c == 's')
9706 temp = PyObject_Str(v);
9707 else if (c == 'r')
9708 temp = PyObject_Repr(v);
9709 else
9710 temp = PyObject_ASCII(v);
9711 if (temp == NULL)
9712 goto onError;
9713 if (PyUnicode_Check(temp))
9714 /* nothing to do */;
9715 else {
9716 Py_DECREF(temp);
9717 PyErr_SetString(PyExc_TypeError,
9718 "%s argument has non-string str()");
9719 goto onError;
9720 }
9721 }
9722 pbuf = PyUnicode_AS_UNICODE(temp);
9723 len = PyUnicode_GET_SIZE(temp);
9724 if (prec >= 0 && len > prec)
9725 len = prec;
9726 break;
9727
9728 case 'i':
9729 case 'd':
9730 case 'u':
9731 case 'o':
9732 case 'x':
9733 case 'X':
9734 if (c == 'i')
9735 c = 'd';
9736 isnumok = 0;
9737 if (PyNumber_Check(v)) {
9738 PyObject *iobj=NULL;
9739
9740 if (PyLong_Check(v)) {
9741 iobj = v;
9742 Py_INCREF(iobj);
9743 }
9744 else {
9745 iobj = PyNumber_Long(v);
9746 }
9747 if (iobj!=NULL) {
9748 if (PyLong_Check(iobj)) {
9749 isnumok = 1;
9750 temp = formatlong(iobj, flags, prec, c);
9751 Py_DECREF(iobj);
9752 if (!temp)
9753 goto onError;
9754 pbuf = PyUnicode_AS_UNICODE(temp);
9755 len = PyUnicode_GET_SIZE(temp);
9756 sign = 1;
9757 }
9758 else {
9759 Py_DECREF(iobj);
9760 }
9761 }
9762 }
9763 if (!isnumok) {
9764 PyErr_Format(PyExc_TypeError,
9765 "%%%c format: a number is required, "
9766 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9767 goto onError;
9768 }
9769 if (flags & F_ZERO)
9770 fill = '0';
9771 break;
9772
9773 case 'e':
9774 case 'E':
9775 case 'f':
9776 case 'F':
9777 case 'g':
9778 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009779 temp = formatfloat(v, flags, prec, c);
9780 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009781 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009782 pbuf = PyUnicode_AS_UNICODE(temp);
9783 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 sign = 1;
9785 if (flags & F_ZERO)
9786 fill = '0';
9787 break;
9788
9789 case 'c':
9790 pbuf = formatbuf;
9791 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9792 if (len < 0)
9793 goto onError;
9794 break;
9795
9796 default:
9797 PyErr_Format(PyExc_ValueError,
9798 "unsupported format character '%c' (0x%x) "
9799 "at index %zd",
9800 (31<=c && c<=126) ? (char)c : '?',
9801 (int)c,
9802 (Py_ssize_t)(fmt - 1 -
9803 PyUnicode_AS_UNICODE(uformat)));
9804 goto onError;
9805 }
9806 if (sign) {
9807 if (*pbuf == '-' || *pbuf == '+') {
9808 sign = *pbuf++;
9809 len--;
9810 }
9811 else if (flags & F_SIGN)
9812 sign = '+';
9813 else if (flags & F_BLANK)
9814 sign = ' ';
9815 else
9816 sign = 0;
9817 }
9818 if (width < len)
9819 width = len;
9820 if (rescnt - (sign != 0) < width) {
9821 reslen -= rescnt;
9822 rescnt = width + fmtcnt + 100;
9823 reslen += rescnt;
9824 if (reslen < 0) {
9825 Py_XDECREF(temp);
9826 PyErr_NoMemory();
9827 goto onError;
9828 }
9829 if (_PyUnicode_Resize(&result, reslen) < 0) {
9830 Py_XDECREF(temp);
9831 goto onError;
9832 }
9833 res = PyUnicode_AS_UNICODE(result)
9834 + reslen - rescnt;
9835 }
9836 if (sign) {
9837 if (fill != ' ')
9838 *res++ = sign;
9839 rescnt--;
9840 if (width > len)
9841 width--;
9842 }
9843 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9844 assert(pbuf[0] == '0');
9845 assert(pbuf[1] == c);
9846 if (fill != ' ') {
9847 *res++ = *pbuf++;
9848 *res++ = *pbuf++;
9849 }
9850 rescnt -= 2;
9851 width -= 2;
9852 if (width < 0)
9853 width = 0;
9854 len -= 2;
9855 }
9856 if (width > len && !(flags & F_LJUST)) {
9857 do {
9858 --rescnt;
9859 *res++ = fill;
9860 } while (--width > len);
9861 }
9862 if (fill == ' ') {
9863 if (sign)
9864 *res++ = sign;
9865 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9866 assert(pbuf[0] == '0');
9867 assert(pbuf[1] == c);
9868 *res++ = *pbuf++;
9869 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009870 }
9871 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009872 Py_UNICODE_COPY(res, pbuf, len);
9873 res += len;
9874 rescnt -= len;
9875 while (--width >= len) {
9876 --rescnt;
9877 *res++ = ' ';
9878 }
9879 if (dict && (argidx < arglen) && c != '%') {
9880 PyErr_SetString(PyExc_TypeError,
9881 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009882 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009883 goto onError;
9884 }
9885 Py_XDECREF(temp);
9886 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887 } /* until end */
9888 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009889 PyErr_SetString(PyExc_TypeError,
9890 "not all arguments converted during string formatting");
9891 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892 }
9893
Thomas Woutersa96affe2006-03-12 00:29:36 +00009894 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009895 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009897 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898 }
9899 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900 return (PyObject *)result;
9901
Benjamin Peterson29060642009-01-31 22:14:21 +00009902 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 Py_XDECREF(result);
9904 Py_DECREF(uformat);
9905 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009906 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 }
9908 return NULL;
9909}
9910
Jeremy Hylton938ace62002-07-17 16:30:39 +00009911static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009912unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9913
Tim Peters6d6c1a32001-08-02 04:15:00 +00009914static PyObject *
9915unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9916{
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009918 static char *kwlist[] = {"object", "encoding", "errors", 0};
9919 char *encoding = NULL;
9920 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009921
Benjamin Peterson14339b62009-01-31 16:36:08 +00009922 if (type != &PyUnicode_Type)
9923 return unicode_subtype_new(type, args, kwds);
9924 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009925 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009926 return NULL;
9927 if (x == NULL)
9928 return (PyObject *)_PyUnicode_New(0);
9929 if (encoding == NULL && errors == NULL)
9930 return PyObject_Str(x);
9931 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009932 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009933}
9934
Guido van Rossume023fe02001-08-30 03:12:59 +00009935static PyObject *
9936unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9937{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009938 PyUnicodeObject *tmp, *pnew;
9939 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009940
Benjamin Peterson14339b62009-01-31 16:36:08 +00009941 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9942 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9943 if (tmp == NULL)
9944 return NULL;
9945 assert(PyUnicode_Check(tmp));
9946 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9947 if (pnew == NULL) {
9948 Py_DECREF(tmp);
9949 return NULL;
9950 }
9951 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9952 if (pnew->str == NULL) {
9953 _Py_ForgetReference((PyObject *)pnew);
9954 PyObject_Del(pnew);
9955 Py_DECREF(tmp);
9956 return PyErr_NoMemory();
9957 }
9958 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9959 pnew->length = n;
9960 pnew->hash = tmp->hash;
9961 Py_DECREF(tmp);
9962 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009963}
9964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009965PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009966 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009967\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009968Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009969encoding defaults to the current default string encoding.\n\
9970errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009971
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009972static PyObject *unicode_iter(PyObject *seq);
9973
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009975 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009976 "str", /* tp_name */
9977 sizeof(PyUnicodeObject), /* tp_size */
9978 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009980 (destructor)unicode_dealloc, /* tp_dealloc */
9981 0, /* tp_print */
9982 0, /* tp_getattr */
9983 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009984 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009985 unicode_repr, /* tp_repr */
9986 &unicode_as_number, /* tp_as_number */
9987 &unicode_as_sequence, /* tp_as_sequence */
9988 &unicode_as_mapping, /* tp_as_mapping */
9989 (hashfunc) unicode_hash, /* tp_hash*/
9990 0, /* tp_call*/
9991 (reprfunc) unicode_str, /* tp_str */
9992 PyObject_GenericGetAttr, /* tp_getattro */
9993 0, /* tp_setattro */
9994 0, /* tp_as_buffer */
9995 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009996 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009997 unicode_doc, /* tp_doc */
9998 0, /* tp_traverse */
9999 0, /* tp_clear */
10000 PyUnicode_RichCompare, /* tp_richcompare */
10001 0, /* tp_weaklistoffset */
10002 unicode_iter, /* tp_iter */
10003 0, /* tp_iternext */
10004 unicode_methods, /* tp_methods */
10005 0, /* tp_members */
10006 0, /* tp_getset */
10007 &PyBaseObject_Type, /* tp_base */
10008 0, /* tp_dict */
10009 0, /* tp_descr_get */
10010 0, /* tp_descr_set */
10011 0, /* tp_dictoffset */
10012 0, /* tp_init */
10013 0, /* tp_alloc */
10014 unicode_new, /* tp_new */
10015 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016};
10017
10018/* Initialize the Unicode implementation */
10019
Thomas Wouters78890102000-07-22 19:25:51 +000010020void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010022 int i;
10023
Thomas Wouters477c8d52006-05-27 19:21:47 +000010024 /* XXX - move this array to unicodectype.c ? */
10025 Py_UNICODE linebreak[] = {
10026 0x000A, /* LINE FEED */
10027 0x000D, /* CARRIAGE RETURN */
10028 0x001C, /* FILE SEPARATOR */
10029 0x001D, /* GROUP SEPARATOR */
10030 0x001E, /* RECORD SEPARATOR */
10031 0x0085, /* NEXT LINE */
10032 0x2028, /* LINE SEPARATOR */
10033 0x2029, /* PARAGRAPH SEPARATOR */
10034 };
10035
Fred Drakee4315f52000-05-09 19:53:39 +000010036 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010037 free_list = NULL;
10038 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010040 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010041 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010042
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010043 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010044 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010045 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010046 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010047
10048 /* initialize the linebreak bloom filter */
10049 bloom_linebreak = make_bloom_mask(
10050 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10051 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010052
10053 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010054}
10055
10056/* Finalize the Unicode implementation */
10057
Christian Heimesa156e092008-02-16 07:38:31 +000010058int
10059PyUnicode_ClearFreeList(void)
10060{
10061 int freelist_size = numfree;
10062 PyUnicodeObject *u;
10063
10064 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010065 PyUnicodeObject *v = u;
10066 u = *(PyUnicodeObject **)u;
10067 if (v->str)
10068 PyObject_DEL(v->str);
10069 Py_XDECREF(v->defenc);
10070 PyObject_Del(v);
10071 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010072 }
10073 free_list = NULL;
10074 assert(numfree == 0);
10075 return freelist_size;
10076}
10077
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078void
Thomas Wouters78890102000-07-22 19:25:51 +000010079_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010081 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010083 Py_XDECREF(unicode_empty);
10084 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010085
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010086 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010087 if (unicode_latin1[i]) {
10088 Py_DECREF(unicode_latin1[i]);
10089 unicode_latin1[i] = NULL;
10090 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010091 }
Christian Heimesa156e092008-02-16 07:38:31 +000010092 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010094
Walter Dörwald16807132007-05-25 13:52:07 +000010095void
10096PyUnicode_InternInPlace(PyObject **p)
10097{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010098 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10099 PyObject *t;
10100 if (s == NULL || !PyUnicode_Check(s))
10101 Py_FatalError(
10102 "PyUnicode_InternInPlace: unicode strings only please!");
10103 /* If it's a subclass, we don't really know what putting
10104 it in the interned dict might do. */
10105 if (!PyUnicode_CheckExact(s))
10106 return;
10107 if (PyUnicode_CHECK_INTERNED(s))
10108 return;
10109 if (interned == NULL) {
10110 interned = PyDict_New();
10111 if (interned == NULL) {
10112 PyErr_Clear(); /* Don't leave an exception */
10113 return;
10114 }
10115 }
10116 /* It might be that the GetItem call fails even
10117 though the key is present in the dictionary,
10118 namely when this happens during a stack overflow. */
10119 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010120 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010121 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010122
Benjamin Peterson29060642009-01-31 22:14:21 +000010123 if (t) {
10124 Py_INCREF(t);
10125 Py_DECREF(*p);
10126 *p = t;
10127 return;
10128 }
Walter Dörwald16807132007-05-25 13:52:07 +000010129
Benjamin Peterson14339b62009-01-31 16:36:08 +000010130 PyThreadState_GET()->recursion_critical = 1;
10131 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10132 PyErr_Clear();
10133 PyThreadState_GET()->recursion_critical = 0;
10134 return;
10135 }
10136 PyThreadState_GET()->recursion_critical = 0;
10137 /* The two references in interned are not counted by refcnt.
10138 The deallocator will take care of this */
10139 Py_REFCNT(s) -= 2;
10140 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010141}
10142
10143void
10144PyUnicode_InternImmortal(PyObject **p)
10145{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010146 PyUnicode_InternInPlace(p);
10147 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10148 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10149 Py_INCREF(*p);
10150 }
Walter Dörwald16807132007-05-25 13:52:07 +000010151}
10152
10153PyObject *
10154PyUnicode_InternFromString(const char *cp)
10155{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010156 PyObject *s = PyUnicode_FromString(cp);
10157 if (s == NULL)
10158 return NULL;
10159 PyUnicode_InternInPlace(&s);
10160 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010161}
10162
Alexander Belopolsky40018472011-02-26 01:02:56 +000010163void
10164_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010165{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010166 PyObject *keys;
10167 PyUnicodeObject *s;
10168 Py_ssize_t i, n;
10169 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010170
Benjamin Peterson14339b62009-01-31 16:36:08 +000010171 if (interned == NULL || !PyDict_Check(interned))
10172 return;
10173 keys = PyDict_Keys(interned);
10174 if (keys == NULL || !PyList_Check(keys)) {
10175 PyErr_Clear();
10176 return;
10177 }
Walter Dörwald16807132007-05-25 13:52:07 +000010178
Benjamin Peterson14339b62009-01-31 16:36:08 +000010179 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10180 detector, interned unicode strings are not forcibly deallocated;
10181 rather, we give them their stolen references back, and then clear
10182 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010183
Benjamin Peterson14339b62009-01-31 16:36:08 +000010184 n = PyList_GET_SIZE(keys);
10185 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010187 for (i = 0; i < n; i++) {
10188 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10189 switch (s->state) {
10190 case SSTATE_NOT_INTERNED:
10191 /* XXX Shouldn't happen */
10192 break;
10193 case SSTATE_INTERNED_IMMORTAL:
10194 Py_REFCNT(s) += 1;
10195 immortal_size += s->length;
10196 break;
10197 case SSTATE_INTERNED_MORTAL:
10198 Py_REFCNT(s) += 2;
10199 mortal_size += s->length;
10200 break;
10201 default:
10202 Py_FatalError("Inconsistent interned string state.");
10203 }
10204 s->state = SSTATE_NOT_INTERNED;
10205 }
10206 fprintf(stderr, "total size of all interned strings: "
10207 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10208 "mortal/immortal\n", mortal_size, immortal_size);
10209 Py_DECREF(keys);
10210 PyDict_Clear(interned);
10211 Py_DECREF(interned);
10212 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010213}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010214
10215
10216/********************* Unicode Iterator **************************/
10217
10218typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010219 PyObject_HEAD
10220 Py_ssize_t it_index;
10221 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010222} unicodeiterobject;
10223
10224static void
10225unicodeiter_dealloc(unicodeiterobject *it)
10226{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010227 _PyObject_GC_UNTRACK(it);
10228 Py_XDECREF(it->it_seq);
10229 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010230}
10231
10232static int
10233unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10234{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010235 Py_VISIT(it->it_seq);
10236 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010237}
10238
10239static PyObject *
10240unicodeiter_next(unicodeiterobject *it)
10241{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010242 PyUnicodeObject *seq;
10243 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010244
Benjamin Peterson14339b62009-01-31 16:36:08 +000010245 assert(it != NULL);
10246 seq = it->it_seq;
10247 if (seq == NULL)
10248 return NULL;
10249 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010250
Benjamin Peterson14339b62009-01-31 16:36:08 +000010251 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10252 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010253 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010254 if (item != NULL)
10255 ++it->it_index;
10256 return item;
10257 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010258
Benjamin Peterson14339b62009-01-31 16:36:08 +000010259 Py_DECREF(seq);
10260 it->it_seq = NULL;
10261 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010262}
10263
10264static PyObject *
10265unicodeiter_len(unicodeiterobject *it)
10266{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010267 Py_ssize_t len = 0;
10268 if (it->it_seq)
10269 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10270 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010271}
10272
10273PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10274
10275static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010276 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010277 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010278 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010279};
10280
10281PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010282 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10283 "str_iterator", /* tp_name */
10284 sizeof(unicodeiterobject), /* tp_basicsize */
10285 0, /* tp_itemsize */
10286 /* methods */
10287 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10288 0, /* tp_print */
10289 0, /* tp_getattr */
10290 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010291 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010292 0, /* tp_repr */
10293 0, /* tp_as_number */
10294 0, /* tp_as_sequence */
10295 0, /* tp_as_mapping */
10296 0, /* tp_hash */
10297 0, /* tp_call */
10298 0, /* tp_str */
10299 PyObject_GenericGetAttr, /* tp_getattro */
10300 0, /* tp_setattro */
10301 0, /* tp_as_buffer */
10302 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10303 0, /* tp_doc */
10304 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10305 0, /* tp_clear */
10306 0, /* tp_richcompare */
10307 0, /* tp_weaklistoffset */
10308 PyObject_SelfIter, /* tp_iter */
10309 (iternextfunc)unicodeiter_next, /* tp_iternext */
10310 unicodeiter_methods, /* tp_methods */
10311 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010312};
10313
10314static PyObject *
10315unicode_iter(PyObject *seq)
10316{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010317 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010318
Benjamin Peterson14339b62009-01-31 16:36:08 +000010319 if (!PyUnicode_Check(seq)) {
10320 PyErr_BadInternalCall();
10321 return NULL;
10322 }
10323 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10324 if (it == NULL)
10325 return NULL;
10326 it->it_index = 0;
10327 Py_INCREF(seq);
10328 it->it_seq = (PyUnicodeObject *)seq;
10329 _PyObject_GC_TRACK(it);
10330 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010331}
10332
Martin v. Löwis5b222132007-06-10 09:51:05 +000010333size_t
10334Py_UNICODE_strlen(const Py_UNICODE *u)
10335{
10336 int res = 0;
10337 while(*u++)
10338 res++;
10339 return res;
10340}
10341
10342Py_UNICODE*
10343Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10344{
10345 Py_UNICODE *u = s1;
10346 while ((*u++ = *s2++));
10347 return s1;
10348}
10349
10350Py_UNICODE*
10351Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10352{
10353 Py_UNICODE *u = s1;
10354 while ((*u++ = *s2++))
10355 if (n-- == 0)
10356 break;
10357 return s1;
10358}
10359
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010360Py_UNICODE*
10361Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10362{
10363 Py_UNICODE *u1 = s1;
10364 u1 += Py_UNICODE_strlen(u1);
10365 Py_UNICODE_strcpy(u1, s2);
10366 return s1;
10367}
10368
Martin v. Löwis5b222132007-06-10 09:51:05 +000010369int
10370Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10371{
10372 while (*s1 && *s2 && *s1 == *s2)
10373 s1++, s2++;
10374 if (*s1 && *s2)
10375 return (*s1 < *s2) ? -1 : +1;
10376 if (*s1)
10377 return 1;
10378 if (*s2)
10379 return -1;
10380 return 0;
10381}
10382
Victor Stinneref8d95c2010-08-16 22:03:11 +000010383int
10384Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10385{
10386 register Py_UNICODE u1, u2;
10387 for (; n != 0; n--) {
10388 u1 = *s1;
10389 u2 = *s2;
10390 if (u1 != u2)
10391 return (u1 < u2) ? -1 : +1;
10392 if (u1 == '\0')
10393 return 0;
10394 s1++;
10395 s2++;
10396 }
10397 return 0;
10398}
10399
Martin v. Löwis5b222132007-06-10 09:51:05 +000010400Py_UNICODE*
10401Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10402{
10403 const Py_UNICODE *p;
10404 for (p = s; *p; p++)
10405 if (*p == c)
10406 return (Py_UNICODE*)p;
10407 return NULL;
10408}
10409
Victor Stinner331ea922010-08-10 16:37:20 +000010410Py_UNICODE*
10411Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10412{
10413 const Py_UNICODE *p;
10414 p = s + Py_UNICODE_strlen(s);
10415 while (p != s) {
10416 p--;
10417 if (*p == c)
10418 return (Py_UNICODE*)p;
10419 }
10420 return NULL;
10421}
10422
Victor Stinner71133ff2010-09-01 23:43:53 +000010423Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010424PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010425{
10426 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10427 Py_UNICODE *copy;
10428 Py_ssize_t size;
10429
10430 /* Ensure we won't overflow the size. */
10431 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10432 PyErr_NoMemory();
10433 return NULL;
10434 }
10435 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10436 size *= sizeof(Py_UNICODE);
10437 copy = PyMem_Malloc(size);
10438 if (copy == NULL) {
10439 PyErr_NoMemory();
10440 return NULL;
10441 }
10442 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10443 return copy;
10444}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010445
Georg Brandl66c221e2010-10-14 07:04:07 +000010446/* A _string module, to export formatter_parser and formatter_field_name_split
10447 to the string.Formatter class implemented in Python. */
10448
10449static PyMethodDef _string_methods[] = {
10450 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10451 METH_O, PyDoc_STR("split the argument as a field name")},
10452 {"formatter_parser", (PyCFunction) formatter_parser,
10453 METH_O, PyDoc_STR("parse the argument as a format string")},
10454 {NULL, NULL}
10455};
10456
10457static struct PyModuleDef _string_module = {
10458 PyModuleDef_HEAD_INIT,
10459 "_string",
10460 PyDoc_STR("string helper module"),
10461 0,
10462 _string_methods,
10463 NULL,
10464 NULL,
10465 NULL,
10466 NULL
10467};
10468
10469PyMODINIT_FUNC
10470PyInit__string(void)
10471{
10472 return PyModule_Create(&_string_module);
10473}
10474
10475
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010476#ifdef __cplusplus
10477}
10478#endif