blob: 7fec6e50bef7172c9a90671d0c4460478fa65c3d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Lookup table flagging the ASCII code points (0..127) that count as
   whitespace: entry i is 1 when code point i is whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: CHARACTER TABULATION, LINE FEED, LINE TABULATION,
       FORM FEED and CARRIAGE RETURN (0x09 - 0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE, GROUP, RECORD and UNIT SEPARATOR (0x1C - 0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Lookup table flagging the ASCII code points (0..127) that are line
   breaks: entry i is 1 when code point i is a line break, 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: LINE FEED, LINE TABULATION, FORM FEED and
       CARRIAGE RETURN (0x0A - 0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE, GROUP and RECORD SEPARATOR (0x1C - 0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
204
205/* the linebreak mask is set up by Unicode_Init below */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Non-zero when `chr` is a member of `set`: the cheap bloom test on
   `mask` runs first and the exact unicode_member() search only runs
   when it passes.  The whole expansion is parenthesised so the macro
   can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
309/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000310 Ux0000 terminated; some code (e.g. new_identifier)
311 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
Walter Dörwald346737f2007-05-31 10:44:43 +0000681static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000684{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 *fmt++ = '%';
686 if (width) {
687 if (zeropad)
688 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width);
690 }
691 if (precision)
692 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag)
694 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702#else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706#endif
707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000708 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T;
710 while (*f)
711 *fmt++ = *f++;
712 }
713 *fmt++ = c;
714 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000715}
716
Walter Dörwaldd2034312007-05-18 16:29:38 +0000717#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
718
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000719/* size of fixed-size buffer for formatting single arguments */
720#define ITEM_BUFFER_LEN 21
721/* maximum number of characters required for output of %ld. 21 characters
722 allows for 64-bit integers (in decimal) and an optional sign. */
723#define MAX_LONG_CHARS 21
724/* maximum number of characters required for output of %lld.
725 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
726 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
727#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
728
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729PyObject *
730PyUnicode_FromFormatV(const char *format, va_list vargs)
731{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000732 va_list count;
733 Py_ssize_t callcount = 0;
734 PyObject **callresults = NULL;
735 PyObject **callresult = NULL;
736 Py_ssize_t n = 0;
737 int width = 0;
738 int precision = 0;
739 int zeropad;
740 const char* f;
741 Py_UNICODE *s;
742 PyObject *string;
743 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000744 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000745 /* use abuffer instead of buffer, if we need more space
746 * (which can happen if there's a format specifier with width). */
747 char *abuffer = NULL;
748 char *realbuffer;
749 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000750 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000753 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000754 /* step 1: count the number of %S/%R/%A/%s format specifications
755 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
756 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
757 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000759 if (*f == '%') {
760 if (*(f+1)=='%')
761 continue;
Victor Stinner2512a8b2011-03-01 22:46:52 +0000762 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000763 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000764 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000765 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000766 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000767 ;
768 if (*f == 's')
769 ++callcount;
770 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000771 else if (128 <= (unsigned char)*f) {
772 PyErr_Format(PyExc_ValueError,
773 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000774 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000775 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000776 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000777 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000778 }
779 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000780 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000781 if (callcount) {
782 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
783 if (!callresults) {
784 PyErr_NoMemory();
785 return NULL;
786 }
787 callresult = callresults;
788 }
789 /* step 3: figure out how large a buffer we need */
790 for (f = format; *f; f++) {
791 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000792#ifdef HAVE_LONG_LONG
793 int longlongflag = 0;
794#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 const char* p = f;
796 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000797 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000799 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000801
Benjamin Peterson14339b62009-01-31 16:36:08 +0000802 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
803 * they don't affect the amount of space we reserve.
804 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000805 if (*f == 'l') {
806 if (f[1] == 'd' || f[1] == 'u') {
807 ++f;
808 }
809#ifdef HAVE_LONG_LONG
810 else if (f[1] == 'l' &&
811 (f[2] == 'd' || f[2] == 'u')) {
812 longlongflag = 1;
813 f += 2;
814 }
815#endif
816 }
817 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000818 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820
Benjamin Peterson14339b62009-01-31 16:36:08 +0000821 switch (*f) {
822 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000823 {
824#ifndef Py_UNICODE_WIDE
825 int ordinal = va_arg(count, int);
826 if (ordinal > 0xffff)
827 n += 2;
828 else
829 n++;
830#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000831 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000832 n++;
833#endif
834 break;
835 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000836 case '%':
837 n++;
838 break;
839 case 'd': case 'u': case 'i': case 'x':
840 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000841#ifdef HAVE_LONG_LONG
842 if (longlongflag) {
843 if (width < MAX_LONG_LONG_CHARS)
844 width = MAX_LONG_LONG_CHARS;
845 }
846 else
847#endif
848 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
849 including sign. Decimal takes the most space. This
850 isn't enough for octal. If a width is specified we
851 need more (which we allocate later). */
852 if (width < MAX_LONG_CHARS)
853 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000854 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000855 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 if (abuffersize < width)
857 abuffersize = width;
858 break;
859 case 's':
860 {
861 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000862 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000863 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
864 if (!str)
865 goto fail;
866 n += PyUnicode_GET_SIZE(str);
867 /* Remember the str and switch to the next slot */
868 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000869 break;
870 }
871 case 'U':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 assert(obj && PyUnicode_Check(obj));
875 n += PyUnicode_GET_SIZE(obj);
876 break;
877 }
878 case 'V':
879 {
880 PyObject *obj = va_arg(count, PyObject *);
881 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000882 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000883 assert(obj || str);
884 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000885 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000886 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000887 *callresult++ = NULL;
888 }
889 else {
890 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
891 if (!str_obj)
892 goto fail;
893 n += PyUnicode_GET_SIZE(str_obj);
894 *callresult++ = str_obj;
895 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000896 break;
897 }
898 case 'S':
899 {
900 PyObject *obj = va_arg(count, PyObject *);
901 PyObject *str;
902 assert(obj);
903 str = PyObject_Str(obj);
904 if (!str)
905 goto fail;
906 n += PyUnicode_GET_SIZE(str);
907 /* Remember the str and switch to the next slot */
908 *callresult++ = str;
909 break;
910 }
911 case 'R':
912 {
913 PyObject *obj = va_arg(count, PyObject *);
914 PyObject *repr;
915 assert(obj);
916 repr = PyObject_Repr(obj);
917 if (!repr)
918 goto fail;
919 n += PyUnicode_GET_SIZE(repr);
920 /* Remember the repr and switch to the next slot */
921 *callresult++ = repr;
922 break;
923 }
924 case 'A':
925 {
926 PyObject *obj = va_arg(count, PyObject *);
927 PyObject *ascii;
928 assert(obj);
929 ascii = PyObject_ASCII(obj);
930 if (!ascii)
931 goto fail;
932 n += PyUnicode_GET_SIZE(ascii);
933 /* Remember the repr and switch to the next slot */
934 *callresult++ = ascii;
935 break;
936 }
937 case 'p':
938 (void) va_arg(count, int);
939 /* maximum 64-bit pointer representation:
940 * 0xffffffffffffffff
941 * so 19 characters is enough.
942 * XXX I count 18 -- what's the extra for?
943 */
944 n += 19;
945 break;
946 default:
947 /* if we stumble upon an unknown
948 formatting code, copy the rest of
949 the format string to the output
950 string. (we cannot just skip the
951 code, since there's no way to know
952 what's in the argument list) */
953 n += strlen(p);
954 goto expand;
955 }
956 } else
957 n++;
958 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000959 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000960 if (abuffersize > ITEM_BUFFER_LEN) {
961 /* add 1 for sprintf's trailing null byte */
962 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000963 if (!abuffer) {
964 PyErr_NoMemory();
965 goto fail;
966 }
967 realbuffer = abuffer;
968 }
969 else
970 realbuffer = buffer;
971 /* step 4: fill the buffer */
972 /* Since we've analyzed how much space we need for the worst case,
973 we don't have to resize the string.
974 There can be no errors beyond this point. */
975 string = PyUnicode_FromUnicode(NULL, n);
976 if (!string)
977 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000978
Benjamin Peterson14339b62009-01-31 16:36:08 +0000979 s = PyUnicode_AS_UNICODE(string);
980 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000981
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 for (f = format; *f; f++) {
983 if (*f == '%') {
984 const char* p = f++;
985 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000986 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000987 int size_tflag = 0;
988 zeropad = (*f == '0');
989 /* parse the width.precision part */
990 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000991 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000992 width = (width*10) + *f++ - '0';
993 precision = 0;
994 if (*f == '.') {
995 f++;
David Malcolm96960882010-11-05 17:23:41 +0000996 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000997 precision = (precision*10) + *f++ - '0';
998 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999 /* Handle %ld, %lu, %lld and %llu. */
1000 if (*f == 'l') {
1001 if (f[1] == 'd' || f[1] == 'u') {
1002 longflag = 1;
1003 ++f;
1004 }
1005#ifdef HAVE_LONG_LONG
1006 else if (f[1] == 'l' &&
1007 (f[2] == 'd' || f[2] == 'u')) {
1008 longlongflag = 1;
1009 f += 2;
1010 }
1011#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001012 }
1013 /* handle the size_t flag. */
1014 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1015 size_tflag = 1;
1016 ++f;
1017 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001018
Benjamin Peterson14339b62009-01-31 16:36:08 +00001019 switch (*f) {
1020 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001021 {
1022 int ordinal = va_arg(vargs, int);
1023#ifndef Py_UNICODE_WIDE
1024 if (ordinal > 0xffff) {
1025 ordinal -= 0x10000;
1026 *s++ = 0xD800 | (ordinal >> 10);
1027 *s++ = 0xDC00 | (ordinal & 0x3FF);
1028 } else
1029#endif
1030 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001031 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001032 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1035 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001036 if (longflag)
1037 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001038#ifdef HAVE_LONG_LONG
1039 else if (longlongflag)
1040 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1041#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001042 else if (size_tflag)
1043 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1044 else
1045 sprintf(realbuffer, fmt, va_arg(vargs, int));
1046 appendstring(realbuffer);
1047 break;
1048 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1050 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 if (longflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001053#ifdef HAVE_LONG_LONG
1054 else if (longlongflag)
1055 sprintf(realbuffer, fmt, va_arg(vargs,
1056 unsigned PY_LONG_LONG));
1057#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001058 else if (size_tflag)
1059 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1060 else
1061 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1062 appendstring(realbuffer);
1063 break;
1064 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001065 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001066 sprintf(realbuffer, fmt, va_arg(vargs, int));
1067 appendstring(realbuffer);
1068 break;
1069 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001070 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001071 sprintf(realbuffer, fmt, va_arg(vargs, int));
1072 appendstring(realbuffer);
1073 break;
1074 case 's':
1075 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001076 /* unused, since we already have the result */
1077 (void) va_arg(vargs, char *);
1078 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1079 PyUnicode_GET_SIZE(*callresult));
1080 s += PyUnicode_GET_SIZE(*callresult);
1081 /* We're done with the unicode()/repr() => forget it */
1082 Py_DECREF(*callresult);
1083 /* switch to next unicode()/repr() result */
1084 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001085 break;
1086 }
1087 case 'U':
1088 {
1089 PyObject *obj = va_arg(vargs, PyObject *);
1090 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1091 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1092 s += size;
1093 break;
1094 }
1095 case 'V':
1096 {
1097 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001098 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001099 if (obj) {
1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102 s += size;
1103 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001104 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1105 PyUnicode_GET_SIZE(*callresult));
1106 s += PyUnicode_GET_SIZE(*callresult);
1107 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001108 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001109 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001110 break;
1111 }
1112 case 'S':
1113 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001114 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001115 {
1116 Py_UNICODE *ucopy;
1117 Py_ssize_t usize;
1118 Py_ssize_t upos;
1119 /* unused, since we already have the result */
1120 (void) va_arg(vargs, PyObject *);
1121 ucopy = PyUnicode_AS_UNICODE(*callresult);
1122 usize = PyUnicode_GET_SIZE(*callresult);
1123 for (upos = 0; upos<usize;)
1124 *s++ = ucopy[upos++];
1125 /* We're done with the unicode()/repr() => forget it */
1126 Py_DECREF(*callresult);
1127 /* switch to next unicode()/repr() result */
1128 ++callresult;
1129 break;
1130 }
1131 case 'p':
1132 sprintf(buffer, "%p", va_arg(vargs, void*));
1133 /* %p is ill-defined: ensure leading 0x. */
1134 if (buffer[1] == 'X')
1135 buffer[1] = 'x';
1136 else if (buffer[1] != 'x') {
1137 memmove(buffer+2, buffer, strlen(buffer)+1);
1138 buffer[0] = '0';
1139 buffer[1] = 'x';
1140 }
1141 appendstring(buffer);
1142 break;
1143 case '%':
1144 *s++ = '%';
1145 break;
1146 default:
1147 appendstring(p);
1148 goto end;
1149 }
Victor Stinner1205f272010-09-11 00:54:47 +00001150 }
Victor Stinner1205f272010-09-11 00:54:47 +00001151 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 *s++ = *f;
1153 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001154
Benjamin Peterson29060642009-01-31 22:14:21 +00001155 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001156 if (callresults)
1157 PyObject_Free(callresults);
1158 if (abuffer)
1159 PyObject_Free(abuffer);
1160 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1161 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001162 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001163 if (callresults) {
1164 PyObject **callresult2 = callresults;
1165 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001166 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001167 ++callresult2;
1168 }
1169 PyObject_Free(callresults);
1170 }
1171 if (abuffer)
1172 PyObject_Free(abuffer);
1173 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001174}
1175
1176#undef appendstring
1177
1178PyObject *
1179PyUnicode_FromFormat(const char *format, ...)
1180{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001181 PyObject* ret;
1182 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001183
1184#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001185 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001186#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001187 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001188#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001189 ret = PyUnicode_FromFormatV(format, vargs);
1190 va_end(vargs);
1191 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001192}
1193
Victor Stinner5593d8a2010-10-02 11:11:27 +00001194/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1195 convert a Unicode object to a wide character string.
1196
1197 - If w is NULL: return the number of wide characters (including the nul
1198 character) required to convert the unicode object. Ignore size argument.
1199
1200 - Otherwise: return the number of wide characters (excluding the nul
1201 character) written into w. Write at most size wide characters (including
1202 the nul character). */
1203static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001204unicode_aswidechar(PyUnicodeObject *unicode,
1205 wchar_t *w,
1206 Py_ssize_t size)
1207{
1208#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001209 Py_ssize_t res;
1210 if (w != NULL) {
1211 res = PyUnicode_GET_SIZE(unicode);
1212 if (size > res)
1213 size = res + 1;
1214 else
1215 res = size;
1216 memcpy(w, unicode->str, size * sizeof(wchar_t));
1217 return res;
1218 }
1219 else
1220 return PyUnicode_GET_SIZE(unicode) + 1;
1221#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1222 register const Py_UNICODE *u;
1223 const Py_UNICODE *uend;
1224 const wchar_t *worig, *wend;
1225 Py_ssize_t nchar;
1226
Victor Stinner137c34c2010-09-29 10:25:54 +00001227 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001228 uend = u + PyUnicode_GET_SIZE(unicode);
1229 if (w != NULL) {
1230 worig = w;
1231 wend = w + size;
1232 while (u != uend && w != wend) {
1233 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1234 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1235 {
1236 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1237 u += 2;
1238 }
1239 else {
1240 *w = *u;
1241 u++;
1242 }
1243 w++;
1244 }
1245 if (w != wend)
1246 *w = L'\0';
1247 return w - worig;
1248 }
1249 else {
1250 nchar = 1; /* nul character at the end */
1251 while (u != uend) {
1252 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1253 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1254 u += 2;
1255 else
1256 u++;
1257 nchar++;
1258 }
1259 }
1260 return nchar;
1261#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1262 register Py_UNICODE *u, *uend, ordinal;
1263 register Py_ssize_t i;
1264 wchar_t *worig, *wend;
1265 Py_ssize_t nchar;
1266
1267 u = PyUnicode_AS_UNICODE(unicode);
1268 uend = u + PyUnicode_GET_SIZE(u);
1269 if (w != NULL) {
1270 worig = w;
1271 wend = w + size;
1272 while (u != uend && w != wend) {
1273 ordinal = *u;
1274 if (ordinal > 0xffff) {
1275 ordinal -= 0x10000;
1276 *w++ = 0xD800 | (ordinal >> 10);
1277 *w++ = 0xDC00 | (ordinal & 0x3FF);
1278 }
1279 else
1280 *w++ = ordinal;
1281 u++;
1282 }
1283 if (w != wend)
1284 *w = 0;
1285 return w - worig;
1286 }
1287 else {
1288 nchar = 1; /* nul character */
1289 while (u != uend) {
1290 if (*u > 0xffff)
1291 nchar += 2;
1292 else
1293 nchar++;
1294 u++;
1295 }
1296 return nchar;
1297 }
1298#else
1299# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001300#endif
1301}
1302
1303Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001304PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001305 wchar_t *w,
1306 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307{
1308 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001309 PyErr_BadInternalCall();
1310 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001312 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313}
1314
Victor Stinner137c34c2010-09-29 10:25:54 +00001315wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001316PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001317 Py_ssize_t *size)
1318{
1319 wchar_t* buffer;
1320 Py_ssize_t buflen;
1321
1322 if (unicode == NULL) {
1323 PyErr_BadInternalCall();
1324 return NULL;
1325 }
1326
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001327 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001328 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001329 PyErr_NoMemory();
1330 return NULL;
1331 }
1332
Victor Stinner137c34c2010-09-29 10:25:54 +00001333 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1334 if (buffer == NULL) {
1335 PyErr_NoMemory();
1336 return NULL;
1337 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001338 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001339 if (size != NULL)
1340 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001341 return buffer;
1342}
1343
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344#endif
1345
Alexander Belopolsky40018472011-02-26 01:02:56 +00001346PyObject *
1347PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001348{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001349 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001350
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001351 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001352 PyErr_SetString(PyExc_ValueError,
1353 "chr() arg not in range(0x110000)");
1354 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001355 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001356
1357#ifndef Py_UNICODE_WIDE
1358 if (ordinal > 0xffff) {
1359 ordinal -= 0x10000;
1360 s[0] = 0xD800 | (ordinal >> 10);
1361 s[1] = 0xDC00 | (ordinal & 0x3FF);
1362 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001363 }
1364#endif
1365
Hye-Shik Chang40574832004-04-06 07:24:51 +00001366 s[0] = (Py_UNICODE)ordinal;
1367 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001368}
1369
Alexander Belopolsky40018472011-02-26 01:02:56 +00001370PyObject *
1371PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001373 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 Py_INCREF(obj);
1377 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001378 }
1379 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 /* For a Unicode subtype that's not a Unicode object,
1381 return a true Unicode object with the same data. */
1382 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1383 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001384 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001385 PyErr_Format(PyExc_TypeError,
1386 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001387 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001388 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001389}
1390
Alexander Belopolsky40018472011-02-26 01:02:56 +00001391PyObject *
1392PyUnicode_FromEncodedObject(register PyObject *obj,
1393 const char *encoding,
1394 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001395{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001396 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001397 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001398
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001400 PyErr_BadInternalCall();
1401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001402 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001403
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001404 /* Decoding bytes objects is the most common case and should be fast */
1405 if (PyBytes_Check(obj)) {
1406 if (PyBytes_GET_SIZE(obj) == 0) {
1407 Py_INCREF(unicode_empty);
1408 v = (PyObject *) unicode_empty;
1409 }
1410 else {
1411 v = PyUnicode_Decode(
1412 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1413 encoding, errors);
1414 }
1415 return v;
1416 }
1417
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001418 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001419 PyErr_SetString(PyExc_TypeError,
1420 "decoding str is not supported");
1421 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001422 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001423
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001424 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1425 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1426 PyErr_Format(PyExc_TypeError,
1427 "coercing to str: need bytes, bytearray "
1428 "or buffer-like object, %.80s found",
1429 Py_TYPE(obj)->tp_name);
1430 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001431 }
Tim Petersced69f82003-09-16 20:30:58 +00001432
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001433 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001434 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001435 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436 }
Tim Petersced69f82003-09-16 20:30:58 +00001437 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001438 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001439
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001440 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001441 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442}
1443
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   are converted to lower case and '_' is replaced with '-', in order to
   catch e.g. UTF_8.  `lower_len` is the size of the `lower` buffer,
   including room for the terminating nul byte.

   Return 0 on error (encoding is longer than lower_len-1), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last usable position: reserved for the terminating nul byte */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1476
Alexander Belopolsky40018472011-02-26 01:02:56 +00001477PyObject *
1478PyUnicode_Decode(const char *s,
1479 Py_ssize_t size,
1480 const char *encoding,
1481 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001482{
1483 PyObject *buffer = NULL, *unicode;
1484 Py_buffer info;
1485 char lower[11]; /* Enough for any encoding shortcut */
1486
1487 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001488 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001489
1490 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001491 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001492 if ((strcmp(lower, "utf-8") == 0) ||
1493 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001494 return PyUnicode_DecodeUTF8(s, size, errors);
1495 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001496 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001497 (strcmp(lower, "iso-8859-1") == 0))
1498 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001499#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001500 else if (strcmp(lower, "mbcs") == 0)
1501 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001502#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001503 else if (strcmp(lower, "ascii") == 0)
1504 return PyUnicode_DecodeASCII(s, size, errors);
1505 else if (strcmp(lower, "utf-16") == 0)
1506 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1507 else if (strcmp(lower, "utf-32") == 0)
1508 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510
1511 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001512 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001513 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001514 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001515 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 if (buffer == NULL)
1517 goto onError;
1518 unicode = PyCodec_Decode(buffer, encoding, errors);
1519 if (unicode == NULL)
1520 goto onError;
1521 if (!PyUnicode_Check(unicode)) {
1522 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001523 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001524 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001525 Py_DECREF(unicode);
1526 goto onError;
1527 }
1528 Py_DECREF(buffer);
1529 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 Py_XDECREF(buffer);
1533 return NULL;
1534}
1535
Alexander Belopolsky40018472011-02-26 01:02:56 +00001536PyObject *
1537PyUnicode_AsDecodedObject(PyObject *unicode,
1538 const char *encoding,
1539 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001540{
1541 PyObject *v;
1542
1543 if (!PyUnicode_Check(unicode)) {
1544 PyErr_BadArgument();
1545 goto onError;
1546 }
1547
1548 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001549 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001550
1551 /* Decode via the codec registry */
1552 v = PyCodec_Decode(unicode, encoding, errors);
1553 if (v == NULL)
1554 goto onError;
1555 return v;
1556
Benjamin Peterson29060642009-01-31 22:14:21 +00001557 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001558 return NULL;
1559}
1560
Alexander Belopolsky40018472011-02-26 01:02:56 +00001561PyObject *
1562PyUnicode_AsDecodedUnicode(PyObject *unicode,
1563 const char *encoding,
1564 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001565{
1566 PyObject *v;
1567
1568 if (!PyUnicode_Check(unicode)) {
1569 PyErr_BadArgument();
1570 goto onError;
1571 }
1572
1573 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575
1576 /* Decode via the codec registry */
1577 v = PyCodec_Decode(unicode, encoding, errors);
1578 if (v == NULL)
1579 goto onError;
1580 if (!PyUnicode_Check(v)) {
1581 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001582 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001583 Py_TYPE(v)->tp_name);
1584 Py_DECREF(v);
1585 goto onError;
1586 }
1587 return v;
1588
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001590 return NULL;
1591}
1592
Alexander Belopolsky40018472011-02-26 01:02:56 +00001593PyObject *
1594PyUnicode_Encode(const Py_UNICODE *s,
1595 Py_ssize_t size,
1596 const char *encoding,
1597 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598{
1599 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001600
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 unicode = PyUnicode_FromUnicode(s, size);
1602 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001604 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1605 Py_DECREF(unicode);
1606 return v;
1607}
1608
Alexander Belopolsky40018472011-02-26 01:02:56 +00001609PyObject *
1610PyUnicode_AsEncodedObject(PyObject *unicode,
1611 const char *encoding,
1612 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001613{
1614 PyObject *v;
1615
1616 if (!PyUnicode_Check(unicode)) {
1617 PyErr_BadArgument();
1618 goto onError;
1619 }
1620
1621 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001623
1624 /* Encode via the codec registry */
1625 v = PyCodec_Encode(unicode, encoding, errors);
1626 if (v == NULL)
1627 goto onError;
1628 return v;
1629
Benjamin Peterson29060642009-01-31 22:14:21 +00001630 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001631 return NULL;
1632}
1633
Victor Stinnerad158722010-10-27 00:25:46 +00001634PyObject *
1635PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001636{
Victor Stinner313a1202010-06-11 23:56:51 +00001637#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001638 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1639 PyUnicode_GET_SIZE(unicode),
1640 NULL);
1641#elif defined(__APPLE__)
1642 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1643 PyUnicode_GET_SIZE(unicode),
1644 "surrogateescape");
1645#else
1646 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001647 return PyUnicode_AsEncodedString(unicode,
1648 Py_FileSystemDefaultEncoding,
1649 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001650 }
1651 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001652 /* locale encoding with surrogateescape */
1653 wchar_t *wchar;
1654 char *bytes;
1655 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001656 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001657
1658 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1659 if (wchar == NULL)
1660 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001661 bytes = _Py_wchar2char(wchar, &error_pos);
1662 if (bytes == NULL) {
1663 if (error_pos != (size_t)-1) {
1664 char *errmsg = strerror(errno);
1665 PyObject *exc = NULL;
1666 if (errmsg == NULL)
1667 errmsg = "Py_wchar2char() failed";
1668 raise_encode_exception(&exc,
1669 "filesystemencoding",
1670 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1671 error_pos, error_pos+1,
1672 errmsg);
1673 Py_XDECREF(exc);
1674 }
1675 else
1676 PyErr_NoMemory();
1677 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001678 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001679 }
1680 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001681
1682 bytes_obj = PyBytes_FromString(bytes);
1683 PyMem_Free(bytes);
1684 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001685 }
Victor Stinnerad158722010-10-27 00:25:46 +00001686#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001687}
1688
Alexander Belopolsky40018472011-02-26 01:02:56 +00001689PyObject *
1690PyUnicode_AsEncodedString(PyObject *unicode,
1691 const char *encoding,
1692 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693{
1694 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001695 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001696
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697 if (!PyUnicode_Check(unicode)) {
1698 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 }
Fred Drakee4315f52000-05-09 19:53:39 +00001701
Tim Petersced69f82003-09-16 20:30:58 +00001702 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001703 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1704 PyUnicode_GET_SIZE(unicode),
1705 errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001706
1707 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001708 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001709 if ((strcmp(lower, "utf-8") == 0) ||
1710 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001711 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1712 PyUnicode_GET_SIZE(unicode),
1713 errors);
1714 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001715 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001716 (strcmp(lower, "iso-8859-1") == 0))
1717 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1718 PyUnicode_GET_SIZE(unicode),
1719 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001720#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001721 else if (strcmp(lower, "mbcs") == 0)
1722 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1723 PyUnicode_GET_SIZE(unicode),
1724 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001725#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001726 else if (strcmp(lower, "ascii") == 0)
1727 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1728 PyUnicode_GET_SIZE(unicode),
1729 errors);
1730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731
1732 /* Encode via the codec registry */
1733 v = PyCodec_Encode(unicode, encoding, errors);
1734 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001735 return NULL;
1736
1737 /* The normal path */
1738 if (PyBytes_Check(v))
1739 return v;
1740
1741 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001742 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001743 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001744 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001745
1746 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1747 "encoder %s returned bytearray instead of bytes",
1748 encoding);
1749 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001750 Py_DECREF(v);
1751 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001752 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001753
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001754 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1755 Py_DECREF(v);
1756 return b;
1757 }
1758
1759 PyErr_Format(PyExc_TypeError,
1760 "encoder did not return a bytes object (type=%.400s)",
1761 Py_TYPE(v)->tp_name);
1762 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001763 return NULL;
1764}
1765
Alexander Belopolsky40018472011-02-26 01:02:56 +00001766PyObject *
1767PyUnicode_AsEncodedUnicode(PyObject *unicode,
1768 const char *encoding,
1769 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001770{
1771 PyObject *v;
1772
1773 if (!PyUnicode_Check(unicode)) {
1774 PyErr_BadArgument();
1775 goto onError;
1776 }
1777
1778 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001780
1781 /* Encode via the codec registry */
1782 v = PyCodec_Encode(unicode, encoding, errors);
1783 if (v == NULL)
1784 goto onError;
1785 if (!PyUnicode_Check(v)) {
1786 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001787 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001788 Py_TYPE(v)->tp_name);
1789 Py_DECREF(v);
1790 goto onError;
1791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001793
Benjamin Peterson29060642009-01-31 22:14:21 +00001794 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001795 return NULL;
1796}
1797
Alexander Belopolsky40018472011-02-26 01:02:56 +00001798PyObject *
1799_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1800 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001801{
1802 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001803 if (v)
1804 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001805 if (errors != NULL)
1806 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001807 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001808 PyUnicode_GET_SIZE(unicode),
1809 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001810 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001811 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001812 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001813 return v;
1814}
1815
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001816PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001817PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001818 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001819 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1820}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001821
Christian Heimes5894ba72007-11-04 11:43:14 +00001822PyObject*
1823PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1824{
Victor Stinnerad158722010-10-27 00:25:46 +00001825#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1826 return PyUnicode_DecodeMBCS(s, size, NULL);
1827#elif defined(__APPLE__)
1828 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1829#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001830 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1831 can be undefined. If it is case, decode using UTF-8. The following assumes
1832 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1833 bootstrapping process where the codecs aren't ready yet.
1834 */
1835 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001836 return PyUnicode_Decode(s, size,
1837 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001838 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001839 }
1840 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001841 /* locale encoding with surrogateescape */
1842 wchar_t *wchar;
1843 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001844 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001845
1846 if (s[size] != '\0' || size != strlen(s)) {
1847 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1848 return NULL;
1849 }
1850
Victor Stinner168e1172010-10-16 23:16:16 +00001851 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001852 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001853 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001854
Victor Stinner168e1172010-10-16 23:16:16 +00001855 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001856 PyMem_Free(wchar);
1857 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001858 }
Victor Stinnerad158722010-10-27 00:25:46 +00001859#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001860}
1861
Martin v. Löwis011e8422009-05-05 04:43:17 +00001862
1863int
1864PyUnicode_FSConverter(PyObject* arg, void* addr)
1865{
1866 PyObject *output = NULL;
1867 Py_ssize_t size;
1868 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001869 if (arg == NULL) {
1870 Py_DECREF(*(PyObject**)addr);
1871 return 1;
1872 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001873 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001874 output = arg;
1875 Py_INCREF(output);
1876 }
1877 else {
1878 arg = PyUnicode_FromObject(arg);
1879 if (!arg)
1880 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001881 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001882 Py_DECREF(arg);
1883 if (!output)
1884 return 0;
1885 if (!PyBytes_Check(output)) {
1886 Py_DECREF(output);
1887 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1888 return 0;
1889 }
1890 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001891 size = PyBytes_GET_SIZE(output);
1892 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001893 if (size != strlen(data)) {
1894 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1895 Py_DECREF(output);
1896 return 0;
1897 }
1898 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001899 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001900}
1901
1902
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001903int
1904PyUnicode_FSDecoder(PyObject* arg, void* addr)
1905{
1906 PyObject *output = NULL;
1907 Py_ssize_t size;
1908 void *data;
1909 if (arg == NULL) {
1910 Py_DECREF(*(PyObject**)addr);
1911 return 1;
1912 }
1913 if (PyUnicode_Check(arg)) {
1914 output = arg;
1915 Py_INCREF(output);
1916 }
1917 else {
1918 arg = PyBytes_FromObject(arg);
1919 if (!arg)
1920 return 0;
1921 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1922 PyBytes_GET_SIZE(arg));
1923 Py_DECREF(arg);
1924 if (!output)
1925 return 0;
1926 if (!PyUnicode_Check(output)) {
1927 Py_DECREF(output);
1928 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1929 return 0;
1930 }
1931 }
1932 size = PyUnicode_GET_SIZE(output);
1933 data = PyUnicode_AS_UNICODE(output);
1934 if (size != Py_UNICODE_strlen(data)) {
1935 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1936 Py_DECREF(output);
1937 return 0;
1938 }
1939 *(PyObject**)addr = output;
1940 return Py_CLEANUP_SUPPORTED;
1941}
1942
1943
Martin v. Löwis5b222132007-06-10 09:51:05 +00001944char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001945_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001946{
Christian Heimesf3863112007-11-22 07:46:41 +00001947 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001948 if (!PyUnicode_Check(unicode)) {
1949 PyErr_BadArgument();
1950 return NULL;
1951 }
Christian Heimesf3863112007-11-22 07:46:41 +00001952 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1953 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001954 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001955 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001956 *psize = PyBytes_GET_SIZE(bytes);
1957 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001958}
1959
1960char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001961_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001962{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001963 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001964}
1965
Alexander Belopolsky40018472011-02-26 01:02:56 +00001966Py_UNICODE *
1967PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968{
1969 if (!PyUnicode_Check(unicode)) {
1970 PyErr_BadArgument();
1971 goto onError;
1972 }
1973 return PyUnicode_AS_UNICODE(unicode);
1974
Benjamin Peterson29060642009-01-31 22:14:21 +00001975 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 return NULL;
1977}
1978
Alexander Belopolsky40018472011-02-26 01:02:56 +00001979Py_ssize_t
1980PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981{
1982 if (!PyUnicode_Check(unicode)) {
1983 PyErr_BadArgument();
1984 goto onError;
1985 }
1986 return PyUnicode_GET_SIZE(unicode);
1987
Benjamin Peterson29060642009-01-31 22:14:21 +00001988 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 return -1;
1990}
1991
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1997
Victor Stinner554f3f02010-06-16 23:33:54 +00001998/* create or adjust a UnicodeDecodeError */
1999static void
2000make_decode_exception(PyObject **exceptionObject,
2001 const char *encoding,
2002 const char *input, Py_ssize_t length,
2003 Py_ssize_t startpos, Py_ssize_t endpos,
2004 const char *reason)
2005{
2006 if (*exceptionObject == NULL) {
2007 *exceptionObject = PyUnicodeDecodeError_Create(
2008 encoding, input, length, startpos, endpos, reason);
2009 }
2010 else {
2011 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2012 goto onError;
2013 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2014 goto onError;
2015 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2016 goto onError;
2017 }
2018 return;
2019
2020onError:
2021 Py_DECREF(*exceptionObject);
2022 *exceptionObject = NULL;
2023}
2024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002025/* error handling callback helper:
2026 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002027 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 and adjust various state variables.
2029 return 0 on success, -1 on error
2030*/
2031
Alexander Belopolsky40018472011-02-26 01:02:56 +00002032static int
2033unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2034 const char *encoding, const char *reason,
2035 const char **input, const char **inend, Py_ssize_t *startinpos,
2036 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2037 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002039 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002040
2041 PyObject *restuple = NULL;
2042 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002043 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002044 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002045 Py_ssize_t requiredsize;
2046 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002047 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002048 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002049 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 int res = -1;
2051
2052 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002053 *errorHandler = PyCodec_LookupError(errors);
2054 if (*errorHandler == NULL)
2055 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 }
2057
Victor Stinner554f3f02010-06-16 23:33:54 +00002058 make_decode_exception(exceptionObject,
2059 encoding,
2060 *input, *inend - *input,
2061 *startinpos, *endinpos,
2062 reason);
2063 if (*exceptionObject == NULL)
2064 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002065
2066 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2067 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002068 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002070 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 }
2073 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002074 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002075
2076 /* Copy back the bytes variables, which might have been modified by the
2077 callback */
2078 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2079 if (!inputobj)
2080 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002081 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002083 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002084 *input = PyBytes_AS_STRING(inputobj);
2085 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002086 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002087 /* we can DECREF safely, as the exception has another reference,
2088 so the object won't go away. */
2089 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002090
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002091 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002092 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002093 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002094 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2095 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002096 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002097
2098 /* need more space? (at least enough for what we
2099 have+the replacement+the rest of the string (starting
2100 at the new input position), so we won't have to check space
2101 when there are no errors in the rest of the string) */
2102 repptr = PyUnicode_AS_UNICODE(repunicode);
2103 repsize = PyUnicode_GET_SIZE(repunicode);
2104 requiredsize = *outpos + repsize + insize-newpos;
2105 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002106 if (requiredsize<2*outsize)
2107 requiredsize = 2*outsize;
2108 if (_PyUnicode_Resize(output, requiredsize) < 0)
2109 goto onError;
2110 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111 }
2112 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002113 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002114 Py_UNICODE_COPY(*outptr, repptr, repsize);
2115 *outptr += repsize;
2116 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002117
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002118 /* we made it! */
2119 res = 0;
2120
Benjamin Peterson29060642009-01-31 22:14:21 +00002121 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 Py_XDECREF(restuple);
2123 return res;
2124}
2125
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup and is therefore declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that a
 * compound argument such as `a || b` cannot re-associate with the
 * surrounding `&&`.  Note that c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206
Alexander Belopolsky40018472011-02-26 01:02:56 +00002207PyObject *
2208PyUnicode_DecodeUTF7(const char *s,
2209 Py_ssize_t size,
2210 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002211{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002212 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2213}
2214
Antoine Pitrou244651a2009-05-04 18:56:13 +00002215/* The decoder. The only state we preserve is our read position,
2216 * i.e. how many characters we have consumed. So if we end in the
2217 * middle of a shift sequence we have to back off the read position
2218 * and the output to the beginning of the sequence, otherwise we lose
2219 * all the shift state (seen bits, number of bits seen, high
2220 * surrogate). */
2221
Alexander Belopolsky40018472011-02-26 01:02:56 +00002222PyObject *
2223PyUnicode_DecodeUTF7Stateful(const char *s,
2224 Py_ssize_t size,
2225 const char *errors,
2226 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002227{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002228 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002229 Py_ssize_t startinpos;
2230 Py_ssize_t endinpos;
2231 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002232 const char *e;
2233 PyUnicodeObject *unicode;
2234 Py_UNICODE *p;
2235 const char *errmsg = "";
2236 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002237 Py_UNICODE *shiftOutStart;
2238 unsigned int base64bits = 0;
2239 unsigned long base64buffer = 0;
2240 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002241 PyObject *errorHandler = NULL;
2242 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002243
2244 unicode = _PyUnicode_New(size);
2245 if (!unicode)
2246 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002247 if (size == 0) {
2248 if (consumed)
2249 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002250 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002251 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002252
2253 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002254 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002255 e = s + size;
2256
2257 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002258 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002259 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002260 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002261
Antoine Pitrou244651a2009-05-04 18:56:13 +00002262 if (inShift) { /* in a base-64 section */
2263 if (IS_BASE64(ch)) { /* consume a base-64 character */
2264 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2265 base64bits += 6;
2266 s++;
2267 if (base64bits >= 16) {
2268 /* we have enough bits for a UTF-16 value */
2269 Py_UNICODE outCh = (Py_UNICODE)
2270 (base64buffer >> (base64bits-16));
2271 base64bits -= 16;
2272 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2273 if (surrogate) {
2274 /* expecting a second surrogate */
2275 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2276#ifdef Py_UNICODE_WIDE
2277 *p++ = (((surrogate & 0x3FF)<<10)
2278 | (outCh & 0x3FF)) + 0x10000;
2279#else
2280 *p++ = surrogate;
2281 *p++ = outCh;
2282#endif
2283 surrogate = 0;
2284 }
2285 else {
2286 surrogate = 0;
2287 errmsg = "second surrogate missing";
2288 goto utf7Error;
2289 }
2290 }
2291 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2292 /* first surrogate */
2293 surrogate = outCh;
2294 }
2295 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2296 errmsg = "unexpected second surrogate";
2297 goto utf7Error;
2298 }
2299 else {
2300 *p++ = outCh;
2301 }
2302 }
2303 }
2304 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002305 inShift = 0;
2306 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002307 if (surrogate) {
2308 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002309 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002310 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002311 if (base64bits > 0) { /* left-over bits */
2312 if (base64bits >= 6) {
2313 /* We've seen at least one base-64 character */
2314 errmsg = "partial character in shift sequence";
2315 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002316 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002317 else {
2318 /* Some bits remain; they should be zero */
2319 if (base64buffer != 0) {
2320 errmsg = "non-zero padding bits in shift sequence";
2321 goto utf7Error;
2322 }
2323 }
2324 }
2325 if (ch != '-') {
2326 /* '-' is absorbed; other terminating
2327 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002328 *p++ = ch;
2329 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002330 }
2331 }
2332 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002333 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002334 s++; /* consume '+' */
2335 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002336 s++;
2337 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002338 }
2339 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002340 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002341 shiftOutStart = p;
2342 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002343 }
2344 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002345 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002346 *p++ = ch;
2347 s++;
2348 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002349 else {
2350 startinpos = s-starts;
2351 s++;
2352 errmsg = "unexpected special character";
2353 goto utf7Error;
2354 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002355 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002356utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002357 outpos = p-PyUnicode_AS_UNICODE(unicode);
2358 endinpos = s-starts;
2359 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002360 errors, &errorHandler,
2361 "utf7", errmsg,
2362 &starts, &e, &startinpos, &endinpos, &exc, &s,
2363 &unicode, &outpos, &p))
2364 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002365 }
2366
Antoine Pitrou244651a2009-05-04 18:56:13 +00002367 /* end of string */
2368
2369 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2370 /* if we're in an inconsistent state, that's an error */
2371 if (surrogate ||
2372 (base64bits >= 6) ||
2373 (base64bits > 0 && base64buffer != 0)) {
2374 outpos = p-PyUnicode_AS_UNICODE(unicode);
2375 endinpos = size;
2376 if (unicode_decode_call_errorhandler(
2377 errors, &errorHandler,
2378 "utf7", "unterminated shift sequence",
2379 &starts, &e, &startinpos, &endinpos, &exc, &s,
2380 &unicode, &outpos, &p))
2381 goto onError;
2382 if (s < e)
2383 goto restart;
2384 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002385 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002386
2387 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002388 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002389 if (inShift) {
2390 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002391 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002392 }
2393 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002394 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002395 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002396 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002397
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002398 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002399 goto onError;
2400
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002401 Py_XDECREF(errorHandler);
2402 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002403 return (PyObject *)unicode;
2404
Benjamin Peterson29060642009-01-31 22:14:21 +00002405 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002406 Py_XDECREF(errorHandler);
2407 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002408 Py_DECREF(unicode);
2409 return NULL;
2410}
2411
2412
Alexander Belopolsky40018472011-02-26 01:02:56 +00002413PyObject *
2414PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2415 Py_ssize_t size,
2416 int base64SetO,
2417 int base64WhiteSpace,
2418 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002419{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002420 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002421 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002422 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002423 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002424 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002425 unsigned int base64bits = 0;
2426 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002427 char * out;
2428 char * start;
2429
2430 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002431 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002432
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002433 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002434 return PyErr_NoMemory();
2435
Antoine Pitrou244651a2009-05-04 18:56:13 +00002436 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002437 if (v == NULL)
2438 return NULL;
2439
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002440 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002441 for (;i < size; ++i) {
2442 Py_UNICODE ch = s[i];
2443
Antoine Pitrou244651a2009-05-04 18:56:13 +00002444 if (inShift) {
2445 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2446 /* shifting out */
2447 if (base64bits) { /* output remaining bits */
2448 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2449 base64buffer = 0;
2450 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002451 }
2452 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002453 /* Characters not in the BASE64 set implicitly unshift the sequence
2454 so no '-' is required, except if the character is itself a '-' */
2455 if (IS_BASE64(ch) || ch == '-') {
2456 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002458 *out++ = (char) ch;
2459 }
2460 else {
2461 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002462 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002463 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002464 else { /* not in a shift sequence */
2465 if (ch == '+') {
2466 *out++ = '+';
2467 *out++ = '-';
2468 }
2469 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2470 *out++ = (char) ch;
2471 }
2472 else {
2473 *out++ = '+';
2474 inShift = 1;
2475 goto encode_char;
2476 }
2477 }
2478 continue;
2479encode_char:
2480#ifdef Py_UNICODE_WIDE
2481 if (ch >= 0x10000) {
2482 /* code first surrogate */
2483 base64bits += 16;
2484 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2485 while (base64bits >= 6) {
2486 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2487 base64bits -= 6;
2488 }
2489 /* prepare second surrogate */
2490 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2491 }
2492#endif
2493 base64bits += 16;
2494 base64buffer = (base64buffer << 16) | ch;
2495 while (base64bits >= 6) {
2496 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2497 base64bits -= 6;
2498 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002499 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002500 if (base64bits)
2501 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2502 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002503 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002504 if (_PyBytes_Resize(&v, out - start) < 0)
2505 return NULL;
2506 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002507}
2508
Antoine Pitrou244651a2009-05-04 18:56:13 +00002509#undef IS_BASE64
2510#undef FROM_BASE64
2511#undef TO_BASE64
2512#undef DECODE_DIRECT
2513#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002514
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515/* --- UTF-8 Codec -------------------------------------------------------- */
2516
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero means the byte is not a legal lead byte.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static const
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2538
Alexander Belopolsky40018472011-02-26 01:02:56 +00002539PyObject *
2540PyUnicode_DecodeUTF8(const char *s,
2541 Py_ssize_t size,
2542 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543{
Walter Dörwald69652032004-09-07 20:24:22 +00002544 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2545}
2546
Antoine Pitrouab868312009-01-10 15:40:25 +00002547/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2548#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2549
2550/* Mask to quickly check whether a C 'long' contains a
2551 non-ASCII, UTF8-encoded char. */
2552#if (SIZEOF_LONG == 8)
2553# define ASCII_CHAR_MASK 0x8080808080808080L
2554#elif (SIZEOF_LONG == 4)
2555# define ASCII_CHAR_MASK 0x80808080L
2556#else
2557# error C 'long' size should be either 4 or 8!
2558#endif
2559
Alexander Belopolsky40018472011-02-26 01:02:56 +00002560PyObject *
2561PyUnicode_DecodeUTF8Stateful(const char *s,
2562 Py_ssize_t size,
2563 const char *errors,
2564 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002565{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002566 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002568 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002569 Py_ssize_t startinpos;
2570 Py_ssize_t endinpos;
2571 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002572 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573 PyUnicodeObject *unicode;
2574 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002575 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002576 PyObject *errorHandler = NULL;
2577 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578
2579 /* Note: size will always be longer than the resulting Unicode
2580 character count */
2581 unicode = _PyUnicode_New(size);
2582 if (!unicode)
2583 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002584 if (size == 0) {
2585 if (consumed)
2586 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589
2590 /* Unpack UTF-8 encoded data */
2591 p = unicode->str;
2592 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002593 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594
2595 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002596 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597
2598 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002599 /* Fast path for runs of ASCII characters. Given that common UTF-8
2600 input will consist of an overwhelming majority of ASCII
2601 characters, we try to optimize for this case by checking
2602 as many characters as a C 'long' can contain.
2603 First, check if we can do an aligned read, as most CPUs have
2604 a penalty for unaligned reads.
2605 */
2606 if (!((size_t) s & LONG_PTR_MASK)) {
2607 /* Help register allocation */
2608 register const char *_s = s;
2609 register Py_UNICODE *_p = p;
2610 while (_s < aligned_end) {
2611 /* Read a whole long at a time (either 4 or 8 bytes),
2612 and do a fast unrolled copy if it only contains ASCII
2613 characters. */
2614 unsigned long data = *(unsigned long *) _s;
2615 if (data & ASCII_CHAR_MASK)
2616 break;
2617 _p[0] = (unsigned char) _s[0];
2618 _p[1] = (unsigned char) _s[1];
2619 _p[2] = (unsigned char) _s[2];
2620 _p[3] = (unsigned char) _s[3];
2621#if (SIZEOF_LONG == 8)
2622 _p[4] = (unsigned char) _s[4];
2623 _p[5] = (unsigned char) _s[5];
2624 _p[6] = (unsigned char) _s[6];
2625 _p[7] = (unsigned char) _s[7];
2626#endif
2627 _s += SIZEOF_LONG;
2628 _p += SIZEOF_LONG;
2629 }
2630 s = _s;
2631 p = _p;
2632 if (s == e)
2633 break;
2634 ch = (unsigned char)*s;
2635 }
2636 }
2637
2638 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002639 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 s++;
2641 continue;
2642 }
2643
2644 n = utf8_code_length[ch];
2645
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002646 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002647 if (consumed)
2648 break;
2649 else {
2650 errmsg = "unexpected end of data";
2651 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002652 endinpos = startinpos+1;
2653 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2654 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002655 goto utf8Error;
2656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658
2659 switch (n) {
2660
2661 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002662 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002663 startinpos = s-starts;
2664 endinpos = startinpos+1;
2665 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666
2667 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002668 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002669 startinpos = s-starts;
2670 endinpos = startinpos+1;
2671 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672
2673 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002674 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002675 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002677 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002678 goto utf8Error;
2679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002681 assert ((ch > 0x007F) && (ch <= 0x07FF));
2682 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 break;
2684
2685 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002686 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2687 will result in surrogates in range d800-dfff. Surrogates are
2688 not valid UTF-8 so they are rejected.
2689 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2690 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002691 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002692 (s[2] & 0xc0) != 0x80 ||
2693 ((unsigned char)s[0] == 0xE0 &&
2694 (unsigned char)s[1] < 0xA0) ||
2695 ((unsigned char)s[0] == 0xED &&
2696 (unsigned char)s[1] > 0x9F)) {
2697 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002698 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002699 endinpos = startinpos + 1;
2700
2701 /* if s[1] first two bits are 1 and 0, then the invalid
2702 continuation byte is s[2], so increment endinpos by 1,
2703 if not, s[1] is invalid and endinpos doesn't need to
2704 be incremented. */
2705 if ((s[1] & 0xC0) == 0x80)
2706 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002707 goto utf8Error;
2708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002710 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2711 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002712 break;
2713
2714 case 4:
2715 if ((s[1] & 0xc0) != 0x80 ||
2716 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002717 (s[3] & 0xc0) != 0x80 ||
2718 ((unsigned char)s[0] == 0xF0 &&
2719 (unsigned char)s[1] < 0x90) ||
2720 ((unsigned char)s[0] == 0xF4 &&
2721 (unsigned char)s[1] > 0x8F)) {
2722 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002723 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002724 endinpos = startinpos + 1;
2725 if ((s[1] & 0xC0) == 0x80) {
2726 endinpos++;
2727 if ((s[2] & 0xC0) == 0x80)
2728 endinpos++;
2729 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002730 goto utf8Error;
2731 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002732 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002733 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2734 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2735
Fredrik Lundh8f455852001-06-27 18:59:43 +00002736#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002738#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002739 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002740
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002741 /* translate from 10000..10FFFF to 0..FFFF */
2742 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002743
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002744 /* high surrogate = top 10 bits added to D800 */
2745 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002746
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002747 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002748 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002749#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751 }
2752 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002753 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002754
Benjamin Peterson29060642009-01-31 22:14:21 +00002755 utf8Error:
2756 outpos = p-PyUnicode_AS_UNICODE(unicode);
2757 if (unicode_decode_call_errorhandler(
2758 errors, &errorHandler,
2759 "utf8", errmsg,
2760 &starts, &e, &startinpos, &endinpos, &exc, &s,
2761 &unicode, &outpos, &p))
2762 goto onError;
2763 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
Walter Dörwald69652032004-09-07 20:24:22 +00002765 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767
2768 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002769 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 goto onError;
2771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 Py_XDECREF(errorHandler);
2773 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 return (PyObject *)unicode;
2775
Benjamin Peterson29060642009-01-31 22:14:21 +00002776 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 Py_XDECREF(errorHandler);
2778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 Py_DECREF(unicode);
2780 return NULL;
2781}
2782
Antoine Pitrouab868312009-01-10 15:40:25 +00002783#undef ASCII_CHAR_MASK
2784
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002785#ifdef __APPLE__
2786
2787/* Simplified UTF-8 decoder using surrogateescape error handler,
2788 used to decode the command line arguments on Mac OS X. */
2789
2790wchar_t*
2791_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2792{
2793 int n;
2794 const char *e;
2795 wchar_t *unicode, *p;
2796
2797 /* Note: size will always be longer than the resulting Unicode
2798 character count */
2799 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2800 PyErr_NoMemory();
2801 return NULL;
2802 }
2803 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2804 if (!unicode)
2805 return NULL;
2806
2807 /* Unpack UTF-8 encoded data */
2808 p = unicode;
2809 e = s + size;
2810 while (s < e) {
2811 Py_UCS4 ch = (unsigned char)*s;
2812
2813 if (ch < 0x80) {
2814 *p++ = (wchar_t)ch;
2815 s++;
2816 continue;
2817 }
2818
2819 n = utf8_code_length[ch];
2820 if (s + n > e) {
2821 goto surrogateescape;
2822 }
2823
2824 switch (n) {
2825 case 0:
2826 case 1:
2827 goto surrogateescape;
2828
2829 case 2:
2830 if ((s[1] & 0xc0) != 0x80)
2831 goto surrogateescape;
2832 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2833 assert ((ch > 0x007F) && (ch <= 0x07FF));
2834 *p++ = (wchar_t)ch;
2835 break;
2836
2837 case 3:
2838 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2839 will result in surrogates in range d800-dfff. Surrogates are
2840 not valid UTF-8 so they are rejected.
2841 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2842 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2843 if ((s[1] & 0xc0) != 0x80 ||
2844 (s[2] & 0xc0) != 0x80 ||
2845 ((unsigned char)s[0] == 0xE0 &&
2846 (unsigned char)s[1] < 0xA0) ||
2847 ((unsigned char)s[0] == 0xED &&
2848 (unsigned char)s[1] > 0x9F)) {
2849
2850 goto surrogateescape;
2851 }
2852 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2853 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2854 *p++ = (Py_UNICODE)ch;
2855 break;
2856
2857 case 4:
2858 if ((s[1] & 0xc0) != 0x80 ||
2859 (s[2] & 0xc0) != 0x80 ||
2860 (s[3] & 0xc0) != 0x80 ||
2861 ((unsigned char)s[0] == 0xF0 &&
2862 (unsigned char)s[1] < 0x90) ||
2863 ((unsigned char)s[0] == 0xF4 &&
2864 (unsigned char)s[1] > 0x8F)) {
2865 goto surrogateescape;
2866 }
2867 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2868 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2869 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2870
2871#if SIZEOF_WCHAR_T == 4
2872 *p++ = (wchar_t)ch;
2873#else
2874 /* compute and append the two surrogates: */
2875
2876 /* translate from 10000..10FFFF to 0..FFFF */
2877 ch -= 0x10000;
2878
2879 /* high surrogate = top 10 bits added to D800 */
2880 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2881
2882 /* low surrogate = bottom 10 bits added to DC00 */
2883 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2884#endif
2885 break;
2886 }
2887 s += n;
2888 continue;
2889
2890 surrogateescape:
2891 *p++ = 0xDC00 + ch;
2892 s++;
2893 }
2894 *p = L'\0';
2895 return unicode;
2896}
2897
2898#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002899
Tim Peters602f7402002-04-27 18:03:26 +00002900/* Allocation strategy: if the string is short, convert into a stack buffer
2901 and allocate exactly as much space needed at the end. Else allocate the
2902 maximum possible needed (4 result bytes per Unicode character), and return
2903 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002904*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002905PyObject *
2906PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 Py_ssize_t size,
2908 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909{
Tim Peters602f7402002-04-27 18:03:26 +00002910#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002911
Guido van Rossum98297ee2007-11-06 21:34:58 +00002912 Py_ssize_t i; /* index into s of next input byte */
2913 PyObject *result; /* result string object */
2914 char *p; /* next free byte in output buffer */
2915 Py_ssize_t nallocated; /* number of result bytes allocated */
2916 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002917 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002918 PyObject *errorHandler = NULL;
2919 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002920
Tim Peters602f7402002-04-27 18:03:26 +00002921 assert(s != NULL);
2922 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923
Tim Peters602f7402002-04-27 18:03:26 +00002924 if (size <= MAX_SHORT_UNICHARS) {
2925 /* Write into the stack buffer; nallocated can't overflow.
2926 * At the end, we'll allocate exactly as much heap space as it
2927 * turns out we need.
2928 */
2929 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002930 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002931 p = stackbuf;
2932 }
2933 else {
2934 /* Overallocate on the heap, and give the excess back at the end. */
2935 nallocated = size * 4;
2936 if (nallocated / 4 != size) /* overflow! */
2937 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002938 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002939 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002940 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002941 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002942 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002943
Tim Peters602f7402002-04-27 18:03:26 +00002944 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002945 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002946
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002947 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002948 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002950
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002952 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002953 *p++ = (char)(0xc0 | (ch >> 6));
2954 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002955 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002956#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002957 /* Special case: check for high and low surrogate */
2958 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2959 Py_UCS4 ch2 = s[i];
2960 /* Combine the two surrogates to form a UCS4 value */
2961 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2962 i++;
2963
2964 /* Encode UCS4 Unicode ordinals */
2965 *p++ = (char)(0xf0 | (ch >> 18));
2966 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002967 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2968 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002969 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002970#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002971 Py_ssize_t newpos;
2972 PyObject *rep;
2973 Py_ssize_t repsize, k;
2974 rep = unicode_encode_call_errorhandler
2975 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2976 s, size, &exc, i-1, i, &newpos);
2977 if (!rep)
2978 goto error;
2979
2980 if (PyBytes_Check(rep))
2981 repsize = PyBytes_GET_SIZE(rep);
2982 else
2983 repsize = PyUnicode_GET_SIZE(rep);
2984
2985 if (repsize > 4) {
2986 Py_ssize_t offset;
2987
2988 if (result == NULL)
2989 offset = p - stackbuf;
2990 else
2991 offset = p - PyBytes_AS_STRING(result);
2992
2993 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2994 /* integer overflow */
2995 PyErr_NoMemory();
2996 goto error;
2997 }
2998 nallocated += repsize - 4;
2999 if (result != NULL) {
3000 if (_PyBytes_Resize(&result, nallocated) < 0)
3001 goto error;
3002 } else {
3003 result = PyBytes_FromStringAndSize(NULL, nallocated);
3004 if (result == NULL)
3005 goto error;
3006 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3007 }
3008 p = PyBytes_AS_STRING(result) + offset;
3009 }
3010
3011 if (PyBytes_Check(rep)) {
3012 char *prep = PyBytes_AS_STRING(rep);
3013 for(k = repsize; k > 0; k--)
3014 *p++ = *prep++;
3015 } else /* rep is unicode */ {
3016 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3017 Py_UNICODE c;
3018
3019 for(k=0; k<repsize; k++) {
3020 c = prep[k];
3021 if (0x80 <= c) {
3022 raise_encode_exception(&exc, "utf-8", s, size,
3023 i-1, i, "surrogates not allowed");
3024 goto error;
3025 }
3026 *p++ = (char)prep[k];
3027 }
3028 }
3029 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003030#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003031 }
Victor Stinner445a6232010-04-22 20:01:57 +00003032#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003033 } else if (ch < 0x10000) {
3034 *p++ = (char)(0xe0 | (ch >> 12));
3035 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3036 *p++ = (char)(0x80 | (ch & 0x3f));
3037 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003038 /* Encode UCS4 Unicode ordinals */
3039 *p++ = (char)(0xf0 | (ch >> 18));
3040 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3041 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3042 *p++ = (char)(0x80 | (ch & 0x3f));
3043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003045
Guido van Rossum98297ee2007-11-06 21:34:58 +00003046 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003047 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003048 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003049 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003050 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003051 }
3052 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003053 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003054 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003055 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003056 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003057 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003058 Py_XDECREF(errorHandler);
3059 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003060 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003061 error:
3062 Py_XDECREF(errorHandler);
3063 Py_XDECREF(exc);
3064 Py_XDECREF(result);
3065 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003066
Tim Peters602f7402002-04-27 18:03:26 +00003067#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068}
3069
Alexander Belopolsky40018472011-02-26 01:02:56 +00003070PyObject *
3071PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 if (!PyUnicode_Check(unicode)) {
3074 PyErr_BadArgument();
3075 return NULL;
3076 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003077 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 PyUnicode_GET_SIZE(unicode),
3079 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080}
3081
Walter Dörwald41980ca2007-08-16 21:55:45 +00003082/* --- UTF-32 Codec ------------------------------------------------------- */
3083
3084PyObject *
3085PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 Py_ssize_t size,
3087 const char *errors,
3088 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003089{
3090 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3091}
3092
3093PyObject *
3094PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003095 Py_ssize_t size,
3096 const char *errors,
3097 int *byteorder,
3098 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003099{
3100 const char *starts = s;
3101 Py_ssize_t startinpos;
3102 Py_ssize_t endinpos;
3103 Py_ssize_t outpos;
3104 PyUnicodeObject *unicode;
3105 Py_UNICODE *p;
3106#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003107 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003108 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003109#else
3110 const int pairs = 0;
3111#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003112 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003113 int bo = 0; /* assume native ordering by default */
3114 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003115 /* Offsets from q for retrieving bytes in the right order. */
3116#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3117 int iorder[] = {0, 1, 2, 3};
3118#else
3119 int iorder[] = {3, 2, 1, 0};
3120#endif
3121 PyObject *errorHandler = NULL;
3122 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003123
Walter Dörwald41980ca2007-08-16 21:55:45 +00003124 q = (unsigned char *)s;
3125 e = q + size;
3126
3127 if (byteorder)
3128 bo = *byteorder;
3129
3130 /* Check for BOM marks (U+FEFF) in the input and adjust current
3131 byte order setting accordingly. In native mode, the leading BOM
3132 mark is skipped, in all other modes, it is copied to the output
3133 stream as-is (giving a ZWNBSP character). */
3134 if (bo == 0) {
3135 if (size >= 4) {
3136 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003137 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003138#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003139 if (bom == 0x0000FEFF) {
3140 q += 4;
3141 bo = -1;
3142 }
3143 else if (bom == 0xFFFE0000) {
3144 q += 4;
3145 bo = 1;
3146 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003147#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003148 if (bom == 0x0000FEFF) {
3149 q += 4;
3150 bo = 1;
3151 }
3152 else if (bom == 0xFFFE0000) {
3153 q += 4;
3154 bo = -1;
3155 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003156#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003157 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003158 }
3159
3160 if (bo == -1) {
3161 /* force LE */
3162 iorder[0] = 0;
3163 iorder[1] = 1;
3164 iorder[2] = 2;
3165 iorder[3] = 3;
3166 }
3167 else if (bo == 1) {
3168 /* force BE */
3169 iorder[0] = 3;
3170 iorder[1] = 2;
3171 iorder[2] = 1;
3172 iorder[3] = 0;
3173 }
3174
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003175 /* On narrow builds we split characters outside the BMP into two
3176 codepoints => count how much extra space we need. */
3177#ifndef Py_UNICODE_WIDE
3178 for (qq = q; qq < e; qq += 4)
3179 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3180 pairs++;
3181#endif
3182
3183 /* This might be one to much, because of a BOM */
3184 unicode = _PyUnicode_New((size+3)/4+pairs);
3185 if (!unicode)
3186 return NULL;
3187 if (size == 0)
3188 return (PyObject *)unicode;
3189
3190 /* Unpack UTF-32 encoded data */
3191 p = unicode->str;
3192
Walter Dörwald41980ca2007-08-16 21:55:45 +00003193 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003194 Py_UCS4 ch;
3195 /* remaining bytes at the end? (size should be divisible by 4) */
3196 if (e-q<4) {
3197 if (consumed)
3198 break;
3199 errmsg = "truncated data";
3200 startinpos = ((const char *)q)-starts;
3201 endinpos = ((const char *)e)-starts;
3202 goto utf32Error;
3203 /* The remaining input chars are ignored if the callback
3204 chooses to skip the input */
3205 }
3206 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3207 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003208
Benjamin Peterson29060642009-01-31 22:14:21 +00003209 if (ch >= 0x110000)
3210 {
3211 errmsg = "codepoint not in range(0x110000)";
3212 startinpos = ((const char *)q)-starts;
3213 endinpos = startinpos+4;
3214 goto utf32Error;
3215 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003216#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 if (ch >= 0x10000)
3218 {
3219 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3220 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3221 }
3222 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003223#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 *p++ = ch;
3225 q += 4;
3226 continue;
3227 utf32Error:
3228 outpos = p-PyUnicode_AS_UNICODE(unicode);
3229 if (unicode_decode_call_errorhandler(
3230 errors, &errorHandler,
3231 "utf32", errmsg,
3232 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3233 &unicode, &outpos, &p))
3234 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003235 }
3236
3237 if (byteorder)
3238 *byteorder = bo;
3239
3240 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003242
3243 /* Adjust length */
3244 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3245 goto onError;
3246
3247 Py_XDECREF(errorHandler);
3248 Py_XDECREF(exc);
3249 return (PyObject *)unicode;
3250
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003252 Py_DECREF(unicode);
3253 Py_XDECREF(errorHandler);
3254 Py_XDECREF(exc);
3255 return NULL;
3256}
3257
3258PyObject *
3259PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 Py_ssize_t size,
3261 const char *errors,
3262 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003263{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003264 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003265 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003266 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003267#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003268 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003269#else
3270 const int pairs = 0;
3271#endif
3272 /* Offsets from p for storing byte pairs in the right order. */
3273#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3274 int iorder[] = {0, 1, 2, 3};
3275#else
3276 int iorder[] = {3, 2, 1, 0};
3277#endif
3278
Benjamin Peterson29060642009-01-31 22:14:21 +00003279#define STORECHAR(CH) \
3280 do { \
3281 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3282 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3283 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3284 p[iorder[0]] = (CH) & 0xff; \
3285 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003286 } while(0)
3287
3288 /* In narrow builds we can output surrogate pairs as one codepoint,
3289 so we need less space. */
3290#ifndef Py_UNICODE_WIDE
3291 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3293 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3294 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003295#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003296 nsize = (size - pairs + (byteorder == 0));
3297 bytesize = nsize * 4;
3298 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003299 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003300 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003301 if (v == NULL)
3302 return NULL;
3303
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003304 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003305 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003307 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003308 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003309
3310 if (byteorder == -1) {
3311 /* force LE */
3312 iorder[0] = 0;
3313 iorder[1] = 1;
3314 iorder[2] = 2;
3315 iorder[3] = 3;
3316 }
3317 else if (byteorder == 1) {
3318 /* force BE */
3319 iorder[0] = 3;
3320 iorder[1] = 2;
3321 iorder[2] = 1;
3322 iorder[3] = 0;
3323 }
3324
3325 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003327#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003328 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3329 Py_UCS4 ch2 = *s;
3330 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3331 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3332 s++;
3333 size--;
3334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003335 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003336#endif
3337 STORECHAR(ch);
3338 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003339
3340 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003341 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003342#undef STORECHAR
3343}
3344
Alexander Belopolsky40018472011-02-26 01:02:56 +00003345PyObject *
3346PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003347{
3348 if (!PyUnicode_Check(unicode)) {
3349 PyErr_BadArgument();
3350 return NULL;
3351 }
3352 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003353 PyUnicode_GET_SIZE(unicode),
3354 NULL,
3355 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003356}
3357
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358/* --- UTF-16 Codec ------------------------------------------------------- */
3359
Tim Peters772747b2001-08-09 22:21:55 +00003360PyObject *
3361PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003362 Py_ssize_t size,
3363 const char *errors,
3364 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365{
Walter Dörwald69652032004-09-07 20:24:22 +00003366 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3367}
3368
/* Two masks for fast checking of whether a C 'long' may contain
   UTF-16 encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask selects the most significant bit of every 16-bit unit packed
   in the long (bit 15 of each unit for native order; bit 7, i.e. the top
   bit of the other byte, for swapped order).  A zero result therefore
   means every unit is < 0x8000 and cannot be a surrogate
   (0xD800..0xDFFF); a non-zero result only means "maybe", and the caller
   must fall back to the per-unit path.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
3385
Walter Dörwald69652032004-09-07 20:24:22 +00003386PyObject *
3387PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003388 Py_ssize_t size,
3389 const char *errors,
3390 int *byteorder,
3391 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003392{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003394 Py_ssize_t startinpos;
3395 Py_ssize_t endinpos;
3396 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 PyUnicodeObject *unicode;
3398 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003399 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003400 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003401 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003402 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003403 /* Offsets from q for retrieving byte pairs in the right order. */
3404#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3405 int ihi = 1, ilo = 0;
3406#else
3407 int ihi = 0, ilo = 1;
3408#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 PyObject *errorHandler = NULL;
3410 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411
3412 /* Note: size will always be longer than the resulting Unicode
3413 character count */
3414 unicode = _PyUnicode_New(size);
3415 if (!unicode)
3416 return NULL;
3417 if (size == 0)
3418 return (PyObject *)unicode;
3419
3420 /* Unpack UTF-16 encoded data */
3421 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003422 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003423 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424
3425 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003426 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003428 /* Check for BOM marks (U+FEFF) in the input and adjust current
3429 byte order setting accordingly. In native mode, the leading BOM
3430 mark is skipped, in all other modes, it is copied to the output
3431 stream as-is (giving a ZWNBSP character). */
3432 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003433 if (size >= 2) {
3434 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003435#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003436 if (bom == 0xFEFF) {
3437 q += 2;
3438 bo = -1;
3439 }
3440 else if (bom == 0xFFFE) {
3441 q += 2;
3442 bo = 1;
3443 }
Tim Petersced69f82003-09-16 20:30:58 +00003444#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003445 if (bom == 0xFEFF) {
3446 q += 2;
3447 bo = 1;
3448 }
3449 else if (bom == 0xFFFE) {
3450 q += 2;
3451 bo = -1;
3452 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003453#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003454 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003455 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456
Tim Peters772747b2001-08-09 22:21:55 +00003457 if (bo == -1) {
3458 /* force LE */
3459 ihi = 1;
3460 ilo = 0;
3461 }
3462 else if (bo == 1) {
3463 /* force BE */
3464 ihi = 0;
3465 ilo = 1;
3466 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003467#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3468 native_ordering = ilo < ihi;
3469#else
3470 native_ordering = ilo > ihi;
3471#endif
Tim Peters772747b2001-08-09 22:21:55 +00003472
Antoine Pitrouab868312009-01-10 15:40:25 +00003473 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003474 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003475 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003476 /* First check for possible aligned read of a C 'long'. Unaligned
3477 reads are more expensive, better to defer to another iteration. */
3478 if (!((size_t) q & LONG_PTR_MASK)) {
3479 /* Fast path for runs of non-surrogate chars. */
3480 register const unsigned char *_q = q;
3481 Py_UNICODE *_p = p;
3482 if (native_ordering) {
3483 /* Native ordering is simple: as long as the input cannot
3484 possibly contain a surrogate char, do an unrolled copy
3485 of several 16-bit code points to the target object.
3486 The non-surrogate check is done on several input bytes
3487 at a time (as many as a C 'long' can contain). */
3488 while (_q < aligned_end) {
3489 unsigned long data = * (unsigned long *) _q;
3490 if (data & FAST_CHAR_MASK)
3491 break;
3492 _p[0] = ((unsigned short *) _q)[0];
3493 _p[1] = ((unsigned short *) _q)[1];
3494#if (SIZEOF_LONG == 8)
3495 _p[2] = ((unsigned short *) _q)[2];
3496 _p[3] = ((unsigned short *) _q)[3];
3497#endif
3498 _q += SIZEOF_LONG;
3499 _p += SIZEOF_LONG / 2;
3500 }
3501 }
3502 else {
3503 /* Byteswapped ordering is similar, but we must decompose
3504 the copy bytewise, and take care of zero'ing out the
3505 upper bytes if the target object is in 32-bit units
3506 (that is, in UCS-4 builds). */
3507 while (_q < aligned_end) {
3508 unsigned long data = * (unsigned long *) _q;
3509 if (data & SWAPPED_FAST_CHAR_MASK)
3510 break;
3511 /* Zero upper bytes in UCS-4 builds */
3512#if (Py_UNICODE_SIZE > 2)
3513 _p[0] = 0;
3514 _p[1] = 0;
3515#if (SIZEOF_LONG == 8)
3516 _p[2] = 0;
3517 _p[3] = 0;
3518#endif
3519#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003520 /* Issue #4916; UCS-4 builds on big endian machines must
3521 fill the two last bytes of each 4-byte unit. */
3522#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3523# define OFF 2
3524#else
3525# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003526#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003527 ((unsigned char *) _p)[OFF + 1] = _q[0];
3528 ((unsigned char *) _p)[OFF + 0] = _q[1];
3529 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3530 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3531#if (SIZEOF_LONG == 8)
3532 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3533 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3534 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3535 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3536#endif
3537#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003538 _q += SIZEOF_LONG;
3539 _p += SIZEOF_LONG / 2;
3540 }
3541 }
3542 p = _p;
3543 q = _q;
3544 if (q >= e)
3545 break;
3546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003547 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548
Benjamin Peterson14339b62009-01-31 16:36:08 +00003549 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003550
3551 if (ch < 0xD800 || ch > 0xDFFF) {
3552 *p++ = ch;
3553 continue;
3554 }
3555
3556 /* UTF-16 code pair: */
3557 if (q > e) {
3558 errmsg = "unexpected end of data";
3559 startinpos = (((const char *)q) - 2) - starts;
3560 endinpos = ((const char *)e) + 1 - starts;
3561 goto utf16Error;
3562 }
3563 if (0xD800 <= ch && ch <= 0xDBFF) {
3564 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3565 q += 2;
3566 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003567#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003568 *p++ = ch;
3569 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003570#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003572#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003573 continue;
3574 }
3575 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003576 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003577 startinpos = (((const char *)q)-4)-starts;
3578 endinpos = startinpos+2;
3579 goto utf16Error;
3580 }
3581
Benjamin Peterson14339b62009-01-31 16:36:08 +00003582 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003583 errmsg = "illegal encoding";
3584 startinpos = (((const char *)q)-2)-starts;
3585 endinpos = startinpos+2;
3586 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003587
Benjamin Peterson29060642009-01-31 22:14:21 +00003588 utf16Error:
3589 outpos = p - PyUnicode_AS_UNICODE(unicode);
3590 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003591 errors,
3592 &errorHandler,
3593 "utf16", errmsg,
3594 &starts,
3595 (const char **)&e,
3596 &startinpos,
3597 &endinpos,
3598 &exc,
3599 (const char **)&q,
3600 &unicode,
3601 &outpos,
3602 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003603 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003605 /* remaining byte at the end? (size should be even) */
3606 if (e == q) {
3607 if (!consumed) {
3608 errmsg = "truncated data";
3609 startinpos = ((const char *)q) - starts;
3610 endinpos = ((const char *)e) + 1 - starts;
3611 outpos = p - PyUnicode_AS_UNICODE(unicode);
3612 if (unicode_decode_call_errorhandler(
3613 errors,
3614 &errorHandler,
3615 "utf16", errmsg,
3616 &starts,
3617 (const char **)&e,
3618 &startinpos,
3619 &endinpos,
3620 &exc,
3621 (const char **)&q,
3622 &unicode,
3623 &outpos,
3624 &p))
3625 goto onError;
3626 /* The remaining input chars are ignored if the callback
3627 chooses to skip the input */
3628 }
3629 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630
3631 if (byteorder)
3632 *byteorder = bo;
3633
Walter Dörwald69652032004-09-07 20:24:22 +00003634 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003636
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003638 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 goto onError;
3640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 Py_XDECREF(errorHandler);
3642 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 return (PyObject *)unicode;
3644
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 Py_XDECREF(errorHandler);
3648 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 return NULL;
3650}
3651
Antoine Pitrouab868312009-01-10 15:40:25 +00003652#undef FAST_CHAR_MASK
3653#undef SWAPPED_FAST_CHAR_MASK
3654
Tim Peters772747b2001-08-09 22:21:55 +00003655PyObject *
3656PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 Py_ssize_t size,
3658 const char *errors,
3659 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003661 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003662 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003663 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003664#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003665 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003666#else
3667 const int pairs = 0;
3668#endif
Tim Peters772747b2001-08-09 22:21:55 +00003669 /* Offsets from p for storing byte pairs in the right order. */
3670#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3671 int ihi = 1, ilo = 0;
3672#else
3673 int ihi = 0, ilo = 1;
3674#endif
3675
Benjamin Peterson29060642009-01-31 22:14:21 +00003676#define STORECHAR(CH) \
3677 do { \
3678 p[ihi] = ((CH) >> 8) & 0xff; \
3679 p[ilo] = (CH) & 0xff; \
3680 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003681 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003683#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003684 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 if (s[i] >= 0x10000)
3686 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003687#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003688 /* 2 * (size + pairs + (byteorder == 0)) */
3689 if (size > PY_SSIZE_T_MAX ||
3690 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003692 nsize = size + pairs + (byteorder == 0);
3693 bytesize = nsize * 2;
3694 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003696 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697 if (v == NULL)
3698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003700 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003703 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003704 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003705
3706 if (byteorder == -1) {
3707 /* force LE */
3708 ihi = 1;
3709 ilo = 0;
3710 }
3711 else if (byteorder == 1) {
3712 /* force BE */
3713 ihi = 0;
3714 ilo = 1;
3715 }
3716
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003717 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003718 Py_UNICODE ch = *s++;
3719 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003720#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 if (ch >= 0x10000) {
3722 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3723 ch = 0xD800 | ((ch-0x10000) >> 10);
3724 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003725#endif
Tim Peters772747b2001-08-09 22:21:55 +00003726 STORECHAR(ch);
3727 if (ch2)
3728 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003729 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003730
3731 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003732 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003733#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734}
3735
Alexander Belopolsky40018472011-02-26 01:02:56 +00003736PyObject *
3737PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738{
3739 if (!PyUnicode_Check(unicode)) {
3740 PyErr_BadArgument();
3741 return NULL;
3742 }
3743 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003744 PyUnicode_GET_SIZE(unicode),
3745 NULL,
3746 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747}
3748
3749/* --- Unicode Escape Codec ----------------------------------------------- */
3750
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the unicode-escape
   decoder that follows -- confirm where it is assigned. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003752
Alexander Belopolsky40018472011-02-26 01:02:56 +00003753PyObject *
3754PyUnicode_DecodeUnicodeEscape(const char *s,
3755 Py_ssize_t size,
3756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003759 Py_ssize_t startinpos;
3760 Py_ssize_t endinpos;
3761 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003766 char* message;
3767 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 PyObject *errorHandler = NULL;
3769 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003770
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 /* Escaped strings will always be longer than the resulting
3772 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 length after conversion to the true value.
3774 (but if the error callback returns a long replacement string
3775 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 v = _PyUnicode_New(size);
3777 if (v == NULL)
3778 goto onError;
3779 if (size == 0)
3780 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003784
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 while (s < end) {
3786 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003787 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789
3790 /* Non-escape characters are interpreted as Unicode ordinals */
3791 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003792 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 continue;
3794 }
3795
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 /* \ - Escapes */
3798 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003799 c = *s++;
3800 if (s > end)
3801 c = '\0'; /* Invalid after \ */
3802 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 case '\n': break;
3806 case '\\': *p++ = '\\'; break;
3807 case '\'': *p++ = '\''; break;
3808 case '\"': *p++ = '\"'; break;
3809 case 'b': *p++ = '\b'; break;
3810 case 'f': *p++ = '\014'; break; /* FF */
3811 case 't': *p++ = '\t'; break;
3812 case 'n': *p++ = '\n'; break;
3813 case 'r': *p++ = '\r'; break;
3814 case 'v': *p++ = '\013'; break; /* VT */
3815 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3816
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 case '0': case '1': case '2': case '3':
3819 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003820 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003821 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003822 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003823 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003824 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003826 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 break;
3828
Benjamin Peterson29060642009-01-31 22:14:21 +00003829 /* hex escapes */
3830 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003832 digits = 2;
3833 message = "truncated \\xXX escape";
3834 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835
Benjamin Peterson29060642009-01-31 22:14:21 +00003836 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003838 digits = 4;
3839 message = "truncated \\uXXXX escape";
3840 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003843 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003844 digits = 8;
3845 message = "truncated \\UXXXXXXXX escape";
3846 hexescape:
3847 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848 outpos = p-PyUnicode_AS_UNICODE(v);
3849 if (s+digits>end) {
3850 endinpos = size;
3851 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003852 errors, &errorHandler,
3853 "unicodeescape", "end of string in escape sequence",
3854 &starts, &end, &startinpos, &endinpos, &exc, &s,
3855 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 goto onError;
3857 goto nextByte;
3858 }
3859 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003860 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003861 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 endinpos = (s+i+1)-starts;
3863 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003864 errors, &errorHandler,
3865 "unicodeescape", message,
3866 &starts, &end, &startinpos, &endinpos, &exc, &s,
3867 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003868 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003869 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003870 }
3871 chr = (chr<<4) & ~0xF;
3872 if (c >= '0' && c <= '9')
3873 chr += c - '0';
3874 else if (c >= 'a' && c <= 'f')
3875 chr += 10 + c - 'a';
3876 else
3877 chr += 10 + c - 'A';
3878 }
3879 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003880 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 /* _decoding_error will have already written into the
3882 target buffer. */
3883 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003884 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003885 /* when we get here, chr is a 32-bit unicode character */
3886 if (chr <= 0xffff)
3887 /* UCS-2 character */
3888 *p++ = (Py_UNICODE) chr;
3889 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003890 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003891 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003892#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003893 *p++ = chr;
3894#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003895 chr -= 0x10000L;
3896 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003897 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003898#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003899 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003900 endinpos = s-starts;
3901 outpos = p-PyUnicode_AS_UNICODE(v);
3902 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003903 errors, &errorHandler,
3904 "unicodeescape", "illegal Unicode character",
3905 &starts, &end, &startinpos, &endinpos, &exc, &s,
3906 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003907 goto onError;
3908 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003909 break;
3910
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003912 case 'N':
3913 message = "malformed \\N character escape";
3914 if (ucnhash_CAPI == NULL) {
3915 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003916 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003917 if (ucnhash_CAPI == NULL)
3918 goto ucnhashError;
3919 }
3920 if (*s == '{') {
3921 const char *start = s+1;
3922 /* look for the closing brace */
3923 while (*s != '}' && s < end)
3924 s++;
3925 if (s > start && s < end && *s == '}') {
3926 /* found a name. look it up in the unicode database */
3927 message = "unknown Unicode character name";
3928 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003929 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003930 goto store;
3931 }
3932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 endinpos = s-starts;
3934 outpos = p-PyUnicode_AS_UNICODE(v);
3935 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003936 errors, &errorHandler,
3937 "unicodeescape", message,
3938 &starts, &end, &startinpos, &endinpos, &exc, &s,
3939 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003940 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003941 break;
3942
3943 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003944 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 message = "\\ at end of string";
3946 s--;
3947 endinpos = s-starts;
3948 outpos = p-PyUnicode_AS_UNICODE(v);
3949 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 errors, &errorHandler,
3951 "unicodeescape", message,
3952 &starts, &end, &startinpos, &endinpos, &exc, &s,
3953 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003954 goto onError;
3955 }
3956 else {
3957 *p++ = '\\';
3958 *p++ = (unsigned char)s[-1];
3959 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003960 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003962 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003963 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003965 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003967 Py_XDECREF(errorHandler);
3968 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003970
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003972 PyErr_SetString(
3973 PyExc_UnicodeError,
3974 "\\N escapes not supported (can't load unicodedata module)"
3975 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003976 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 Py_XDECREF(errorHandler);
3978 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003979 return NULL;
3980
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003983 Py_XDECREF(errorHandler);
3984 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 return NULL;
3986}
3987
/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment describes PyUnicode_EncodeUnicodeEscape() further
   below, not the findchar() helper that immediately follows.  The encoder
   takes no `quotes` argument and does not add surrounding quotes.

*/
3994
Thomas Wouters477c8d52006-05-27 19:21:47 +00003995Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 Py_ssize_t size,
3997 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003998{
3999 /* like wcschr, but doesn't stop at NULL characters */
4000
4001 while (size-- > 0) {
4002 if (*s == ch)
4003 return s;
4004 s++;
4005 }
4006
4007 return NULL;
4008}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004009
Walter Dörwald79e913e2007-05-12 11:08:06 +00004010static const char *hexdigits = "0123456789abcdef";
4011
Alexander Belopolsky40018472011-02-26 01:02:56 +00004012PyObject *
4013PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4014 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004016 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004019#ifdef Py_UNICODE_WIDE
4020 const Py_ssize_t expandsize = 10;
4021#else
4022 const Py_ssize_t expandsize = 6;
4023#endif
4024
Thomas Wouters89f507f2006-12-13 04:49:30 +00004025 /* XXX(nnorwitz): rather than over-allocating, it would be
4026 better to choose a different scheme. Perhaps scan the
4027 first N-chars of the string and allocate based on that size.
4028 */
4029 /* Initial allocation is based on the longest-possible unichr
4030 escape.
4031
4032 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4033 unichr, so in this case it's the longest unichr escape. In
4034 narrow (UTF-16) builds this is five chars per source unichr
4035 since there are two unichrs in the surrogate pair, so in narrow
4036 (UTF-16) builds it's not the longest unichr escape.
4037
4038 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4039 so in the narrow (UTF-16) build case it's the longest unichr
4040 escape.
4041 */
4042
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004043 if (size == 0)
4044 return PyBytes_FromStringAndSize(NULL, 0);
4045
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004046 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004048
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004049 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 2
4051 + expandsize*size
4052 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053 if (repr == NULL)
4054 return NULL;
4055
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004056 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 while (size-- > 0) {
4059 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004060
Walter Dörwald79e913e2007-05-12 11:08:06 +00004061 /* Escape backslashes */
4062 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 *p++ = '\\';
4064 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004065 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004066 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004067
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004068#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004069 /* Map 21-bit characters to '\U00xxxxxx' */
4070 else if (ch >= 0x10000) {
4071 *p++ = '\\';
4072 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004073 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4074 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4075 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4076 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4077 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4078 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4079 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4080 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004082 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004083#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4085 else if (ch >= 0xD800 && ch < 0xDC00) {
4086 Py_UNICODE ch2;
4087 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004088
Benjamin Peterson29060642009-01-31 22:14:21 +00004089 ch2 = *s++;
4090 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004091 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004092 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4093 *p++ = '\\';
4094 *p++ = 'U';
4095 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4096 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4097 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4098 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4099 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4100 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4101 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4102 *p++ = hexdigits[ucs & 0x0000000F];
4103 continue;
4104 }
4105 /* Fall through: isolated surrogates are copied as-is */
4106 s--;
4107 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004108 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004109#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004110
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004112 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 *p++ = '\\';
4114 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004115 *p++ = hexdigits[(ch >> 12) & 0x000F];
4116 *p++ = hexdigits[(ch >> 8) & 0x000F];
4117 *p++ = hexdigits[(ch >> 4) & 0x000F];
4118 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004120
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004121 /* Map special whitespace to '\t', \n', '\r' */
4122 else if (ch == '\t') {
4123 *p++ = '\\';
4124 *p++ = 't';
4125 }
4126 else if (ch == '\n') {
4127 *p++ = '\\';
4128 *p++ = 'n';
4129 }
4130 else if (ch == '\r') {
4131 *p++ = '\\';
4132 *p++ = 'r';
4133 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004134
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004135 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004136 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004138 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004139 *p++ = hexdigits[(ch >> 4) & 0x000F];
4140 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004141 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004142
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 /* Copy everything else as-is */
4144 else
4145 *p++ = (char) ch;
4146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004148 assert(p - PyBytes_AS_STRING(repr) > 0);
4149 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4150 return NULL;
4151 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152}
4153
Alexander Belopolsky40018472011-02-26 01:02:56 +00004154PyObject *
4155PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004157 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158 if (!PyUnicode_Check(unicode)) {
4159 PyErr_BadArgument();
4160 return NULL;
4161 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004162 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4163 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004164 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165}
4166
4167/* --- Raw Unicode Escape Codec ------------------------------------------- */
4168
Alexander Belopolsky40018472011-02-26 01:02:56 +00004169PyObject *
4170PyUnicode_DecodeRawUnicodeEscape(const char *s,
4171 Py_ssize_t size,
4172 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004175 Py_ssize_t startinpos;
4176 Py_ssize_t endinpos;
4177 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 const char *end;
4181 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 PyObject *errorHandler = NULL;
4183 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004184
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 /* Escaped strings will always be longer than the resulting
4186 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 length after conversion to the true value. (But decoding error
4188 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 v = _PyUnicode_New(size);
4190 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004193 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 end = s + size;
4196 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 unsigned char c;
4198 Py_UCS4 x;
4199 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004200 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 /* Non-escape characters are interpreted as Unicode ordinals */
4203 if (*s != '\\') {
4204 *p++ = (unsigned char)*s++;
4205 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004206 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 startinpos = s-starts;
4208
4209 /* \u-escapes are only interpreted iff the number of leading
4210 backslashes if odd */
4211 bs = s;
4212 for (;s < end;) {
4213 if (*s != '\\')
4214 break;
4215 *p++ = (unsigned char)*s++;
4216 }
4217 if (((s - bs) & 1) == 0 ||
4218 s >= end ||
4219 (*s != 'u' && *s != 'U')) {
4220 continue;
4221 }
4222 p--;
4223 count = *s=='u' ? 4 : 8;
4224 s++;
4225
4226 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4227 outpos = p-PyUnicode_AS_UNICODE(v);
4228 for (x = 0, i = 0; i < count; ++i, ++s) {
4229 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004230 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 endinpos = s-starts;
4232 if (unicode_decode_call_errorhandler(
4233 errors, &errorHandler,
4234 "rawunicodeescape", "truncated \\uXXXX",
4235 &starts, &end, &startinpos, &endinpos, &exc, &s,
4236 &v, &outpos, &p))
4237 goto onError;
4238 goto nextByte;
4239 }
4240 x = (x<<4) & ~0xF;
4241 if (c >= '0' && c <= '9')
4242 x += c - '0';
4243 else if (c >= 'a' && c <= 'f')
4244 x += 10 + c - 'a';
4245 else
4246 x += 10 + c - 'A';
4247 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004248 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 /* UCS-2 character */
4250 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004251 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 /* UCS-4 character. Either store directly, or as
4253 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004254#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004256#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 x -= 0x10000L;
4258 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4259 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004260#endif
4261 } else {
4262 endinpos = s-starts;
4263 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004264 if (unicode_decode_call_errorhandler(
4265 errors, &errorHandler,
4266 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 &starts, &end, &startinpos, &endinpos, &exc, &s,
4268 &v, &outpos, &p))
4269 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004270 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 nextByte:
4272 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004274 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004276 Py_XDECREF(errorHandler);
4277 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004279
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 Py_XDECREF(errorHandler);
4283 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 return NULL;
4285}
4286
Alexander Belopolsky40018472011-02-26 01:02:56 +00004287PyObject *
4288PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4289 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004291 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 char *p;
4293 char *q;
4294
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004295#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004296 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004297#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004298 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004299#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004300
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004301 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004303
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004304 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 if (repr == NULL)
4306 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004307 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004308 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004310 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 while (size-- > 0) {
4312 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004313#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 /* Map 32-bit characters to '\Uxxxxxxxx' */
4315 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004316 *p++ = '\\';
4317 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004318 *p++ = hexdigits[(ch >> 28) & 0xf];
4319 *p++ = hexdigits[(ch >> 24) & 0xf];
4320 *p++ = hexdigits[(ch >> 20) & 0xf];
4321 *p++ = hexdigits[(ch >> 16) & 0xf];
4322 *p++ = hexdigits[(ch >> 12) & 0xf];
4323 *p++ = hexdigits[(ch >> 8) & 0xf];
4324 *p++ = hexdigits[(ch >> 4) & 0xf];
4325 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004326 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004327 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004328#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4330 if (ch >= 0xD800 && ch < 0xDC00) {
4331 Py_UNICODE ch2;
4332 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004333
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 ch2 = *s++;
4335 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004336 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004337 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4338 *p++ = '\\';
4339 *p++ = 'U';
4340 *p++ = hexdigits[(ucs >> 28) & 0xf];
4341 *p++ = hexdigits[(ucs >> 24) & 0xf];
4342 *p++ = hexdigits[(ucs >> 20) & 0xf];
4343 *p++ = hexdigits[(ucs >> 16) & 0xf];
4344 *p++ = hexdigits[(ucs >> 12) & 0xf];
4345 *p++ = hexdigits[(ucs >> 8) & 0xf];
4346 *p++ = hexdigits[(ucs >> 4) & 0xf];
4347 *p++ = hexdigits[ucs & 0xf];
4348 continue;
4349 }
4350 /* Fall through: isolated surrogates are copied as-is */
4351 s--;
4352 size++;
4353 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004354#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004355 /* Map 16-bit characters to '\uxxxx' */
4356 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 *p++ = '\\';
4358 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004359 *p++ = hexdigits[(ch >> 12) & 0xf];
4360 *p++ = hexdigits[(ch >> 8) & 0xf];
4361 *p++ = hexdigits[(ch >> 4) & 0xf];
4362 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004364 /* Copy everything else as-is */
4365 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366 *p++ = (char) ch;
4367 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004368 size = p - q;
4369
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004370 assert(size > 0);
4371 if (_PyBytes_Resize(&repr, size) < 0)
4372 return NULL;
4373 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374}
4375
Alexander Belopolsky40018472011-02-26 01:02:56 +00004376PyObject *
4377PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004379 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004381 PyErr_BadArgument();
4382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004384 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4385 PyUnicode_GET_SIZE(unicode));
4386
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004387 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388}
4389
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004390/* --- Unicode Internal Codec ------------------------------------------- */
4391
Alexander Belopolsky40018472011-02-26 01:02:56 +00004392PyObject *
4393_PyUnicode_DecodeUnicodeInternal(const char *s,
4394 Py_ssize_t size,
4395 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004396{
4397 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004398 Py_ssize_t startinpos;
4399 Py_ssize_t endinpos;
4400 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004401 PyUnicodeObject *v;
4402 Py_UNICODE *p;
4403 const char *end;
4404 const char *reason;
4405 PyObject *errorHandler = NULL;
4406 PyObject *exc = NULL;
4407
Neal Norwitzd43069c2006-01-08 01:12:10 +00004408#ifdef Py_UNICODE_WIDE
4409 Py_UNICODE unimax = PyUnicode_GetMax();
4410#endif
4411
Thomas Wouters89f507f2006-12-13 04:49:30 +00004412 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004413 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4414 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004415 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004416 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004418 p = PyUnicode_AS_UNICODE(v);
4419 end = s + size;
4420
4421 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004422 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004423 /* We have to sanity check the raw data, otherwise doom looms for
4424 some malformed UCS-4 data. */
4425 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004426#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004427 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004428#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004429 end-s < Py_UNICODE_SIZE
4430 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004432 startinpos = s - starts;
4433 if (end-s < Py_UNICODE_SIZE) {
4434 endinpos = end-starts;
4435 reason = "truncated input";
4436 }
4437 else {
4438 endinpos = s - starts + Py_UNICODE_SIZE;
4439 reason = "illegal code point (> 0x10FFFF)";
4440 }
4441 outpos = p - PyUnicode_AS_UNICODE(v);
4442 if (unicode_decode_call_errorhandler(
4443 errors, &errorHandler,
4444 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004445 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004446 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004447 goto onError;
4448 }
4449 }
4450 else {
4451 p++;
4452 s += Py_UNICODE_SIZE;
4453 }
4454 }
4455
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004456 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004457 goto onError;
4458 Py_XDECREF(errorHandler);
4459 Py_XDECREF(exc);
4460 return (PyObject *)v;
4461
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004463 Py_XDECREF(v);
4464 Py_XDECREF(errorHandler);
4465 Py_XDECREF(exc);
4466 return NULL;
4467}
4468
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469/* --- Latin-1 Codec ------------------------------------------------------ */
4470
Alexander Belopolsky40018472011-02-26 01:02:56 +00004471PyObject *
4472PyUnicode_DecodeLatin1(const char *s,
4473 Py_ssize_t size,
4474 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475{
4476 PyUnicodeObject *v;
4477 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004478 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004479
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004481 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 Py_UNICODE r = *(unsigned char*)s;
4483 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004484 }
4485
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 v = _PyUnicode_New(size);
4487 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004492 e = s + size;
4493 /* Unrolling the copy makes it much faster by reducing the looping
4494 overhead. This is similar to what many memcpy() implementations do. */
4495 unrolled_end = e - 4;
4496 while (s < unrolled_end) {
4497 p[0] = (unsigned char) s[0];
4498 p[1] = (unsigned char) s[1];
4499 p[2] = (unsigned char) s[2];
4500 p[3] = (unsigned char) s[3];
4501 s += 4;
4502 p += 4;
4503 }
4504 while (s < e)
4505 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004507
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 Py_XDECREF(v);
4510 return NULL;
4511}
4512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004514static void
4515make_encode_exception(PyObject **exceptionObject,
4516 const char *encoding,
4517 const Py_UNICODE *unicode, Py_ssize_t size,
4518 Py_ssize_t startpos, Py_ssize_t endpos,
4519 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 *exceptionObject = PyUnicodeEncodeError_Create(
4523 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 }
4525 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4527 goto onError;
4528 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4529 goto onError;
4530 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4531 goto onError;
4532 return;
4533 onError:
4534 Py_DECREF(*exceptionObject);
4535 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 }
4537}
4538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004540static void
4541raise_encode_exception(PyObject **exceptionObject,
4542 const char *encoding,
4543 const Py_UNICODE *unicode, Py_ssize_t size,
4544 Py_ssize_t startpos, Py_ssize_t endpos,
4545 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546{
4547 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551}
4552
4553/* error handling callback helper:
4554 build arguments, call the callback and check the arguments,
4555 put the result into newpos and return the replacement string, which
4556 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004557static PyObject *
4558unicode_encode_call_errorhandler(const char *errors,
4559 PyObject **errorHandler,
4560 const char *encoding, const char *reason,
4561 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4562 Py_ssize_t startpos, Py_ssize_t endpos,
4563 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004565 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566
4567 PyObject *restuple;
4568 PyObject *resunicode;
4569
4570 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004571 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004573 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 }
4575
4576 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580
4581 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004586 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 Py_DECREF(restuple);
4588 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004590 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 &resunicode, newpos)) {
4592 Py_DECREF(restuple);
4593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004595 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4596 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4597 Py_DECREF(restuple);
4598 return NULL;
4599 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004602 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004603 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4604 Py_DECREF(restuple);
4605 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004606 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 Py_INCREF(resunicode);
4608 Py_DECREF(restuple);
4609 return resunicode;
4610}
4611
Alexander Belopolsky40018472011-02-26 01:02:56 +00004612static PyObject *
4613unicode_encode_ucs1(const Py_UNICODE *p,
4614 Py_ssize_t size,
4615 const char *errors,
4616 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617{
4618 /* output object */
4619 PyObject *res;
4620 /* pointers to the beginning and end+1 of input */
4621 const Py_UNICODE *startp = p;
4622 const Py_UNICODE *endp = p + size;
4623 /* pointer to the beginning of the unencodable characters */
4624 /* const Py_UNICODE *badp = NULL; */
4625 /* pointer into the output */
4626 char *str;
4627 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004628 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004629 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4630 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631 PyObject *errorHandler = NULL;
4632 PyObject *exc = NULL;
4633 /* the following variable is used for caching string comparisons
4634 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4635 int known_errorHandler = -1;
4636
4637 /* allocate enough for a simple encoding without
4638 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004639 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004640 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004641 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004643 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004644 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 ressize = size;
4646
4647 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 /* can we encode this? */
4651 if (c<limit) {
4652 /* no overflow check, because we know that the space is enough */
4653 *str++ = (char)c;
4654 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004655 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 else {
4657 Py_ssize_t unicodepos = p-startp;
4658 Py_ssize_t requiredsize;
4659 PyObject *repunicode;
4660 Py_ssize_t repsize;
4661 Py_ssize_t newpos;
4662 Py_ssize_t respos;
4663 Py_UNICODE *uni2;
4664 /* startpos for collecting unencodable chars */
4665 const Py_UNICODE *collstart = p;
4666 const Py_UNICODE *collend = p;
4667 /* find all unecodable characters */
4668 while ((collend < endp) && ((*collend)>=limit))
4669 ++collend;
4670 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4671 if (known_errorHandler==-1) {
4672 if ((errors==NULL) || (!strcmp(errors, "strict")))
4673 known_errorHandler = 1;
4674 else if (!strcmp(errors, "replace"))
4675 known_errorHandler = 2;
4676 else if (!strcmp(errors, "ignore"))
4677 known_errorHandler = 3;
4678 else if (!strcmp(errors, "xmlcharrefreplace"))
4679 known_errorHandler = 4;
4680 else
4681 known_errorHandler = 0;
4682 }
4683 switch (known_errorHandler) {
4684 case 1: /* strict */
4685 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4686 goto onError;
4687 case 2: /* replace */
4688 while (collstart++<collend)
4689 *str++ = '?'; /* fall through */
4690 case 3: /* ignore */
4691 p = collend;
4692 break;
4693 case 4: /* xmlcharrefreplace */
4694 respos = str - PyBytes_AS_STRING(res);
4695 /* determine replacement size (temporarily (mis)uses p) */
4696 for (p = collstart, repsize = 0; p < collend; ++p) {
4697 if (*p<10)
4698 repsize += 2+1+1;
4699 else if (*p<100)
4700 repsize += 2+2+1;
4701 else if (*p<1000)
4702 repsize += 2+3+1;
4703 else if (*p<10000)
4704 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004705#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 else
4707 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004708#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 else if (*p<100000)
4710 repsize += 2+5+1;
4711 else if (*p<1000000)
4712 repsize += 2+6+1;
4713 else
4714 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004715#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 }
4717 requiredsize = respos+repsize+(endp-collend);
4718 if (requiredsize > ressize) {
4719 if (requiredsize<2*ressize)
4720 requiredsize = 2*ressize;
4721 if (_PyBytes_Resize(&res, requiredsize))
4722 goto onError;
4723 str = PyBytes_AS_STRING(res) + respos;
4724 ressize = requiredsize;
4725 }
4726 /* generate replacement (temporarily (mis)uses p) */
4727 for (p = collstart; p < collend; ++p) {
4728 str += sprintf(str, "&#%d;", (int)*p);
4729 }
4730 p = collend;
4731 break;
4732 default:
4733 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4734 encoding, reason, startp, size, &exc,
4735 collstart-startp, collend-startp, &newpos);
4736 if (repunicode == NULL)
4737 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004738 if (PyBytes_Check(repunicode)) {
4739 /* Directly copy bytes result to output. */
4740 repsize = PyBytes_Size(repunicode);
4741 if (repsize > 1) {
4742 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004743 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004744 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4745 Py_DECREF(repunicode);
4746 goto onError;
4747 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004748 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004749 ressize += repsize-1;
4750 }
4751 memcpy(str, PyBytes_AsString(repunicode), repsize);
4752 str += repsize;
4753 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004754 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004755 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004756 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 /* need more space? (at least enough for what we
4758 have+the replacement+the rest of the string, so
4759 we won't have to check space for encodable characters) */
4760 respos = str - PyBytes_AS_STRING(res);
4761 repsize = PyUnicode_GET_SIZE(repunicode);
4762 requiredsize = respos+repsize+(endp-collend);
4763 if (requiredsize > ressize) {
4764 if (requiredsize<2*ressize)
4765 requiredsize = 2*ressize;
4766 if (_PyBytes_Resize(&res, requiredsize)) {
4767 Py_DECREF(repunicode);
4768 goto onError;
4769 }
4770 str = PyBytes_AS_STRING(res) + respos;
4771 ressize = requiredsize;
4772 }
4773 /* check if there is anything unencodable in the replacement
4774 and copy it to the output */
4775 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4776 c = *uni2;
4777 if (c >= limit) {
4778 raise_encode_exception(&exc, encoding, startp, size,
4779 unicodepos, unicodepos+1, reason);
4780 Py_DECREF(repunicode);
4781 goto onError;
4782 }
4783 *str = (char)c;
4784 }
4785 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004786 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004787 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004788 }
4789 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004790 /* Resize if we allocated to much */
4791 size = str - PyBytes_AS_STRING(res);
4792 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004793 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004794 if (_PyBytes_Resize(&res, size) < 0)
4795 goto onError;
4796 }
4797
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 Py_XDECREF(errorHandler);
4799 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004800 return res;
4801
4802 onError:
4803 Py_XDECREF(res);
4804 Py_XDECREF(errorHandler);
4805 Py_XDECREF(exc);
4806 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807}
4808
Alexander Belopolsky40018472011-02-26 01:02:56 +00004809PyObject *
4810PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4811 Py_ssize_t size,
4812 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815}
4816
Alexander Belopolsky40018472011-02-26 01:02:56 +00004817PyObject *
4818PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819{
4820 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 PyErr_BadArgument();
4822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 }
4824 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 PyUnicode_GET_SIZE(unicode),
4826 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827}
4828
4829/* --- 7-bit ASCII Codec -------------------------------------------------- */
4830
Alexander Belopolsky40018472011-02-26 01:02:56 +00004831PyObject *
4832PyUnicode_DecodeASCII(const char *s,
4833 Py_ssize_t size,
4834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004836 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 PyUnicodeObject *v;
4838 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004839 Py_ssize_t startinpos;
4840 Py_ssize_t endinpos;
4841 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 const char *e;
4843 PyObject *errorHandler = NULL;
4844 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004847 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 Py_UNICODE r = *(unsigned char*)s;
4849 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004850 }
Tim Petersced69f82003-09-16 20:30:58 +00004851
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 v = _PyUnicode_New(size);
4853 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004856 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 e = s + size;
4859 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 register unsigned char c = (unsigned char)*s;
4861 if (c < 128) {
4862 *p++ = c;
4863 ++s;
4864 }
4865 else {
4866 startinpos = s-starts;
4867 endinpos = startinpos + 1;
4868 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4869 if (unicode_decode_call_errorhandler(
4870 errors, &errorHandler,
4871 "ascii", "ordinal not in range(128)",
4872 &starts, &e, &startinpos, &endinpos, &exc, &s,
4873 &v, &outpos, &p))
4874 goto onError;
4875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004877 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004878 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4879 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004880 Py_XDECREF(errorHandler);
4881 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004883
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004886 Py_XDECREF(errorHandler);
4887 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 return NULL;
4889}
4890
Alexander Belopolsky40018472011-02-26 01:02:56 +00004891PyObject *
4892PyUnicode_EncodeASCII(const Py_UNICODE *p,
4893 Py_ssize_t size,
4894 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897}
4898
Alexander Belopolsky40018472011-02-26 01:02:56 +00004899PyObject *
4900PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901{
4902 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 PyErr_BadArgument();
4904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 }
4906 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 PyUnicode_GET_SIZE(unicode),
4908 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909}
4910
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004911#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004912
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004913/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004914
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004915#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004916#define NEED_RETRY
4917#endif
4918
4919/* XXX This code is limited to "true" double-byte encodings, as
4920 a) it assumes an incomplete character consists of a single byte, and
4921 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004923
/* Return 1 if the byte at s[offset] is a DBCS lead byte that starts a
   character, 0 otherwise.  A byte that IsDBCSLeadByte() accepts counts
   only if CharPrev() cannot step back from it, or the previous character
   does not itself begin with a lead byte, or the previous character is
   two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4935
4936/*
4937 * Decode MBCS string into unicode object. If 'final' is set, converts
4938 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4939 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004940static int
4941decode_mbcs(PyUnicodeObject **v,
4942 const char *s, /* MBCS string */
4943 int size, /* sizeof MBCS string */
4944 int final,
4945 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004946{
4947 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004948 Py_ssize_t n;
4949 DWORD usize;
4950 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004951
4952 assert(size >= 0);
4953
Victor Stinner554f3f02010-06-16 23:33:54 +00004954 /* check and handle 'errors' arg */
4955 if (errors==NULL || strcmp(errors, "strict")==0)
4956 flags = MB_ERR_INVALID_CHARS;
4957 else if (strcmp(errors, "ignore")==0)
4958 flags = 0;
4959 else {
4960 PyErr_Format(PyExc_ValueError,
4961 "mbcs encoding does not support errors='%s'",
4962 errors);
4963 return -1;
4964 }
4965
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004966 /* Skip trailing lead-byte unless 'final' is set */
4967 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004968 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004969
4970 /* First get the size of the result */
4971 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004972 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4973 if (usize==0)
4974 goto mbcs_decode_error;
4975 } else
4976 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004977
4978 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 /* Create unicode object */
4980 *v = _PyUnicode_New(usize);
4981 if (*v == NULL)
4982 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004983 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004984 }
4985 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 /* Extend unicode object */
4987 n = PyUnicode_GET_SIZE(*v);
4988 if (_PyUnicode_Resize(v, n + usize) < 0)
4989 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004990 }
4991
4992 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004993 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004995 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4996 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004998 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004999 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005000
5001mbcs_decode_error:
5002 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5003 we raise a UnicodeDecodeError - else it is a 'generic'
5004 windows error
5005 */
5006 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5007 /* Ideally, we should get reason from FormatMessage - this
5008 is the Windows 2000 English version of the message
5009 */
5010 PyObject *exc = NULL;
5011 const char *reason = "No mapping for the Unicode character exists "
5012 "in the target multi-byte code page.";
5013 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5014 if (exc != NULL) {
5015 PyCodec_StrictErrors(exc);
5016 Py_DECREF(exc);
5017 }
5018 } else {
5019 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5020 }
5021 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005022}
5023
Alexander Belopolsky40018472011-02-26 01:02:56 +00005024PyObject *
5025PyUnicode_DecodeMBCSStateful(const char *s,
5026 Py_ssize_t size,
5027 const char *errors,
5028 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005029{
5030 PyUnicodeObject *v = NULL;
5031 int done;
5032
5033 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005035
5036#ifdef NEED_RETRY
5037 retry:
5038 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005039 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005040 else
5041#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005042 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005043
5044 if (done < 0) {
5045 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005047 }
5048
5049 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005051
5052#ifdef NEED_RETRY
5053 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 s += done;
5055 size -= done;
5056 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005057 }
5058#endif
5059
5060 return (PyObject *)v;
5061}
5062
Alexander Belopolsky40018472011-02-26 01:02:56 +00005063PyObject *
5064PyUnicode_DecodeMBCS(const char *s,
5065 Py_ssize_t size,
5066 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005067{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005068 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5069}
5070
5071/*
5072 * Convert unicode into string object (MBCS).
5073 * Returns 0 if succeed, -1 otherwise.
5074 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005075static int
5076encode_mbcs(PyObject **repr,
5077 const Py_UNICODE *p, /* unicode */
5078 int size, /* size of unicode */
5079 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005080{
Victor Stinner554f3f02010-06-16 23:33:54 +00005081 BOOL usedDefaultChar = FALSE;
5082 BOOL *pusedDefaultChar;
5083 int mbcssize;
5084 Py_ssize_t n;
5085 PyObject *exc = NULL;
5086 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005087
5088 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005089
Victor Stinner554f3f02010-06-16 23:33:54 +00005090 /* check and handle 'errors' arg */
5091 if (errors==NULL || strcmp(errors, "strict")==0) {
5092 flags = WC_NO_BEST_FIT_CHARS;
5093 pusedDefaultChar = &usedDefaultChar;
5094 } else if (strcmp(errors, "replace")==0) {
5095 flags = 0;
5096 pusedDefaultChar = NULL;
5097 } else {
5098 PyErr_Format(PyExc_ValueError,
5099 "mbcs encoding does not support errors='%s'",
5100 errors);
5101 return -1;
5102 }
5103
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005104 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005105 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005106 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5107 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 if (mbcssize == 0) {
5109 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5110 return -1;
5111 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005112 /* If we used a default char, then we failed! */
5113 if (pusedDefaultChar && *pusedDefaultChar)
5114 goto mbcs_encode_error;
5115 } else {
5116 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005117 }
5118
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005119 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 /* Create string object */
5121 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5122 if (*repr == NULL)
5123 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005124 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005125 }
5126 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 /* Extend string object */
5128 n = PyBytes_Size(*repr);
5129 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5130 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005131 }
5132
5133 /* Do the conversion */
5134 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005136 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5137 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5139 return -1;
5140 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005141 if (pusedDefaultChar && *pusedDefaultChar)
5142 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005143 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005144 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005145
5146mbcs_encode_error:
5147 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5148 Py_XDECREF(exc);
5149 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005150}
5151
Alexander Belopolsky40018472011-02-26 01:02:56 +00005152PyObject *
5153PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5154 Py_ssize_t size,
5155 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005156{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005157 PyObject *repr = NULL;
5158 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005159
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005160#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005162 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005163 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005164 else
5165#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005166 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005167
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005168 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005169 Py_XDECREF(repr);
5170 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005171 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005172
5173#ifdef NEED_RETRY
5174 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 p += INT_MAX;
5176 size -= INT_MAX;
5177 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005178 }
5179#endif
5180
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005181 return repr;
5182}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005183
Alexander Belopolsky40018472011-02-26 01:02:56 +00005184PyObject *
5185PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005186{
5187 if (!PyUnicode_Check(unicode)) {
5188 PyErr_BadArgument();
5189 return NULL;
5190 }
5191 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 PyUnicode_GET_SIZE(unicode),
5193 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005194}
5195
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005196#undef NEED_RETRY
5197
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005198#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005199
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200/* --- Character Mapping Codec -------------------------------------------- */
5201
Alexander Belopolsky40018472011-02-26 01:02:56 +00005202PyObject *
5203PyUnicode_DecodeCharmap(const char *s,
5204 Py_ssize_t size,
5205 PyObject *mapping,
5206 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005208 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005209 Py_ssize_t startinpos;
5210 Py_ssize_t endinpos;
5211 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005212 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 PyUnicodeObject *v;
5214 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005215 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005216 PyObject *errorHandler = NULL;
5217 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005218 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005219 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005220
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 /* Default to Latin-1 */
5222 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224
5225 v = _PyUnicode_New(size);
5226 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005232 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 mapstring = PyUnicode_AS_UNICODE(mapping);
5234 maplen = PyUnicode_GET_SIZE(mapping);
5235 while (s < e) {
5236 unsigned char ch = *s;
5237 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 if (ch < maplen)
5240 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 if (x == 0xfffe) {
5243 /* undefined mapping */
5244 outpos = p-PyUnicode_AS_UNICODE(v);
5245 startinpos = s-starts;
5246 endinpos = startinpos+1;
5247 if (unicode_decode_call_errorhandler(
5248 errors, &errorHandler,
5249 "charmap", "character maps to <undefined>",
5250 &starts, &e, &startinpos, &endinpos, &exc, &s,
5251 &v, &outpos, &p)) {
5252 goto onError;
5253 }
5254 continue;
5255 }
5256 *p++ = x;
5257 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005258 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005259 }
5260 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 while (s < e) {
5262 unsigned char ch = *s;
5263 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005264
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5266 w = PyLong_FromLong((long)ch);
5267 if (w == NULL)
5268 goto onError;
5269 x = PyObject_GetItem(mapping, w);
5270 Py_DECREF(w);
5271 if (x == NULL) {
5272 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5273 /* No mapping found means: mapping is undefined. */
5274 PyErr_Clear();
5275 x = Py_None;
5276 Py_INCREF(x);
5277 } else
5278 goto onError;
5279 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005280
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 /* Apply mapping */
5282 if (PyLong_Check(x)) {
5283 long value = PyLong_AS_LONG(x);
5284 if (value < 0 || value > 65535) {
5285 PyErr_SetString(PyExc_TypeError,
5286 "character mapping must be in range(65536)");
5287 Py_DECREF(x);
5288 goto onError;
5289 }
5290 *p++ = (Py_UNICODE)value;
5291 }
5292 else if (x == Py_None) {
5293 /* undefined mapping */
5294 outpos = p-PyUnicode_AS_UNICODE(v);
5295 startinpos = s-starts;
5296 endinpos = startinpos+1;
5297 if (unicode_decode_call_errorhandler(
5298 errors, &errorHandler,
5299 "charmap", "character maps to <undefined>",
5300 &starts, &e, &startinpos, &endinpos, &exc, &s,
5301 &v, &outpos, &p)) {
5302 Py_DECREF(x);
5303 goto onError;
5304 }
5305 Py_DECREF(x);
5306 continue;
5307 }
5308 else if (PyUnicode_Check(x)) {
5309 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005310
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 if (targetsize == 1)
5312 /* 1-1 mapping */
5313 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005314
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 else if (targetsize > 1) {
5316 /* 1-n mapping */
5317 if (targetsize > extrachars) {
5318 /* resize first */
5319 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5320 Py_ssize_t needed = (targetsize - extrachars) + \
5321 (targetsize << 2);
5322 extrachars += needed;
5323 /* XXX overflow detection missing */
5324 if (_PyUnicode_Resize(&v,
5325 PyUnicode_GET_SIZE(v) + needed) < 0) {
5326 Py_DECREF(x);
5327 goto onError;
5328 }
5329 p = PyUnicode_AS_UNICODE(v) + oldpos;
5330 }
5331 Py_UNICODE_COPY(p,
5332 PyUnicode_AS_UNICODE(x),
5333 targetsize);
5334 p += targetsize;
5335 extrachars -= targetsize;
5336 }
5337 /* 1-0 mapping: skip the character */
5338 }
5339 else {
5340 /* wrong return value */
5341 PyErr_SetString(PyExc_TypeError,
5342 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005343 Py_DECREF(x);
5344 goto onError;
5345 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 Py_DECREF(x);
5347 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 }
5350 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5352 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 Py_XDECREF(errorHandler);
5354 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005356
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005358 Py_XDECREF(errorHandler);
5359 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 Py_XDECREF(v);
5361 return NULL;
5362}
5363
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005364/* Charmap encoding: the lookup table */
5365
Alexander Belopolsky40018472011-02-26 01:02:56 +00005366struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 PyObject_HEAD
5368 unsigned char level1[32];
5369 int count2, count3;
5370 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005371};
5372
5373static PyObject*
5374encoding_map_size(PyObject *obj, PyObject* args)
5375{
5376 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005377 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005379}
5380
5381static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005382 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 PyDoc_STR("Return the size (in bytes) of this object") },
5384 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005385};
5386
5387static void
5388encoding_map_dealloc(PyObject* o)
5389{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005390 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005391}
5392
5393static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005394 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 "EncodingMap", /*tp_name*/
5396 sizeof(struct encoding_map), /*tp_basicsize*/
5397 0, /*tp_itemsize*/
5398 /* methods */
5399 encoding_map_dealloc, /*tp_dealloc*/
5400 0, /*tp_print*/
5401 0, /*tp_getattr*/
5402 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005403 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 0, /*tp_repr*/
5405 0, /*tp_as_number*/
5406 0, /*tp_as_sequence*/
5407 0, /*tp_as_mapping*/
5408 0, /*tp_hash*/
5409 0, /*tp_call*/
5410 0, /*tp_str*/
5411 0, /*tp_getattro*/
5412 0, /*tp_setattro*/
5413 0, /*tp_as_buffer*/
5414 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5415 0, /*tp_doc*/
5416 0, /*tp_traverse*/
5417 0, /*tp_clear*/
5418 0, /*tp_richcompare*/
5419 0, /*tp_weaklistoffset*/
5420 0, /*tp_iter*/
5421 0, /*tp_iternext*/
5422 encoding_map_methods, /*tp_methods*/
5423 0, /*tp_members*/
5424 0, /*tp_getset*/
5425 0, /*tp_base*/
5426 0, /*tp_dict*/
5427 0, /*tp_descr_get*/
5428 0, /*tp_descr_set*/
5429 0, /*tp_dictoffset*/
5430 0, /*tp_init*/
5431 0, /*tp_alloc*/
5432 0, /*tp_new*/
5433 0, /*tp_free*/
5434 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005435};
5436
5437PyObject*
5438PyUnicode_BuildEncodingMap(PyObject* string)
5439{
5440 Py_UNICODE *decode;
5441 PyObject *result;
5442 struct encoding_map *mresult;
5443 int i;
5444 int need_dict = 0;
5445 unsigned char level1[32];
5446 unsigned char level2[512];
5447 unsigned char *mlevel1, *mlevel2, *mlevel3;
5448 int count2 = 0, count3 = 0;
5449
5450 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5451 PyErr_BadArgument();
5452 return NULL;
5453 }
5454 decode = PyUnicode_AS_UNICODE(string);
5455 memset(level1, 0xFF, sizeof level1);
5456 memset(level2, 0xFF, sizeof level2);
5457
5458 /* If there isn't a one-to-one mapping of NULL to \0,
5459 or if there are non-BMP characters, we need to use
5460 a mapping dictionary. */
5461 if (decode[0] != 0)
5462 need_dict = 1;
5463 for (i = 1; i < 256; i++) {
5464 int l1, l2;
5465 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005466#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005467 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005468#endif
5469 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005470 need_dict = 1;
5471 break;
5472 }
5473 if (decode[i] == 0xFFFE)
5474 /* unmapped character */
5475 continue;
5476 l1 = decode[i] >> 11;
5477 l2 = decode[i] >> 7;
5478 if (level1[l1] == 0xFF)
5479 level1[l1] = count2++;
5480 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005481 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005482 }
5483
5484 if (count2 >= 0xFF || count3 >= 0xFF)
5485 need_dict = 1;
5486
5487 if (need_dict) {
5488 PyObject *result = PyDict_New();
5489 PyObject *key, *value;
5490 if (!result)
5491 return NULL;
5492 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005493 key = PyLong_FromLong(decode[i]);
5494 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005495 if (!key || !value)
5496 goto failed1;
5497 if (PyDict_SetItem(result, key, value) == -1)
5498 goto failed1;
5499 Py_DECREF(key);
5500 Py_DECREF(value);
5501 }
5502 return result;
5503 failed1:
5504 Py_XDECREF(key);
5505 Py_XDECREF(value);
5506 Py_DECREF(result);
5507 return NULL;
5508 }
5509
5510 /* Create a three-level trie */
5511 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5512 16*count2 + 128*count3 - 1);
5513 if (!result)
5514 return PyErr_NoMemory();
5515 PyObject_Init(result, &EncodingMapType);
5516 mresult = (struct encoding_map*)result;
5517 mresult->count2 = count2;
5518 mresult->count3 = count3;
5519 mlevel1 = mresult->level1;
5520 mlevel2 = mresult->level23;
5521 mlevel3 = mresult->level23 + 16*count2;
5522 memcpy(mlevel1, level1, 32);
5523 memset(mlevel2, 0xFF, 16*count2);
5524 memset(mlevel3, 0, 128*count3);
5525 count3 = 0;
5526 for (i = 1; i < 256; i++) {
5527 int o1, o2, o3, i2, i3;
5528 if (decode[i] == 0xFFFE)
5529 /* unmapped character */
5530 continue;
5531 o1 = decode[i]>>11;
5532 o2 = (decode[i]>>7) & 0xF;
5533 i2 = 16*mlevel1[o1] + o2;
5534 if (mlevel2[i2] == 0xFF)
5535 mlevel2[i2] = count3++;
5536 o3 = decode[i] & 0x7F;
5537 i3 = 128*mlevel2[i2] + o3;
5538 mlevel3[i3] = i;
5539 }
5540 return result;
5541}
5542
5543static int
5544encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5545{
5546 struct encoding_map *map = (struct encoding_map*)mapping;
5547 int l1 = c>>11;
5548 int l2 = (c>>7) & 0xF;
5549 int l3 = c & 0x7F;
5550 int i;
5551
5552#ifdef Py_UNICODE_WIDE
5553 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005555 }
5556#endif
5557 if (c == 0)
5558 return 0;
5559 /* level 1*/
5560 i = map->level1[l1];
5561 if (i == 0xFF) {
5562 return -1;
5563 }
5564 /* level 2*/
5565 i = map->level23[16*i+l2];
5566 if (i == 0xFF) {
5567 return -1;
5568 }
5569 /* level 3 */
5570 i = map->level23[16*map->count2 + 128*i + l3];
5571 if (i == 0) {
5572 return -1;
5573 }
5574 return i;
5575}
5576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577/* Lookup the character ch in the mapping. If the character
5578 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005579 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005580static PyObject *
5581charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582{
Christian Heimes217cfd12007-12-02 14:31:20 +00005583 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005584 PyObject *x;
5585
5586 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588 x = PyObject_GetItem(mapping, w);
5589 Py_DECREF(w);
5590 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5592 /* No mapping found means: mapping is undefined. */
5593 PyErr_Clear();
5594 x = Py_None;
5595 Py_INCREF(x);
5596 return x;
5597 } else
5598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005600 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005602 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 long value = PyLong_AS_LONG(x);
5604 if (value < 0 || value > 255) {
5605 PyErr_SetString(PyExc_TypeError,
5606 "character mapping must be in range(256)");
5607 Py_DECREF(x);
5608 return NULL;
5609 }
5610 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005612 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 /* wrong return value */
5616 PyErr_Format(PyExc_TypeError,
5617 "character mapping must return integer, bytes or None, not %.400s",
5618 x->ob_type->tp_name);
5619 Py_DECREF(x);
5620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 }
5622}
5623
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005624static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005625charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005626{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005627 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5628 /* exponentially overallocate to minimize reallocations */
5629 if (requiredsize < 2*outsize)
5630 requiredsize = 2*outsize;
5631 if (_PyBytes_Resize(outobj, requiredsize))
5632 return -1;
5633 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005634}
5635
/* Outcome of encoding a single character in charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005640 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 space is available. Return a new reference to the object that
5642 was put in the output buffer, or Py_None, if the mapping was undefined
5643 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005644 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005645static charmapencode_result
5646charmapencode_output(Py_UNICODE c, PyObject *mapping,
5647 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005649 PyObject *rep;
5650 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005651 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005652
Christian Heimes90aa7642007-12-19 02:45:37 +00005653 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005654 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005656 if (res == -1)
5657 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 if (outsize<requiredsize)
5659 if (charmapencode_resize(outobj, outpos, requiredsize))
5660 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005661 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 outstart[(*outpos)++] = (char)res;
5663 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005664 }
5665
5666 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005669 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 Py_DECREF(rep);
5671 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005672 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 if (PyLong_Check(rep)) {
5674 Py_ssize_t requiredsize = *outpos+1;
5675 if (outsize<requiredsize)
5676 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5677 Py_DECREF(rep);
5678 return enc_EXCEPTION;
5679 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005680 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005682 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 else {
5684 const char *repchars = PyBytes_AS_STRING(rep);
5685 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5686 Py_ssize_t requiredsize = *outpos+repsize;
5687 if (outsize<requiredsize)
5688 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5689 Py_DECREF(rep);
5690 return enc_EXCEPTION;
5691 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005692 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 memcpy(outstart + *outpos, repchars, repsize);
5694 *outpos += repsize;
5695 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005697 Py_DECREF(rep);
5698 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699}
5700
5701/* handle an error in PyUnicode_EncodeCharmap
5702 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005703static int
5704charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005705 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005707 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005708 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709{
5710 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005711 Py_ssize_t repsize;
5712 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 Py_UNICODE *uni2;
5714 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005715 Py_ssize_t collstartpos = *inpos;
5716 Py_ssize_t collendpos = *inpos+1;
5717 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718 char *encoding = "charmap";
5719 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005720 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 /* find all unencodable characters */
5723 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005724 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005725 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 int res = encoding_map_lookup(p[collendpos], mapping);
5727 if (res != -1)
5728 break;
5729 ++collendpos;
5730 continue;
5731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005732
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 rep = charmapencode_lookup(p[collendpos], mapping);
5734 if (rep==NULL)
5735 return -1;
5736 else if (rep!=Py_None) {
5737 Py_DECREF(rep);
5738 break;
5739 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005740 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 }
5743 /* cache callback name lookup
5744 * (if not done yet, i.e. it's the first error) */
5745 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 if ((errors==NULL) || (!strcmp(errors, "strict")))
5747 *known_errorHandler = 1;
5748 else if (!strcmp(errors, "replace"))
5749 *known_errorHandler = 2;
5750 else if (!strcmp(errors, "ignore"))
5751 *known_errorHandler = 3;
5752 else if (!strcmp(errors, "xmlcharrefreplace"))
5753 *known_errorHandler = 4;
5754 else
5755 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 }
5757 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005758 case 1: /* strict */
5759 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5760 return -1;
5761 case 2: /* replace */
5762 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 x = charmapencode_output('?', mapping, res, respos);
5764 if (x==enc_EXCEPTION) {
5765 return -1;
5766 }
5767 else if (x==enc_FAILED) {
5768 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5769 return -1;
5770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005771 }
5772 /* fall through */
5773 case 3: /* ignore */
5774 *inpos = collendpos;
5775 break;
5776 case 4: /* xmlcharrefreplace */
5777 /* generate replacement (temporarily (mis)uses p) */
5778 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 char buffer[2+29+1+1];
5780 char *cp;
5781 sprintf(buffer, "&#%d;", (int)p[collpos]);
5782 for (cp = buffer; *cp; ++cp) {
5783 x = charmapencode_output(*cp, mapping, res, respos);
5784 if (x==enc_EXCEPTION)
5785 return -1;
5786 else if (x==enc_FAILED) {
5787 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5788 return -1;
5789 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005790 }
5791 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005792 *inpos = collendpos;
5793 break;
5794 default:
5795 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 encoding, reason, p, size, exceptionObject,
5797 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005798 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005800 if (PyBytes_Check(repunicode)) {
5801 /* Directly copy bytes result to output. */
5802 Py_ssize_t outsize = PyBytes_Size(*res);
5803 Py_ssize_t requiredsize;
5804 repsize = PyBytes_Size(repunicode);
5805 requiredsize = *respos + repsize;
5806 if (requiredsize > outsize)
5807 /* Make room for all additional bytes. */
5808 if (charmapencode_resize(res, respos, requiredsize)) {
5809 Py_DECREF(repunicode);
5810 return -1;
5811 }
5812 memcpy(PyBytes_AsString(*res) + *respos,
5813 PyBytes_AsString(repunicode), repsize);
5814 *respos += repsize;
5815 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005816 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005817 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005818 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005819 /* generate replacement */
5820 repsize = PyUnicode_GET_SIZE(repunicode);
5821 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 x = charmapencode_output(*uni2, mapping, res, respos);
5823 if (x==enc_EXCEPTION) {
5824 return -1;
5825 }
5826 else if (x==enc_FAILED) {
5827 Py_DECREF(repunicode);
5828 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5829 return -1;
5830 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005831 }
5832 *inpos = newpos;
5833 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 }
5835 return 0;
5836}
5837
Alexander Belopolsky40018472011-02-26 01:02:56 +00005838PyObject *
5839PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5840 Py_ssize_t size,
5841 PyObject *mapping,
5842 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005844 /* output object */
5845 PyObject *res = NULL;
5846 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005847 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005849 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 PyObject *errorHandler = NULL;
5851 PyObject *exc = NULL;
5852 /* the following variable is used for caching string comparisons
5853 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5854 * 3=ignore, 4=xmlcharrefreplace */
5855 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
5857 /* Default to Latin-1 */
5858 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 /* allocate enough for a simple encoding without
5862 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005863 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 if (res == NULL)
5865 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005866 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 /* try to encode it */
5871 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5872 if (x==enc_EXCEPTION) /* error */
5873 goto onError;
5874 if (x==enc_FAILED) { /* unencodable character */
5875 if (charmap_encoding_error(p, size, &inpos, mapping,
5876 &exc,
5877 &known_errorHandler, &errorHandler, errors,
5878 &res, &respos)) {
5879 goto onError;
5880 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005881 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 else
5883 /* done with this character => adjust input position */
5884 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005888 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005889 if (_PyBytes_Resize(&res, respos) < 0)
5890 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005891
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005892 Py_XDECREF(exc);
5893 Py_XDECREF(errorHandler);
5894 return res;
5895
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 Py_XDECREF(res);
5898 Py_XDECREF(exc);
5899 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 return NULL;
5901}
5902
Alexander Belopolsky40018472011-02-26 01:02:56 +00005903PyObject *
5904PyUnicode_AsCharmapString(PyObject *unicode,
5905 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906{
5907 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 PyErr_BadArgument();
5909 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 }
5911 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 PyUnicode_GET_SIZE(unicode),
5913 mapping,
5914 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915}
5916
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005918static void
5919make_translate_exception(PyObject **exceptionObject,
5920 const Py_UNICODE *unicode, Py_ssize_t size,
5921 Py_ssize_t startpos, Py_ssize_t endpos,
5922 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005925 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 }
5928 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5930 goto onError;
5931 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5932 goto onError;
5933 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5934 goto onError;
5935 return;
5936 onError:
5937 Py_DECREF(*exceptionObject);
5938 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 }
5940}
5941
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005943static void
5944raise_translate_exception(PyObject **exceptionObject,
5945 const Py_UNICODE *unicode, Py_ssize_t size,
5946 Py_ssize_t startpos, Py_ssize_t endpos,
5947 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005948{
5949 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005951 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005953}
5954
5955/* error handling callback helper:
5956 build arguments, call the callback and check the arguments,
5957 put the result into newpos and return the replacement string, which
5958 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005959static PyObject *
5960unicode_translate_call_errorhandler(const char *errors,
5961 PyObject **errorHandler,
5962 const char *reason,
5963 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5964 Py_ssize_t startpos, Py_ssize_t endpos,
5965 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005967 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005968
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005969 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 PyObject *restuple;
5971 PyObject *resunicode;
5972
5973 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005977 }
5978
5979 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005981 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983
5984 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005986 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005988 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005989 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 Py_DECREF(restuple);
5991 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005992 }
5993 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 &resunicode, &i_newpos)) {
5995 Py_DECREF(restuple);
5996 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005998 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006000 else
6001 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006002 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6004 Py_DECREF(restuple);
6005 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006006 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 Py_INCREF(resunicode);
6008 Py_DECREF(restuple);
6009 return resunicode;
6010}
6011
6012/* Lookup the character ch in the mapping and put the result in result,
6013 which must be decrefed by the caller.
6014 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006015static int
6016charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017{
Christian Heimes217cfd12007-12-02 14:31:20 +00006018 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006019 PyObject *x;
6020
6021 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 x = PyObject_GetItem(mapping, w);
6024 Py_DECREF(w);
6025 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6027 /* No mapping found means: use 1:1 mapping. */
6028 PyErr_Clear();
6029 *result = NULL;
6030 return 0;
6031 } else
6032 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 }
6034 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 *result = x;
6036 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006038 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 long value = PyLong_AS_LONG(x);
6040 long max = PyUnicode_GetMax();
6041 if (value < 0 || value > max) {
6042 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006043 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 Py_DECREF(x);
6045 return -1;
6046 }
6047 *result = x;
6048 return 0;
6049 }
6050 else if (PyUnicode_Check(x)) {
6051 *result = x;
6052 return 0;
6053 }
6054 else {
6055 /* wrong return value */
6056 PyErr_SetString(PyExc_TypeError,
6057 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006058 Py_DECREF(x);
6059 return -1;
6060 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061}
6062/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 if not reallocate and adjust various state variables.
6064 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006065static int
6066charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006069 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006070 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 /* remember old output position */
6072 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6073 /* exponentially overallocate to minimize reallocations */
6074 if (requiredsize < 2 * oldsize)
6075 requiredsize = 2 * oldsize;
6076 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6077 return -1;
6078 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 }
6080 return 0;
6081}
6082/* lookup the character, put the result in the output string and adjust
6083 various state variables. Return a new reference to the object that
6084 was put in the output buffer in *result, or Py_None, if the mapping was
6085 undefined (in which case no character was written).
6086 The called must decref result.
6087 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006088static int
6089charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6090 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6091 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092{
Walter Dörwald4894c302003-10-24 14:25:28 +00006093 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 /* not found => default to 1:1 mapping */
6097 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098 }
6099 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006101 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 /* no overflow check, because we know that the space is enough */
6103 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006104 }
6105 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6107 if (repsize==1) {
6108 /* no overflow check, because we know that the space is enough */
6109 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6110 }
6111 else if (repsize!=0) {
6112 /* more than one character */
6113 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6114 (insize - (curinp-startinp)) +
6115 repsize - 1;
6116 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6117 return -1;
6118 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6119 *outp += repsize;
6120 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 }
6122 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124 return 0;
6125}
6126
Alexander Belopolsky40018472011-02-26 01:02:56 +00006127PyObject *
6128PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6129 Py_ssize_t size,
6130 PyObject *mapping,
6131 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133 /* output object */
6134 PyObject *res = NULL;
6135 /* pointers to the beginning and end+1 of input */
6136 const Py_UNICODE *startp = p;
6137 const Py_UNICODE *endp = p + size;
6138 /* pointer into the output */
6139 Py_UNICODE *str;
6140 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006141 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006142 char *reason = "character maps to <undefined>";
6143 PyObject *errorHandler = NULL;
6144 PyObject *exc = NULL;
6145 /* the following variable is used for caching string comparisons
6146 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6147 * 3=ignore, 4=xmlcharrefreplace */
6148 int known_errorHandler = -1;
6149
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 PyErr_BadArgument();
6152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154
6155 /* allocate enough for a simple 1:1 translation without
6156 replacements, if we need more, we'll resize */
6157 res = PyUnicode_FromUnicode(NULL, size);
6158 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006162 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 /* try to encode it */
6166 PyObject *x = NULL;
6167 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6168 Py_XDECREF(x);
6169 goto onError;
6170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006171 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 if (x!=Py_None) /* it worked => adjust input pointer */
6173 ++p;
6174 else { /* untranslatable character */
6175 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6176 Py_ssize_t repsize;
6177 Py_ssize_t newpos;
6178 Py_UNICODE *uni2;
6179 /* startpos for collecting untranslatable chars */
6180 const Py_UNICODE *collstart = p;
6181 const Py_UNICODE *collend = p+1;
6182 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 /* find all untranslatable characters */
6185 while (collend < endp) {
6186 if (charmaptranslate_lookup(*collend, mapping, &x))
6187 goto onError;
6188 Py_XDECREF(x);
6189 if (x!=Py_None)
6190 break;
6191 ++collend;
6192 }
6193 /* cache callback name lookup
6194 * (if not done yet, i.e. it's the first error) */
6195 if (known_errorHandler==-1) {
6196 if ((errors==NULL) || (!strcmp(errors, "strict")))
6197 known_errorHandler = 1;
6198 else if (!strcmp(errors, "replace"))
6199 known_errorHandler = 2;
6200 else if (!strcmp(errors, "ignore"))
6201 known_errorHandler = 3;
6202 else if (!strcmp(errors, "xmlcharrefreplace"))
6203 known_errorHandler = 4;
6204 else
6205 known_errorHandler = 0;
6206 }
6207 switch (known_errorHandler) {
6208 case 1: /* strict */
6209 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006210 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 case 2: /* replace */
6212 /* No need to check for space, this is a 1:1 replacement */
6213 for (coll = collstart; coll<collend; ++coll)
6214 *str++ = '?';
6215 /* fall through */
6216 case 3: /* ignore */
6217 p = collend;
6218 break;
6219 case 4: /* xmlcharrefreplace */
6220 /* generate replacement (temporarily (mis)uses p) */
6221 for (p = collstart; p < collend; ++p) {
6222 char buffer[2+29+1+1];
6223 char *cp;
6224 sprintf(buffer, "&#%d;", (int)*p);
6225 if (charmaptranslate_makespace(&res, &str,
6226 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6227 goto onError;
6228 for (cp = buffer; *cp; ++cp)
6229 *str++ = *cp;
6230 }
6231 p = collend;
6232 break;
6233 default:
6234 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6235 reason, startp, size, &exc,
6236 collstart-startp, collend-startp, &newpos);
6237 if (repunicode == NULL)
6238 goto onError;
6239 /* generate replacement */
6240 repsize = PyUnicode_GET_SIZE(repunicode);
6241 if (charmaptranslate_makespace(&res, &str,
6242 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6243 Py_DECREF(repunicode);
6244 goto onError;
6245 }
6246 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6247 *str++ = *uni2;
6248 p = startp + newpos;
6249 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006250 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006251 }
6252 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006253 /* Resize if we allocated to much */
6254 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006255 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 if (PyUnicode_Resize(&res, respos) < 0)
6257 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258 }
6259 Py_XDECREF(exc);
6260 Py_XDECREF(errorHandler);
6261 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006264 Py_XDECREF(res);
6265 Py_XDECREF(exc);
6266 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 return NULL;
6268}
6269
Alexander Belopolsky40018472011-02-26 01:02:56 +00006270PyObject *
6271PyUnicode_Translate(PyObject *str,
6272 PyObject *mapping,
6273 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274{
6275 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006276
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 str = PyUnicode_FromObject(str);
6278 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 PyUnicode_GET_SIZE(str),
6282 mapping,
6283 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 Py_DECREF(str);
6285 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006286
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 Py_XDECREF(str);
6289 return NULL;
6290}
Tim Petersced69f82003-09-16 20:30:58 +00006291
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006292PyObject *
6293PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6294 Py_ssize_t length)
6295{
6296 PyObject *result;
6297 Py_UNICODE *p; /* write pointer into result */
6298 Py_ssize_t i;
6299 /* Copy to a new string */
6300 result = (PyObject *)_PyUnicode_New(length);
6301 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6302 if (result == NULL)
6303 return result;
6304 p = PyUnicode_AS_UNICODE(result);
6305 /* Iterate over code points */
6306 for (i = 0; i < length; i++) {
6307 Py_UNICODE ch =s[i];
6308 if (ch > 127) {
6309 int decimal = Py_UNICODE_TODECIMAL(ch);
6310 if (decimal >= 0)
6311 p[i] = '0' + decimal;
6312 }
6313 }
6314 return result;
6315}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006316/* --- Decimal Encoder ---------------------------------------------------- */
6317
Alexander Belopolsky40018472011-02-26 01:02:56 +00006318int
6319PyUnicode_EncodeDecimal(Py_UNICODE *s,
6320 Py_ssize_t length,
6321 char *output,
6322 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006323{
6324 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006325 PyObject *errorHandler = NULL;
6326 PyObject *exc = NULL;
6327 const char *encoding = "decimal";
6328 const char *reason = "invalid decimal Unicode string";
6329 /* the following variable is used for caching string comparisons
6330 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6331 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006332
6333 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 PyErr_BadArgument();
6335 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006336 }
6337
6338 p = s;
6339 end = s + length;
6340 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 register Py_UNICODE ch = *p;
6342 int decimal;
6343 PyObject *repunicode;
6344 Py_ssize_t repsize;
6345 Py_ssize_t newpos;
6346 Py_UNICODE *uni2;
6347 Py_UNICODE *collstart;
6348 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006349
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006351 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 ++p;
6353 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006354 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 decimal = Py_UNICODE_TODECIMAL(ch);
6356 if (decimal >= 0) {
6357 *output++ = '0' + decimal;
6358 ++p;
6359 continue;
6360 }
6361 if (0 < ch && ch < 256) {
6362 *output++ = (char)ch;
6363 ++p;
6364 continue;
6365 }
6366 /* All other characters are considered unencodable */
6367 collstart = p;
6368 collend = p+1;
6369 while (collend < end) {
6370 if ((0 < *collend && *collend < 256) ||
6371 !Py_UNICODE_ISSPACE(*collend) ||
6372 Py_UNICODE_TODECIMAL(*collend))
6373 break;
6374 }
6375 /* cache callback name lookup
6376 * (if not done yet, i.e. it's the first error) */
6377 if (known_errorHandler==-1) {
6378 if ((errors==NULL) || (!strcmp(errors, "strict")))
6379 known_errorHandler = 1;
6380 else if (!strcmp(errors, "replace"))
6381 known_errorHandler = 2;
6382 else if (!strcmp(errors, "ignore"))
6383 known_errorHandler = 3;
6384 else if (!strcmp(errors, "xmlcharrefreplace"))
6385 known_errorHandler = 4;
6386 else
6387 known_errorHandler = 0;
6388 }
6389 switch (known_errorHandler) {
6390 case 1: /* strict */
6391 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6392 goto onError;
6393 case 2: /* replace */
6394 for (p = collstart; p < collend; ++p)
6395 *output++ = '?';
6396 /* fall through */
6397 case 3: /* ignore */
6398 p = collend;
6399 break;
6400 case 4: /* xmlcharrefreplace */
6401 /* generate replacement (temporarily (mis)uses p) */
6402 for (p = collstart; p < collend; ++p)
6403 output += sprintf(output, "&#%d;", (int)*p);
6404 p = collend;
6405 break;
6406 default:
6407 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6408 encoding, reason, s, length, &exc,
6409 collstart-s, collend-s, &newpos);
6410 if (repunicode == NULL)
6411 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006412 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006413 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006414 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6415 Py_DECREF(repunicode);
6416 goto onError;
6417 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 /* generate replacement */
6419 repsize = PyUnicode_GET_SIZE(repunicode);
6420 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6421 Py_UNICODE ch = *uni2;
6422 if (Py_UNICODE_ISSPACE(ch))
6423 *output++ = ' ';
6424 else {
6425 decimal = Py_UNICODE_TODECIMAL(ch);
6426 if (decimal >= 0)
6427 *output++ = '0' + decimal;
6428 else if (0 < ch && ch < 256)
6429 *output++ = (char)ch;
6430 else {
6431 Py_DECREF(repunicode);
6432 raise_encode_exception(&exc, encoding,
6433 s, length, collstart-s, collend-s, reason);
6434 goto onError;
6435 }
6436 }
6437 }
6438 p = s + newpos;
6439 Py_DECREF(repunicode);
6440 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006441 }
6442 /* 0-terminate the output string */
6443 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 Py_XDECREF(exc);
6445 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006446 return 0;
6447
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 Py_XDECREF(exc);
6450 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006451 return -1;
6452}
6453
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454/* --- Helpers ------------------------------------------------------------ */
6455
Eric Smith8c663262007-08-25 02:26:07 +00006456#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006457#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006458
Thomas Wouters477c8d52006-05-27 19:21:47 +00006459#include "stringlib/count.h"
6460#include "stringlib/find.h"
6461#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006462#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006463
Eric Smith5807c412008-05-11 21:00:57 +00006464#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006465#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006466#include "stringlib/localeutil.h"
6467
/* helper macro to fixup start/end slice values: `end` is clamped to
   `len`, negative `start`/`end` are taken relative to `len` and then
   clamped to 0.  `start` and `end` must be modifiable lvalues and are
   evaluated more than once.  The expansion is two consecutive statements
   (not a single one), so do not use it as the unbraced body of an
   if/else or loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006482
Alexander Belopolsky40018472011-02-26 01:02:56 +00006483Py_ssize_t
6484PyUnicode_Count(PyObject *str,
6485 PyObject *substr,
6486 Py_ssize_t start,
6487 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006489 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006490 PyUnicodeObject* str_obj;
6491 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006492
Thomas Wouters477c8d52006-05-27 19:21:47 +00006493 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6494 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006496 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6497 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 Py_DECREF(str_obj);
6499 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 }
Tim Petersced69f82003-09-16 20:30:58 +00006501
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006502 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006503 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006504 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6505 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006506 );
6507
6508 Py_DECREF(sub_obj);
6509 Py_DECREF(str_obj);
6510
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 return result;
6512}
6513
Alexander Belopolsky40018472011-02-26 01:02:56 +00006514Py_ssize_t
6515PyUnicode_Find(PyObject *str,
6516 PyObject *sub,
6517 Py_ssize_t start,
6518 Py_ssize_t end,
6519 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006521 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006522
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006524 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006526 sub = PyUnicode_FromObject(sub);
6527 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 Py_DECREF(str);
6529 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
Tim Petersced69f82003-09-16 20:30:58 +00006531
Thomas Wouters477c8d52006-05-27 19:21:47 +00006532 if (direction > 0)
6533 result = stringlib_find_slice(
6534 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6535 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6536 start, end
6537 );
6538 else
6539 result = stringlib_rfind_slice(
6540 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6541 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6542 start, end
6543 );
6544
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006546 Py_DECREF(sub);
6547
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return result;
6549}
6550
Alexander Belopolsky40018472011-02-26 01:02:56 +00006551static int
6552tailmatch(PyUnicodeObject *self,
6553 PyUnicodeObject *substring,
6554 Py_ssize_t start,
6555 Py_ssize_t end,
6556 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 if (substring->length == 0)
6559 return 1;
6560
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006561 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 end -= substring->length;
6563 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565
6566 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 if (Py_UNICODE_MATCH(self, end, substring))
6568 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 } else {
6570 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 }
6573
6574 return 0;
6575}
6576
Alexander Belopolsky40018472011-02-26 01:02:56 +00006577Py_ssize_t
6578PyUnicode_Tailmatch(PyObject *str,
6579 PyObject *substr,
6580 Py_ssize_t start,
6581 Py_ssize_t end,
6582 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006584 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006585
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 str = PyUnicode_FromObject(str);
6587 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 substr = PyUnicode_FromObject(substr);
6590 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 Py_DECREF(str);
6592 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 }
Tim Petersced69f82003-09-16 20:30:58 +00006594
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 (PyUnicodeObject *)substr,
6597 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 Py_DECREF(str);
6599 Py_DECREF(substr);
6600 return result;
6601}
6602
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603/* Apply fixfct filter to the Unicode object self and return a
6604 reference to the modified object */
6605
Alexander Belopolsky40018472011-02-26 01:02:56 +00006606static PyObject *
6607fixup(PyUnicodeObject *self,
6608 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609{
6610
6611 PyUnicodeObject *u;
6612
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006613 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006616
6617 Py_UNICODE_COPY(u->str, self->str, self->length);
6618
Tim Peters7a29bd52001-09-12 03:03:31 +00006619 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 /* fixfct should return TRUE if it modified the buffer. If
6621 FALSE, return a reference to the original buffer instead
6622 (to save space, not time) */
6623 Py_INCREF(self);
6624 Py_DECREF(u);
6625 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626 }
6627 return (PyObject*) u;
6628}
6629
Alexander Belopolsky40018472011-02-26 01:02:56 +00006630static int
6631fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006633 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 Py_UNICODE *s = self->str;
6635 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006636
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006639
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 ch = Py_UNICODE_TOUPPER(*s);
6641 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 *s = ch;
6644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 s++;
6646 }
6647
6648 return status;
6649}
6650
Alexander Belopolsky40018472011-02-26 01:02:56 +00006651static int
6652fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006654 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 Py_UNICODE *s = self->str;
6656 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006657
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006660
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 ch = Py_UNICODE_TOLOWER(*s);
6662 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 *s = ch;
6665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 s++;
6667 }
6668
6669 return status;
6670}
6671
Alexander Belopolsky40018472011-02-26 01:02:56 +00006672static int
6673fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006675 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 Py_UNICODE *s = self->str;
6677 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 while (len-- > 0) {
6680 if (Py_UNICODE_ISUPPER(*s)) {
6681 *s = Py_UNICODE_TOLOWER(*s);
6682 status = 1;
6683 } else if (Py_UNICODE_ISLOWER(*s)) {
6684 *s = Py_UNICODE_TOUPPER(*s);
6685 status = 1;
6686 }
6687 s++;
6688 }
6689
6690 return status;
6691}
6692
Alexander Belopolsky40018472011-02-26 01:02:56 +00006693static int
6694fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006696 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006697 Py_UNICODE *s = self->str;
6698 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006699
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006700 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006702 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 *s = Py_UNICODE_TOUPPER(*s);
6704 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006706 s++;
6707 while (--len > 0) {
6708 if (Py_UNICODE_ISUPPER(*s)) {
6709 *s = Py_UNICODE_TOLOWER(*s);
6710 status = 1;
6711 }
6712 s++;
6713 }
6714 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715}
6716
Alexander Belopolsky40018472011-02-26 01:02:56 +00006717static int
6718fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
6720 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6721 register Py_UNICODE *e;
6722 int previous_is_cased;
6723
6724 /* Shortcut for single character strings */
6725 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6727 if (*p != ch) {
6728 *p = ch;
6729 return 1;
6730 }
6731 else
6732 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 }
Tim Petersced69f82003-09-16 20:30:58 +00006734
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 e = p + PyUnicode_GET_SIZE(self);
6736 previous_is_cased = 0;
6737 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006739
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 if (previous_is_cased)
6741 *p = Py_UNICODE_TOLOWER(ch);
6742 else
6743 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006744
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 if (Py_UNICODE_ISLOWER(ch) ||
6746 Py_UNICODE_ISUPPER(ch) ||
6747 Py_UNICODE_ISTITLE(ch))
6748 previous_is_cased = 1;
6749 else
6750 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 }
6752 return 1;
6753}
6754
Tim Peters8ce9f162004-08-27 01:49:32 +00006755PyObject *
6756PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757{
Skip Montanaro6543b452004-09-16 03:28:13 +00006758 const Py_UNICODE blank = ' ';
6759 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006760 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006761 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006762 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6763 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006764 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6765 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006766 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006767 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768
Tim Peters05eba1f2004-08-27 21:32:02 +00006769 fseq = PySequence_Fast(seq, "");
6770 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006771 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006772 }
6773
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006774 /* NOTE: the following code can't call back into Python code,
6775 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006776 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006777
Tim Peters05eba1f2004-08-27 21:32:02 +00006778 seqlen = PySequence_Fast_GET_SIZE(fseq);
6779 /* If empty sequence, return u"". */
6780 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006781 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6782 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006783 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006784 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006785 /* If singleton sequence with an exact Unicode, return that. */
6786 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 item = items[0];
6788 if (PyUnicode_CheckExact(item)) {
6789 Py_INCREF(item);
6790 res = (PyUnicodeObject *)item;
6791 goto Done;
6792 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006793 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006794 else {
6795 /* Set up sep and seplen */
6796 if (separator == NULL) {
6797 sep = &blank;
6798 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006799 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006800 else {
6801 if (!PyUnicode_Check(separator)) {
6802 PyErr_Format(PyExc_TypeError,
6803 "separator: expected str instance,"
6804 " %.80s found",
6805 Py_TYPE(separator)->tp_name);
6806 goto onError;
6807 }
6808 sep = PyUnicode_AS_UNICODE(separator);
6809 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006810 }
6811 }
6812
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006813 /* There are at least two things to join, or else we have a subclass
6814 * of str in the sequence.
6815 * Do a pre-pass to figure out the total amount of space we'll
6816 * need (sz), and see whether all argument are strings.
6817 */
6818 sz = 0;
6819 for (i = 0; i < seqlen; i++) {
6820 const Py_ssize_t old_sz = sz;
6821 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 if (!PyUnicode_Check(item)) {
6823 PyErr_Format(PyExc_TypeError,
6824 "sequence item %zd: expected str instance,"
6825 " %.80s found",
6826 i, Py_TYPE(item)->tp_name);
6827 goto onError;
6828 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006829 sz += PyUnicode_GET_SIZE(item);
6830 if (i != 0)
6831 sz += seplen;
6832 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6833 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006835 goto onError;
6836 }
6837 }
Tim Petersced69f82003-09-16 20:30:58 +00006838
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006839 res = _PyUnicode_New(sz);
6840 if (res == NULL)
6841 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006842
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006843 /* Catenate everything. */
6844 res_p = PyUnicode_AS_UNICODE(res);
6845 for (i = 0; i < seqlen; ++i) {
6846 Py_ssize_t itemlen;
6847 item = items[i];
6848 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 /* Copy item, and maybe the separator. */
6850 if (i) {
6851 Py_UNICODE_COPY(res_p, sep, seplen);
6852 res_p += seplen;
6853 }
6854 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6855 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006856 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006857
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006859 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 return (PyObject *)res;
6861
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006863 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006864 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 return NULL;
6866}
6867
Alexander Belopolsky40018472011-02-26 01:02:56 +00006868static PyUnicodeObject *
6869pad(PyUnicodeObject *self,
6870 Py_ssize_t left,
6871 Py_ssize_t right,
6872 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873{
6874 PyUnicodeObject *u;
6875
6876 if (left < 0)
6877 left = 0;
6878 if (right < 0)
6879 right = 0;
6880
Tim Peters7a29bd52001-09-12 03:03:31 +00006881 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 Py_INCREF(self);
6883 return self;
6884 }
6885
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006886 if (left > PY_SSIZE_T_MAX - self->length ||
6887 right > PY_SSIZE_T_MAX - (left + self->length)) {
6888 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6889 return NULL;
6890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 u = _PyUnicode_New(left + self->length + right);
6892 if (u) {
6893 if (left)
6894 Py_UNICODE_FILL(u->str, fill, left);
6895 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6896 if (right)
6897 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6898 }
6899
6900 return u;
6901}
6902
Alexander Belopolsky40018472011-02-26 01:02:56 +00006903PyObject *
6904PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907
6908 string = PyUnicode_FromObject(string);
6909 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006912 list = stringlib_splitlines(
6913 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6914 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
6916 Py_DECREF(string);
6917 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918}
6919
Alexander Belopolsky40018472011-02-26 01:02:56 +00006920static PyObject *
6921split(PyUnicodeObject *self,
6922 PyUnicodeObject *substring,
6923 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006926 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006929 return stringlib_split_whitespace(
6930 (PyObject*) self, self->str, self->length, maxcount
6931 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006933 return stringlib_split(
6934 (PyObject*) self, self->str, self->length,
6935 substring->str, substring->length,
6936 maxcount
6937 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938}
6939
Alexander Belopolsky40018472011-02-26 01:02:56 +00006940static PyObject *
6941rsplit(PyUnicodeObject *self,
6942 PyUnicodeObject *substring,
6943 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006944{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006945 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006946 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006947
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006948 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006949 return stringlib_rsplit_whitespace(
6950 (PyObject*) self, self->str, self->length, maxcount
6951 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006952
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006953 return stringlib_rsplit(
6954 (PyObject*) self, self->str, self->length,
6955 substring->str, substring->length,
6956 maxcount
6957 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006958}
6959
Alexander Belopolsky40018472011-02-26 01:02:56 +00006960static PyObject *
6961replace(PyUnicodeObject *self,
6962 PyUnicodeObject *str1,
6963 PyUnicodeObject *str2,
6964 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965{
6966 PyUnicodeObject *u;
6967
6968 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006970 else if (maxcount == 0 || self->length == 0)
6971 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
Thomas Wouters477c8d52006-05-27 19:21:47 +00006973 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006974 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006975 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006976 if (str1->length == 0)
6977 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006978 if (str1->length == 1) {
6979 /* replace characters */
6980 Py_UNICODE u1, u2;
6981 if (!findchar(self->str, self->length, str1->str[0]))
6982 goto nothing;
6983 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6984 if (!u)
6985 return NULL;
6986 Py_UNICODE_COPY(u->str, self->str, self->length);
6987 u1 = str1->str[0];
6988 u2 = str2->str[0];
6989 for (i = 0; i < u->length; i++)
6990 if (u->str[i] == u1) {
6991 if (--maxcount < 0)
6992 break;
6993 u->str[i] = u2;
6994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006996 i = stringlib_find(
6997 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006999 if (i < 0)
7000 goto nothing;
7001 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7002 if (!u)
7003 return NULL;
7004 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007005
7006 /* change everything in-place, starting with this one */
7007 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7008 i += str1->length;
7009
7010 while ( --maxcount > 0) {
7011 i = stringlib_find(self->str+i, self->length-i,
7012 str1->str, str1->length,
7013 i);
7014 if (i == -1)
7015 break;
7016 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7017 i += str1->length;
7018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007021
Brett Cannonb94767f2011-02-22 20:15:44 +00007022 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007023 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 Py_UNICODE *p;
7025
7026 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007027 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7028 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007029 if (n == 0)
7030 goto nothing;
7031 /* new_size = self->length + n * (str2->length - str1->length)); */
7032 delta = (str2->length - str1->length);
7033 if (delta == 0) {
7034 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007036 product = n * (str2->length - str1->length);
7037 if ((product / (str2->length - str1->length)) != n) {
7038 PyErr_SetString(PyExc_OverflowError,
7039 "replace string is too long");
7040 return NULL;
7041 }
7042 new_size = self->length + product;
7043 if (new_size < 0) {
7044 PyErr_SetString(PyExc_OverflowError,
7045 "replace string is too long");
7046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047 }
7048 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007049 u = _PyUnicode_New(new_size);
7050 if (!u)
7051 return NULL;
7052 i = 0;
7053 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007054 if (str1->length > 0) {
7055 while (n-- > 0) {
7056 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007057 j = stringlib_find(self->str+i, self->length-i,
7058 str1->str, str1->length,
7059 i);
7060 if (j == -1)
7061 break;
7062 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007063 /* copy unchanged part [i:j] */
7064 Py_UNICODE_COPY(p, self->str+i, j-i);
7065 p += j - i;
7066 }
7067 /* copy substitution string */
7068 if (str2->length > 0) {
7069 Py_UNICODE_COPY(p, str2->str, str2->length);
7070 p += str2->length;
7071 }
7072 i = j + str1->length;
7073 }
7074 if (i < self->length)
7075 /* copy tail [i:] */
7076 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7077 } else {
7078 /* interleave */
7079 while (n > 0) {
7080 Py_UNICODE_COPY(p, str2->str, str2->length);
7081 p += str2->length;
7082 if (--n <= 0)
7083 break;
7084 *p++ = self->str[i++];
7085 }
7086 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007090
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007092 /* nothing to replace; return original string (when possible) */
7093 if (PyUnicode_CheckExact(self)) {
7094 Py_INCREF(self);
7095 return (PyObject *) self;
7096 }
7097 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098}
7099
7100/* --- Unicode Object Methods --------------------------------------------- */
7101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007102PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104\n\
7105Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007106characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107
7108static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007109unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111 return fixup(self, fixtitle);
7112}
7113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007114PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116\n\
7117Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007118have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119
7120static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007121unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123 return fixup(self, fixcapitalize);
7124}
7125
#if 0
/* Disabled: not compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, run fixup(..., fixcapitalize) on each word,
   and join the words again with PyUnicode_Join(NULL, ...), i.e. with a
   single space.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
7163
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007164/* Argument converter. Coerces to a single unicode character */
7165
7166static int
7167convert_uc(PyObject *obj, void *addr)
7168{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007169 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7170 PyObject *uniobj;
7171 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007172
Benjamin Peterson14339b62009-01-31 16:36:08 +00007173 uniobj = PyUnicode_FromObject(obj);
7174 if (uniobj == NULL) {
7175 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007177 return 0;
7178 }
7179 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7180 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007182 Py_DECREF(uniobj);
7183 return 0;
7184 }
7185 unistr = PyUnicode_AS_UNICODE(uniobj);
7186 *fillcharloc = unistr[0];
7187 Py_DECREF(uniobj);
7188 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007189}
7190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007191PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007194Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007195done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196
7197static PyObject *
7198unicode_center(PyUnicodeObject *self, PyObject *args)
7199{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007200 Py_ssize_t marg, left;
7201 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007202 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203
Thomas Woutersde017742006-02-16 19:34:37 +00007204 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 return NULL;
7206
Tim Peters7a29bd52001-09-12 03:03:31 +00007207 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 Py_INCREF(self);
7209 return (PyObject*) self;
7210 }
7211
7212 marg = width - self->length;
7213 left = marg / 2 + (marg & width & 1);
7214
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007215 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216}
7217
Marc-André Lemburge5034372000-08-08 08:04:29 +00007218#if 0
7219
7220/* This code should go into some future Unicode collation support
7221 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007222 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007223
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007224/* speedy UTF-16 code point order comparison */
7225/* gleaned from: */
7226/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7227
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007228static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007229{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007230 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007231 0, 0, 0, 0, 0, 0, 0, 0,
7232 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007233 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007234};
7235
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236static int
7237unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7238{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007239 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007240
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 Py_UNICODE *s1 = str1->str;
7242 Py_UNICODE *s2 = str2->str;
7243
7244 len1 = str1->length;
7245 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007246
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007248 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007249
7250 c1 = *s1++;
7251 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007252
Benjamin Peterson29060642009-01-31 22:14:21 +00007253 if (c1 > (1<<11) * 26)
7254 c1 += utf16Fixup[c1>>11];
7255 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007256 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007257 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007258
7259 if (c1 != c2)
7260 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007261
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007262 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 }
7264
7265 return (len1 < len2) ? -1 : (len1 != len2);
7266}
7267
Marc-André Lemburge5034372000-08-08 08:04:29 +00007268#else
7269
7270static int
7271unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7272{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007273 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007274
7275 Py_UNICODE *s1 = str1->str;
7276 Py_UNICODE *s2 = str2->str;
7277
7278 len1 = str1->length;
7279 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007280
Marc-André Lemburge5034372000-08-08 08:04:29 +00007281 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007282 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007283
Fredrik Lundh45714e92001-06-26 16:39:36 +00007284 c1 = *s1++;
7285 c2 = *s2++;
7286
7287 if (c1 != c2)
7288 return (c1 < c2) ? -1 : 1;
7289
Marc-André Lemburge5034372000-08-08 08:04:29 +00007290 len1--; len2--;
7291 }
7292
7293 return (len1 < len2) ? -1 : (len1 != len2);
7294}
7295
7296#endif
7297
Alexander Belopolsky40018472011-02-26 01:02:56 +00007298int
7299PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007301 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7302 return unicode_compare((PyUnicodeObject *)left,
7303 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007304 PyErr_Format(PyExc_TypeError,
7305 "Can't compare %.100s and %.100s",
7306 left->ob_type->tp_name,
7307 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308 return -1;
7309}
7310
Martin v. Löwis5b222132007-06-10 09:51:05 +00007311int
7312PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7313{
7314 int i;
7315 Py_UNICODE *id;
7316 assert(PyUnicode_Check(uni));
7317 id = PyUnicode_AS_UNICODE(uni);
7318 /* Compare Unicode string and source character set string */
7319 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 if (id[i] != str[i])
7321 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007322 /* This check keeps Python strings that end in '\0' from comparing equal
7323 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007324 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007326 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007328 return 0;
7329}
7330
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007331
Benjamin Peterson29060642009-01-31 22:14:21 +00007332#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007333 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007334
Alexander Belopolsky40018472011-02-26 01:02:56 +00007335PyObject *
7336PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007337{
7338 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007339
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007340 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7341 PyObject *v;
7342 if (((PyUnicodeObject *) left)->length !=
7343 ((PyUnicodeObject *) right)->length) {
7344 if (op == Py_EQ) {
7345 Py_INCREF(Py_False);
7346 return Py_False;
7347 }
7348 if (op == Py_NE) {
7349 Py_INCREF(Py_True);
7350 return Py_True;
7351 }
7352 }
7353 if (left == right)
7354 result = 0;
7355 else
7356 result = unicode_compare((PyUnicodeObject *)left,
7357 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007358
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007359 /* Convert the return value to a Boolean */
7360 switch (op) {
7361 case Py_EQ:
7362 v = TEST_COND(result == 0);
7363 break;
7364 case Py_NE:
7365 v = TEST_COND(result != 0);
7366 break;
7367 case Py_LE:
7368 v = TEST_COND(result <= 0);
7369 break;
7370 case Py_GE:
7371 v = TEST_COND(result >= 0);
7372 break;
7373 case Py_LT:
7374 v = TEST_COND(result == -1);
7375 break;
7376 case Py_GT:
7377 v = TEST_COND(result == 1);
7378 break;
7379 default:
7380 PyErr_BadArgument();
7381 return NULL;
7382 }
7383 Py_INCREF(v);
7384 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007385 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007386
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007387 Py_INCREF(Py_NotImplemented);
7388 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007389}
7390
Alexander Belopolsky40018472011-02-26 01:02:56 +00007391int
7392PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007393{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007394 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007395 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007396
7397 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007398 sub = PyUnicode_FromObject(element);
7399 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 PyErr_Format(PyExc_TypeError,
7401 "'in <string>' requires string as left operand, not %s",
7402 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007403 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007404 }
7405
Thomas Wouters477c8d52006-05-27 19:21:47 +00007406 str = PyUnicode_FromObject(container);
7407 if (!str) {
7408 Py_DECREF(sub);
7409 return -1;
7410 }
7411
7412 result = stringlib_contains_obj(str, sub);
7413
7414 Py_DECREF(str);
7415 Py_DECREF(sub);
7416
Guido van Rossum403d68b2000-03-13 15:55:09 +00007417 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007418}
7419
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420/* Concat to string or Unicode object giving a new Unicode object. */
7421
Alexander Belopolsky40018472011-02-26 01:02:56 +00007422PyObject *
7423PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424{
7425 PyUnicodeObject *u = NULL, *v = NULL, *w;
7426
7427 /* Coerce the two arguments */
7428 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7429 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007430 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7432 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434
7435 /* Shortcuts */
7436 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 Py_DECREF(v);
7438 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 }
7440 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 Py_DECREF(u);
7442 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 }
7444
7445 /* Concat the two Unicode strings */
7446 w = _PyUnicode_New(u->length + v->length);
7447 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 Py_UNICODE_COPY(w->str, u->str, u->length);
7450 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7451
7452 Py_DECREF(u);
7453 Py_DECREF(v);
7454 return (PyObject *)w;
7455
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 Py_XDECREF(u);
7458 Py_XDECREF(v);
7459 return NULL;
7460}
7461
Walter Dörwald1ab83302007-05-18 17:15:44 +00007462void
7463PyUnicode_Append(PyObject **pleft, PyObject *right)
7464{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007465 PyObject *new;
7466 if (*pleft == NULL)
7467 return;
7468 if (right == NULL || !PyUnicode_Check(*pleft)) {
7469 Py_DECREF(*pleft);
7470 *pleft = NULL;
7471 return;
7472 }
7473 new = PyUnicode_Concat(*pleft, right);
7474 Py_DECREF(*pleft);
7475 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007476}
7477
7478void
7479PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7480{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007481 PyUnicode_Append(pleft, right);
7482 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007483}
7484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007485PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007488Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007489string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007490interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491
7492static PyObject *
7493unicode_count(PyUnicodeObject *self, PyObject *args)
7494{
7495 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007496 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007497 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 PyObject *result;
7499
Guido van Rossumb8872e62000-05-09 14:14:27 +00007500 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 return NULL;
7503
7504 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007505 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007508
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007509 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007510 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007511 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007512 substring->str, substring->length,
7513 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007514 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
7516 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007517
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 return result;
7519}
7520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007521PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007522 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007524Encode S using the codec registered for encoding. Default encoding\n\
7525is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007526handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007527a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7528'xmlcharrefreplace' as well as any other name registered with\n\
7529codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530
7531static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007532unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007534 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535 char *encoding = NULL;
7536 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007537
Benjamin Peterson308d6372009-09-18 21:42:35 +00007538 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7539 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007541 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007542}
7543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007544PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007545 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546\n\
7547Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007548If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549
7550static PyObject*
7551unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7552{
7553 Py_UNICODE *e;
7554 Py_UNICODE *p;
7555 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007556 Py_UNICODE *qe;
7557 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 PyUnicodeObject *u;
7559 int tabsize = 8;
7560
7561 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
Thomas Wouters7e474022000-07-16 12:04:32 +00007564 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007565 i = 0; /* chars up to and including most recent \n or \r */
7566 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7567 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 for (p = self->str; p < e; p++)
7569 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 if (tabsize > 0) {
7571 incr = tabsize - (j % tabsize); /* cannot overflow */
7572 if (j > PY_SSIZE_T_MAX - incr)
7573 goto overflow1;
7574 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 if (j > PY_SSIZE_T_MAX - 1)
7579 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 j++;
7581 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 if (i > PY_SSIZE_T_MAX - j)
7583 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007585 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 }
7587 }
7588
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007589 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007591
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 /* Second pass: create output string and fill it */
7593 u = _PyUnicode_New(i + j);
7594 if (!u)
7595 return NULL;
7596
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007597 j = 0; /* same as in first pass */
7598 q = u->str; /* next output char */
7599 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600
7601 for (p = self->str; p < e; p++)
7602 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 if (tabsize > 0) {
7604 i = tabsize - (j % tabsize);
7605 j += i;
7606 while (i--) {
7607 if (q >= qe)
7608 goto overflow2;
7609 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007610 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007612 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 else {
7614 if (q >= qe)
7615 goto overflow2;
7616 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007617 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618 if (*p == '\n' || *p == '\r')
7619 j = 0;
7620 }
7621
7622 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007623
7624 overflow2:
7625 Py_DECREF(u);
7626 overflow1:
7627 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629}
7630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007631PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633\n\
7634Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007635such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636arguments start and end are interpreted as in slice notation.\n\
7637\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007638Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639
7640static PyObject *
7641unicode_find(PyUnicodeObject *self, PyObject *args)
7642{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007643 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007644 Py_ssize_t start;
7645 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007646 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647
Christian Heimes9cd17752007-11-18 19:35:23 +00007648 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
Thomas Wouters477c8d52006-05-27 19:21:47 +00007651 result = stringlib_find_slice(
7652 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7653 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7654 start, end
7655 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656
7657 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007658
Christian Heimes217cfd12007-12-02 14:31:20 +00007659 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660}
7661
7662static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007663unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664{
7665 if (index < 0 || index >= self->length) {
7666 PyErr_SetString(PyExc_IndexError, "string index out of range");
7667 return NULL;
7668 }
7669
7670 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7671}
7672
Guido van Rossumc2504932007-09-18 19:42:40 +00007673/* Believe it or not, this produces the same value for ASCII strings
7674 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007675static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007676unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677{
Guido van Rossumc2504932007-09-18 19:42:40 +00007678 Py_ssize_t len;
7679 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007680 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007681
7682 if (self->hash != -1)
7683 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007684 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007685 p = self->str;
7686 x = *p << 7;
7687 while (--len >= 0)
7688 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007689 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007690 if (x == -1)
7691 x = -2;
7692 self->hash = x;
7693 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694}
7695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007696PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007699Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700
7701static PyObject *
7702unicode_index(PyUnicodeObject *self, PyObject *args)
7703{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007704 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007705 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007706 Py_ssize_t start;
7707 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708
Christian Heimes9cd17752007-11-18 19:35:23 +00007709 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711
Thomas Wouters477c8d52006-05-27 19:21:47 +00007712 result = stringlib_find_slice(
7713 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7714 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7715 start, end
7716 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717
7718 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007719
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 if (result < 0) {
7721 PyErr_SetString(PyExc_ValueError, "substring not found");
7722 return NULL;
7723 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007724
Christian Heimes217cfd12007-12-02 14:31:20 +00007725 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726}
7727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007728PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007731Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007732at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733
7734static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007735unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736{
7737 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7738 register const Py_UNICODE *e;
7739 int cased;
7740
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 /* Shortcut for single character strings */
7742 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007743 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007745 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007746 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007748
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 e = p + PyUnicode_GET_SIZE(self);
7750 cased = 0;
7751 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007753
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7755 return PyBool_FromLong(0);
7756 else if (!cased && Py_UNICODE_ISLOWER(ch))
7757 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007759 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760}
7761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007762PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007765Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007766at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767
7768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007769unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770{
7771 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7772 register const Py_UNICODE *e;
7773 int cased;
7774
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 /* Shortcut for single character strings */
7776 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007779 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007780 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007782
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 e = p + PyUnicode_GET_SIZE(self);
7784 cased = 0;
7785 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007787
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7789 return PyBool_FromLong(0);
7790 else if (!cased && Py_UNICODE_ISUPPER(ch))
7791 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007793 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794}
7795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007796PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007799Return True if S is a titlecased string and there is at least one\n\
7800character in S, i.e. upper- and titlecase characters may only\n\
7801follow uncased characters and lowercase characters only cased ones.\n\
7802Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803
7804static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007805unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806{
7807 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7808 register const Py_UNICODE *e;
7809 int cased, previous_is_cased;
7810
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 /* Shortcut for single character strings */
7812 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7814 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007816 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007817 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007819
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820 e = p + PyUnicode_GET_SIZE(self);
7821 cased = 0;
7822 previous_is_cased = 0;
7823 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007825
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7827 if (previous_is_cased)
7828 return PyBool_FromLong(0);
7829 previous_is_cased = 1;
7830 cased = 1;
7831 }
7832 else if (Py_UNICODE_ISLOWER(ch)) {
7833 if (!previous_is_cased)
7834 return PyBool_FromLong(0);
7835 previous_is_cased = 1;
7836 cased = 1;
7837 }
7838 else
7839 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007841 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842}
7843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007844PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007847Return True if all characters in S are whitespace\n\
7848and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849
7850static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007851unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852{
7853 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7854 register const Py_UNICODE *e;
7855
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 /* Shortcut for single character strings */
7857 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 Py_UNICODE_ISSPACE(*p))
7859 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007861 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007862 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007864
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 e = p + PyUnicode_GET_SIZE(self);
7866 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 if (!Py_UNICODE_ISSPACE(*p))
7868 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007870 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871}
7872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007873PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007875\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007876Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007877and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007878
7879static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007880unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007881{
7882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7883 register const Py_UNICODE *e;
7884
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007885 /* Shortcut for single character strings */
7886 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 Py_UNICODE_ISALPHA(*p))
7888 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007889
7890 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007891 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007893
7894 e = p + PyUnicode_GET_SIZE(self);
7895 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 if (!Py_UNICODE_ISALPHA(*p))
7897 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007899 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007900}
7901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007902PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007904\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007905Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007906and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007907
7908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007909unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007910{
7911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7912 register const Py_UNICODE *e;
7913
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007914 /* Shortcut for single character strings */
7915 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 Py_UNICODE_ISALNUM(*p))
7917 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007918
7919 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007920 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007922
7923 e = p + PyUnicode_GET_SIZE(self);
7924 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 if (!Py_UNICODE_ISALNUM(*p))
7926 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007928 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007929}
7930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007931PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007934Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007935False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936
7937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007938unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939{
7940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7941 register const Py_UNICODE *e;
7942
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943 /* Shortcut for single character strings */
7944 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 Py_UNICODE_ISDECIMAL(*p))
7946 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007948 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007949 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 e = p + PyUnicode_GET_SIZE(self);
7953 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 if (!Py_UNICODE_ISDECIMAL(*p))
7955 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007957 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958}
7959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007960PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007963Return True if all characters in S are digits\n\
7964and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965
7966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007967unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968{
7969 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7970 register const Py_UNICODE *e;
7971
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 /* Shortcut for single character strings */
7973 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 Py_UNICODE_ISDIGIT(*p))
7975 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007977 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007978 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007980
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 e = p + PyUnicode_GET_SIZE(self);
7982 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 if (!Py_UNICODE_ISDIGIT(*p))
7984 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007986 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987}
7988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007989PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007992Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007993False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994
7995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007996unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997{
7998 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7999 register const Py_UNICODE *e;
8000
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 /* Shortcut for single character strings */
8002 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 Py_UNICODE_ISNUMERIC(*p))
8004 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008006 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008007 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008009
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 e = p + PyUnicode_GET_SIZE(self);
8011 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 if (!Py_UNICODE_ISNUMERIC(*p))
8013 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008015 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016}
8017
Martin v. Löwis47383402007-08-15 07:32:56 +00008018int
8019PyUnicode_IsIdentifier(PyObject *self)
8020{
8021 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8022 register const Py_UNICODE *e;
8023
8024 /* Special case for empty strings */
8025 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008027
8028 /* PEP 3131 says that the first character must be in
8029 XID_Start and subsequent characters in XID_Continue,
8030 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008031 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008032 letters, digits, underscore). However, given the current
8033 definition of XID_Start and XID_Continue, it is sufficient
8034 to check just for these, except that _ must be allowed
8035 as starting an identifier. */
8036 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8037 return 0;
8038
8039 e = p + PyUnicode_GET_SIZE(self);
8040 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 if (!_PyUnicode_IsXidContinue(*p))
8042 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008043 }
8044 return 1;
8045}
8046
8047PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008049\n\
8050Return True if S is a valid identifier according\n\
8051to the language definition.");
8052
8053static PyObject*
8054unicode_isidentifier(PyObject *self)
8055{
8056 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8057}
8058
Georg Brandl559e5d72008-06-11 18:37:52 +00008059PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008061\n\
8062Return True if all characters in S are considered\n\
8063printable in repr() or S is empty, False otherwise.");
8064
8065static PyObject*
8066unicode_isprintable(PyObject *self)
8067{
8068 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8069 register const Py_UNICODE *e;
8070
8071 /* Shortcut for single character strings */
8072 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8073 Py_RETURN_TRUE;
8074 }
8075
8076 e = p + PyUnicode_GET_SIZE(self);
8077 for (; p < e; p++) {
8078 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8079 Py_RETURN_FALSE;
8080 }
8081 }
8082 Py_RETURN_TRUE;
8083}
8084
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008085PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008086 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087\n\
8088Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008089iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090
8091static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008092unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008094 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095}
8096
Martin v. Löwis18e16552006-02-15 17:27:45 +00008097static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098unicode_length(PyUnicodeObject *self)
8099{
8100 return self->length;
8101}
8102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008103PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008106Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008107done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108
8109static PyObject *
8110unicode_ljust(PyUnicodeObject *self, PyObject *args)
8111{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008112 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008113 Py_UNICODE fillchar = ' ';
8114
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008115 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116 return NULL;
8117
Tim Peters7a29bd52001-09-12 03:03:31 +00008118 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 Py_INCREF(self);
8120 return (PyObject*) self;
8121 }
8122
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008123 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124}
8125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008126PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008129Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130
8131static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008132unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 return fixup(self, fixlower);
8135}
8136
/* Selectors for the strip family; they double as indexes into stripformat */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above.
   Both the strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8145
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008146/* externally visible for str.strip(unicode) */
8147PyObject *
8148_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8149{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008150 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8151 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8152 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8153 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8154 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008155
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008157
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 i = 0;
8159 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8161 i++;
8162 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008163 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008164
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 j = len;
8166 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 do {
8168 j--;
8169 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8170 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008171 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008172
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 Py_INCREF(self);
8175 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008176 }
8177 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008179}
8180
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181
8182static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008183do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8186 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008187
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 i = 0;
8189 if (striptype != RIGHTSTRIP) {
8190 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8191 i++;
8192 }
8193 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008194
Benjamin Peterson14339b62009-01-31 16:36:08 +00008195 j = len;
8196 if (striptype != LEFTSTRIP) {
8197 do {
8198 j--;
8199 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8200 j++;
8201 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008202
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8204 Py_INCREF(self);
8205 return (PyObject*)self;
8206 }
8207 else
8208 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209}
8210
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008211
8212static PyObject *
8213do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8214{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008215 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008216
Benjamin Peterson14339b62009-01-31 16:36:08 +00008217 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8218 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008219
Benjamin Peterson14339b62009-01-31 16:36:08 +00008220 if (sep != NULL && sep != Py_None) {
8221 if (PyUnicode_Check(sep))
8222 return _PyUnicode_XStrip(self, striptype, sep);
8223 else {
8224 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 "%s arg must be None or str",
8226 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008227 return NULL;
8228 }
8229 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008230
Benjamin Peterson14339b62009-01-31 16:36:08 +00008231 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008232}
8233
8234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008235PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008237\n\
8238Return a copy of the string S with leading and trailing\n\
8239whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008240If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008241
8242static PyObject *
8243unicode_strip(PyUnicodeObject *self, PyObject *args)
8244{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008245 if (PyTuple_GET_SIZE(args) == 0)
8246 return do_strip(self, BOTHSTRIP); /* Common case */
8247 else
8248 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008249}
8250
8251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008252PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008254\n\
8255Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008256If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008257
8258static PyObject *
8259unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8260{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 if (PyTuple_GET_SIZE(args) == 0)
8262 return do_strip(self, LEFTSTRIP); /* Common case */
8263 else
8264 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008265}
8266
8267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008268PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008270\n\
8271Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008272If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008273
8274static PyObject *
8275unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8276{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008277 if (PyTuple_GET_SIZE(args) == 0)
8278 return do_strip(self, RIGHTSTRIP); /* Common case */
8279 else
8280 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008281}
8282
8283
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008285unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286{
8287 PyUnicodeObject *u;
8288 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008289 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008290 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291
Georg Brandl222de0f2009-04-12 12:01:50 +00008292 if (len < 1) {
8293 Py_INCREF(unicode_empty);
8294 return (PyObject *)unicode_empty;
8295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296
Tim Peters7a29bd52001-09-12 03:03:31 +00008297 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 /* no repeat, return original string */
8299 Py_INCREF(str);
8300 return (PyObject*) str;
8301 }
Tim Peters8f422462000-09-09 06:13:41 +00008302
8303 /* ensure # of chars needed doesn't overflow int and # of bytes
8304 * needed doesn't overflow size_t
8305 */
8306 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008307 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008308 PyErr_SetString(PyExc_OverflowError,
8309 "repeated string is too long");
8310 return NULL;
8311 }
8312 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8313 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8314 PyErr_SetString(PyExc_OverflowError,
8315 "repeated string is too long");
8316 return NULL;
8317 }
8318 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 if (!u)
8320 return NULL;
8321
8322 p = u->str;
8323
Georg Brandl222de0f2009-04-12 12:01:50 +00008324 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008325 Py_UNICODE_FILL(p, str->str[0], len);
8326 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008327 Py_ssize_t done = str->length; /* number of characters copied this far */
8328 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008330 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008331 Py_UNICODE_COPY(p+done, p, n);
8332 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 }
8335
8336 return (PyObject*) u;
8337}
8338
Alexander Belopolsky40018472011-02-26 01:02:56 +00008339PyObject *
8340PyUnicode_Replace(PyObject *obj,
8341 PyObject *subobj,
8342 PyObject *replobj,
8343 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344{
8345 PyObject *self;
8346 PyObject *str1;
8347 PyObject *str2;
8348 PyObject *result;
8349
8350 self = PyUnicode_FromObject(obj);
8351 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 str1 = PyUnicode_FromObject(subobj);
8354 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 Py_DECREF(self);
8356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357 }
8358 str2 = PyUnicode_FromObject(replobj);
8359 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 Py_DECREF(self);
8361 Py_DECREF(str1);
8362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
Tim Petersced69f82003-09-16 20:30:58 +00008364 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 (PyUnicodeObject *)str1,
8366 (PyUnicodeObject *)str2,
8367 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368 Py_DECREF(self);
8369 Py_DECREF(str1);
8370 Py_DECREF(str2);
8371 return result;
8372}
8373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008374PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008375 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376\n\
8377Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008378old replaced by new. If the optional argument count is\n\
8379given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380
8381static PyObject*
8382unicode_replace(PyUnicodeObject *self, PyObject *args)
8383{
8384 PyUnicodeObject *str1;
8385 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008386 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 PyObject *result;
8388
Martin v. Löwis18e16552006-02-15 17:27:45 +00008389 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 return NULL;
8391 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8392 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008395 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 Py_DECREF(str1);
8397 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399
8400 result = replace(self, str1, str2, maxcount);
8401
8402 Py_DECREF(str1);
8403 Py_DECREF(str2);
8404 return result;
8405}
8406
Alexander Belopolsky40018472011-02-26 01:02:56 +00008407static PyObject *
8408unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008410 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008411 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008412 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8413 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8414
8415 /* XXX(nnorwitz): rather than over-allocating, it would be
8416 better to choose a different scheme. Perhaps scan the
8417 first N-chars of the string and allocate based on that size.
8418 */
8419 /* Initial allocation is based on the longest-possible unichr
8420 escape.
8421
8422 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8423 unichr, so in this case it's the longest unichr escape. In
8424 narrow (UTF-16) builds this is five chars per source unichr
8425 since there are two unichrs in the surrogate pair, so in narrow
8426 (UTF-16) builds it's not the longest unichr escape.
8427
8428 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8429 so in the narrow (UTF-16) build case it's the longest unichr
8430 escape.
8431 */
8432
Walter Dörwald1ab83302007-05-18 17:15:44 +00008433 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008435#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008437#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008439#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008441 if (repr == NULL)
8442 return NULL;
8443
Walter Dörwald1ab83302007-05-18 17:15:44 +00008444 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008445
8446 /* Add quote */
8447 *p++ = (findchar(s, size, '\'') &&
8448 !findchar(s, size, '"')) ? '"' : '\'';
8449 while (size-- > 0) {
8450 Py_UNICODE ch = *s++;
8451
8452 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008453 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008454 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008455 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008456 continue;
8457 }
8458
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008460 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008461 *p++ = '\\';
8462 *p++ = 't';
8463 }
8464 else if (ch == '\n') {
8465 *p++ = '\\';
8466 *p++ = 'n';
8467 }
8468 else if (ch == '\r') {
8469 *p++ = '\\';
8470 *p++ = 'r';
8471 }
8472
8473 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008474 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008475 *p++ = '\\';
8476 *p++ = 'x';
8477 *p++ = hexdigits[(ch >> 4) & 0x000F];
8478 *p++ = hexdigits[ch & 0x000F];
8479 }
8480
Georg Brandl559e5d72008-06-11 18:37:52 +00008481 /* Copy ASCII characters as-is */
8482 else if (ch < 0x7F) {
8483 *p++ = ch;
8484 }
8485
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008487 else {
8488 Py_UCS4 ucs = ch;
8489
8490#ifndef Py_UNICODE_WIDE
8491 Py_UNICODE ch2 = 0;
8492 /* Get code point from surrogate pair */
8493 if (size > 0) {
8494 ch2 = *s;
8495 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008497 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008499 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008500 size--;
8501 }
8502 }
8503#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008504 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008505 (categories Z* and C* except ASCII space)
8506 */
8507 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8508 /* Map 8-bit characters to '\xhh' */
8509 if (ucs <= 0xff) {
8510 *p++ = '\\';
8511 *p++ = 'x';
8512 *p++ = hexdigits[(ch >> 4) & 0x000F];
8513 *p++ = hexdigits[ch & 0x000F];
8514 }
8515 /* Map 21-bit characters to '\U00xxxxxx' */
8516 else if (ucs >= 0x10000) {
8517 *p++ = '\\';
8518 *p++ = 'U';
8519 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8520 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8521 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8522 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8523 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8524 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8525 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8526 *p++ = hexdigits[ucs & 0x0000000F];
8527 }
8528 /* Map 16-bit characters to '\uxxxx' */
8529 else {
8530 *p++ = '\\';
8531 *p++ = 'u';
8532 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8533 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8534 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8535 *p++ = hexdigits[ucs & 0x000F];
8536 }
8537 }
8538 /* Copy characters as-is */
8539 else {
8540 *p++ = ch;
8541#ifndef Py_UNICODE_WIDE
8542 if (ucs >= 0x10000)
8543 *p++ = ch2;
8544#endif
8545 }
8546 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008547 }
8548 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008549 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008550
8551 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008552 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008553 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554}
8555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008556PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558\n\
8559Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008560such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561arguments start and end are interpreted as in slice notation.\n\
8562\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008563Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564
8565static PyObject *
8566unicode_rfind(PyUnicodeObject *self, PyObject *args)
8567{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008568 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008569 Py_ssize_t start;
8570 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008571 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572
Christian Heimes9cd17752007-11-18 19:35:23 +00008573 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575
Thomas Wouters477c8d52006-05-27 19:21:47 +00008576 result = stringlib_rfind_slice(
8577 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8578 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8579 start, end
8580 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581
8582 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008583
Christian Heimes217cfd12007-12-02 14:31:20 +00008584 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585}
8586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008587PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008590Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
8592static PyObject *
8593unicode_rindex(PyUnicodeObject *self, PyObject *args)
8594{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008595 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008596 Py_ssize_t start;
8597 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008598 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599
Christian Heimes9cd17752007-11-18 19:35:23 +00008600 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602
Thomas Wouters477c8d52006-05-27 19:21:47 +00008603 result = stringlib_rfind_slice(
8604 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8605 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8606 start, end
8607 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608
8609 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008610
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 if (result < 0) {
8612 PyErr_SetString(PyExc_ValueError, "substring not found");
8613 return NULL;
8614 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008615 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616}
8617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008618PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008621Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008622done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623
8624static PyObject *
8625unicode_rjust(PyUnicodeObject *self, PyObject *args)
8626{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008627 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008628 Py_UNICODE fillchar = ' ';
8629
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008630 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 return NULL;
8632
Tim Peters7a29bd52001-09-12 03:03:31 +00008633 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 Py_INCREF(self);
8635 return (PyObject*) self;
8636 }
8637
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008638 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639}
8640
Alexander Belopolsky40018472011-02-26 01:02:56 +00008641PyObject *
8642PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643{
8644 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008645
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 s = PyUnicode_FromObject(s);
8647 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008648 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 if (sep != NULL) {
8650 sep = PyUnicode_FromObject(sep);
8651 if (sep == NULL) {
8652 Py_DECREF(s);
8653 return NULL;
8654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 }
8656
8657 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8658
8659 Py_DECREF(s);
8660 Py_XDECREF(sep);
8661 return result;
8662}
8663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008664PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666\n\
8667Return a list of the words in S, using sep as the\n\
8668delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008669splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008670whitespace string is a separator and empty strings are\n\
8671removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672
8673static PyObject*
8674unicode_split(PyUnicodeObject *self, PyObject *args)
8675{
8676 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008677 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678
Martin v. Löwis18e16552006-02-15 17:27:45 +00008679 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 return NULL;
8681
8682 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688}
8689
Thomas Wouters477c8d52006-05-27 19:21:47 +00008690PyObject *
8691PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8692{
8693 PyObject* str_obj;
8694 PyObject* sep_obj;
8695 PyObject* out;
8696
8697 str_obj = PyUnicode_FromObject(str_in);
8698 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008700 sep_obj = PyUnicode_FromObject(sep_in);
8701 if (!sep_obj) {
8702 Py_DECREF(str_obj);
8703 return NULL;
8704 }
8705
8706 out = stringlib_partition(
8707 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8708 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8709 );
8710
8711 Py_DECREF(sep_obj);
8712 Py_DECREF(str_obj);
8713
8714 return out;
8715}
8716
8717
8718PyObject *
8719PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8720{
8721 PyObject* str_obj;
8722 PyObject* sep_obj;
8723 PyObject* out;
8724
8725 str_obj = PyUnicode_FromObject(str_in);
8726 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008728 sep_obj = PyUnicode_FromObject(sep_in);
8729 if (!sep_obj) {
8730 Py_DECREF(str_obj);
8731 return NULL;
8732 }
8733
8734 out = stringlib_rpartition(
8735 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8736 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8737 );
8738
8739 Py_DECREF(sep_obj);
8740 Py_DECREF(str_obj);
8741
8742 return out;
8743}
8744
8745PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008747\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008748Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008749the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008750found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008751
8752static PyObject*
8753unicode_partition(PyUnicodeObject *self, PyObject *separator)
8754{
8755 return PyUnicode_Partition((PyObject *)self, separator);
8756}
8757
8758PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008759 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008760\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008761Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008762the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008763separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008764
8765static PyObject*
8766unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8767{
8768 return PyUnicode_RPartition((PyObject *)self, separator);
8769}
8770
Alexander Belopolsky40018472011-02-26 01:02:56 +00008771PyObject *
8772PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008773{
8774 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008775
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008776 s = PyUnicode_FromObject(s);
8777 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008778 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 if (sep != NULL) {
8780 sep = PyUnicode_FromObject(sep);
8781 if (sep == NULL) {
8782 Py_DECREF(s);
8783 return NULL;
8784 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008785 }
8786
8787 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8788
8789 Py_DECREF(s);
8790 Py_XDECREF(sep);
8791 return result;
8792}
8793
8794PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008796\n\
8797Return a list of the words in S, using sep as the\n\
8798delimiter string, starting at the end of the string and\n\
8799working to the front. If maxsplit is given, at most maxsplit\n\
8800splits are done. If sep is not specified, any whitespace string\n\
8801is a separator.");
8802
8803static PyObject*
8804unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8805{
8806 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008807 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008808
Martin v. Löwis18e16552006-02-15 17:27:45 +00008809 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008810 return NULL;
8811
8812 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008814 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008816 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008818}
8819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008820PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822\n\
8823Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008824Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008825is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826
8827static PyObject*
8828unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8829{
Guido van Rossum86662912000-04-11 15:38:46 +00008830 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831
Guido van Rossum86662912000-04-11 15:38:46 +00008832 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833 return NULL;
8834
Guido van Rossum86662912000-04-11 15:38:46 +00008835 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836}
8837
8838static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008839PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840{
Walter Dörwald346737f2007-05-31 10:44:43 +00008841 if (PyUnicode_CheckExact(self)) {
8842 Py_INCREF(self);
8843 return self;
8844 } else
8845 /* Subtype -- return genuine unicode string with the same value. */
8846 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8847 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848}
8849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008850PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852\n\
8853Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008854and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855
8856static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008857unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 return fixup(self, fixswapcase);
8860}
8861
Georg Brandlceee0772007-11-27 23:48:05 +00008862PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008864\n\
8865Return a translation table usable for str.translate().\n\
8866If there is only one argument, it must be a dictionary mapping Unicode\n\
8867ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008868Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008869If there are two arguments, they must be strings of equal length, and\n\
8870in the resulting dictionary, each character in x will be mapped to the\n\
8871character at the same position in y. If there is a third argument, it\n\
8872must be a string, whose characters will be mapped to None in the result.");
8873
8874static PyObject*
8875unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8876{
8877 PyObject *x, *y = NULL, *z = NULL;
8878 PyObject *new = NULL, *key, *value;
8879 Py_ssize_t i = 0;
8880 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008881
Georg Brandlceee0772007-11-27 23:48:05 +00008882 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8883 return NULL;
8884 new = PyDict_New();
8885 if (!new)
8886 return NULL;
8887 if (y != NULL) {
8888 /* x must be a string too, of equal length */
8889 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8890 if (!PyUnicode_Check(x)) {
8891 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8892 "be a string if there is a second argument");
8893 goto err;
8894 }
8895 if (PyUnicode_GET_SIZE(x) != ylen) {
8896 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8897 "arguments must have equal length");
8898 goto err;
8899 }
8900 /* create entries for translating chars in x to those in y */
8901 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008902 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8903 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008904 if (!key || !value)
8905 goto err;
8906 res = PyDict_SetItem(new, key, value);
8907 Py_DECREF(key);
8908 Py_DECREF(value);
8909 if (res < 0)
8910 goto err;
8911 }
8912 /* create entries for deleting chars in z */
8913 if (z != NULL) {
8914 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008915 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008916 if (!key)
8917 goto err;
8918 res = PyDict_SetItem(new, key, Py_None);
8919 Py_DECREF(key);
8920 if (res < 0)
8921 goto err;
8922 }
8923 }
8924 } else {
8925 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008926 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008927 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8928 "to maketrans it must be a dict");
8929 goto err;
8930 }
8931 /* copy entries into the new dict, converting string keys to int keys */
8932 while (PyDict_Next(x, &i, &key, &value)) {
8933 if (PyUnicode_Check(key)) {
8934 /* convert string keys to integer keys */
8935 PyObject *newkey;
8936 if (PyUnicode_GET_SIZE(key) != 1) {
8937 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8938 "table must be of length 1");
8939 goto err;
8940 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008941 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008942 if (!newkey)
8943 goto err;
8944 res = PyDict_SetItem(new, newkey, value);
8945 Py_DECREF(newkey);
8946 if (res < 0)
8947 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008948 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008949 /* just keep integer keys */
8950 if (PyDict_SetItem(new, key, value) < 0)
8951 goto err;
8952 } else {
8953 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8954 "be strings or integers");
8955 goto err;
8956 }
8957 }
8958 }
8959 return new;
8960 err:
8961 Py_DECREF(new);
8962 return NULL;
8963}
8964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008965PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967\n\
8968Return a copy of the string S, where all characters have been mapped\n\
8969through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008970Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008971Unmapped characters are left untouched. Characters mapped to None\n\
8972are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973
8974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008975unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976{
Georg Brandlceee0772007-11-27 23:48:05 +00008977 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978}
8979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008980PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008983Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984
8985static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008986unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 return fixup(self, fixupper);
8989}
8990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008991PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008994Pad a numeric string S with zeros on the left, to fill a field\n\
8995of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996
8997static PyObject *
8998unicode_zfill(PyUnicodeObject *self, PyObject *args)
8999{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009000 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001 PyUnicodeObject *u;
9002
Martin v. Löwis18e16552006-02-15 17:27:45 +00009003 Py_ssize_t width;
9004 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 return NULL;
9006
9007 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009008 if (PyUnicode_CheckExact(self)) {
9009 Py_INCREF(self);
9010 return (PyObject*) self;
9011 }
9012 else
9013 return PyUnicode_FromUnicode(
9014 PyUnicode_AS_UNICODE(self),
9015 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 }
9018
9019 fill = width - self->length;
9020
9021 u = pad(self, fill, 0, '0');
9022
Walter Dörwald068325e2002-04-15 13:36:47 +00009023 if (u == NULL)
9024 return NULL;
9025
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026 if (u->str[fill] == '+' || u->str[fill] == '-') {
9027 /* move sign to beginning of string */
9028 u->str[0] = u->str[fill];
9029 u->str[fill] = '0';
9030 }
9031
9032 return (PyObject*) u;
9033}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034
9035#if 0
9036static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009037unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038{
Christian Heimes2202f872008-02-06 14:31:34 +00009039 return PyLong_FromLong(numfree);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040}
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009041
9042static PyObject *
9043unicode__decimal2ascii(PyObject *self)
9044{
9045 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
9046 PyUnicode_GET_SIZE(self));
9047}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048#endif
9049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009050PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009051 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009053Return True if S starts with the specified prefix, False otherwise.\n\
9054With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009055With optional end, stop comparing S at that position.\n\
9056prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057
9058static PyObject *
9059unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009060 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009062 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009064 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009065 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009066 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009068 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009069 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9070 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009071 if (PyTuple_Check(subobj)) {
9072 Py_ssize_t i;
9073 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9074 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009076 if (substring == NULL)
9077 return NULL;
9078 result = tailmatch(self, substring, start, end, -1);
9079 Py_DECREF(substring);
9080 if (result) {
9081 Py_RETURN_TRUE;
9082 }
9083 }
9084 /* nothing matched */
9085 Py_RETURN_FALSE;
9086 }
9087 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009090 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009092 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093}
9094
9095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009096PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009099Return True if S ends with the specified suffix, False otherwise.\n\
9100With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009101With optional end, stop comparing S at that position.\n\
9102suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103
9104static PyObject *
9105unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009108 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009110 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009111 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009112 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009114 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9116 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009117 if (PyTuple_Check(subobj)) {
9118 Py_ssize_t i;
9119 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9120 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009122 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009124 result = tailmatch(self, substring, start, end, +1);
9125 Py_DECREF(substring);
9126 if (result) {
9127 Py_RETURN_TRUE;
9128 }
9129 }
9130 Py_RETURN_FALSE;
9131 }
9132 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009136 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009138 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139}
9140
Eric Smith8c663262007-08-25 02:26:07 +00009141#include "stringlib/string_format.h"
9142
9143PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009145\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009146Return a formatted version of S, using substitutions from args and kwargs.\n\
9147The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009148
Eric Smith27bbca62010-11-04 17:06:58 +00009149PyDoc_STRVAR(format_map__doc__,
9150 "S.format_map(mapping) -> str\n\
9151\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009152Return a formatted version of S, using substitutions from mapping.\n\
9153The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009154
Eric Smith4a7d76d2008-05-30 18:10:19 +00009155static PyObject *
9156unicode__format__(PyObject* self, PyObject* args)
9157{
9158 PyObject *format_spec;
9159
9160 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9161 return NULL;
9162
9163 return _PyUnicode_FormatAdvanced(self,
9164 PyUnicode_AS_UNICODE(format_spec),
9165 PyUnicode_GET_SIZE(format_spec));
9166}
9167
Eric Smith8c663262007-08-25 02:26:07 +00009168PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009170\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009171Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009172
9173static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009174unicode__sizeof__(PyUnicodeObject *v)
9175{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009176 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9177 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009178}
9179
9180PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009182
9183static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009184unicode_getnewargs(PyUnicodeObject *v)
9185{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009186 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009187}
9188
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189static PyMethodDef unicode_methods[] = {
9190
9191 /* Order is according to common usage: often used methods should
9192 appear first, since lookup is done sequentially. */
9193
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009194 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009195 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9196 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009197 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009198 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9199 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9200 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9201 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9202 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9203 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9204 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009205 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009206 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9207 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9208 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009209 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009210 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9211 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9212 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009213 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009214 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009215 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009216 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009217 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9218 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9219 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9220 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9221 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9222 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9223 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9224 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9225 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9226 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9227 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9228 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9229 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9230 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009231 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009232 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009233 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009234 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009235 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009236 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009237 {"maketrans", (PyCFunction) unicode_maketrans,
9238 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009239 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009240#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009241 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242#endif
9243
9244#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009245 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009246 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009247 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248#endif
9249
Benjamin Peterson14339b62009-01-31 16:36:08 +00009250 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251 {NULL, NULL}
9252};
9253
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009254static PyObject *
9255unicode_mod(PyObject *v, PyObject *w)
9256{
Benjamin Peterson29060642009-01-31 22:14:21 +00009257 if (!PyUnicode_Check(v)) {
9258 Py_INCREF(Py_NotImplemented);
9259 return Py_NotImplemented;
9260 }
9261 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009262}
9263
9264static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009265 0, /*nb_add*/
9266 0, /*nb_subtract*/
9267 0, /*nb_multiply*/
9268 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009269};
9270
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009272 (lenfunc) unicode_length, /* sq_length */
9273 PyUnicode_Concat, /* sq_concat */
9274 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9275 (ssizeargfunc) unicode_getitem, /* sq_item */
9276 0, /* sq_slice */
9277 0, /* sq_ass_item */
9278 0, /* sq_ass_slice */
9279 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280};
9281
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009282static PyObject*
9283unicode_subscript(PyUnicodeObject* self, PyObject* item)
9284{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009285 if (PyIndex_Check(item)) {
9286 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009287 if (i == -1 && PyErr_Occurred())
9288 return NULL;
9289 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009290 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009291 return unicode_getitem(self, i);
9292 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009293 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009294 Py_UNICODE* source_buf;
9295 Py_UNICODE* result_buf;
9296 PyObject* result;
9297
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009298 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009299 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009300 return NULL;
9301 }
9302
9303 if (slicelength <= 0) {
9304 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009305 } else if (start == 0 && step == 1 && slicelength == self->length &&
9306 PyUnicode_CheckExact(self)) {
9307 Py_INCREF(self);
9308 return (PyObject *)self;
9309 } else if (step == 1) {
9310 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009311 } else {
9312 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009313 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9314 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009315
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 if (result_buf == NULL)
9317 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009318
9319 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9320 result_buf[i] = source_buf[cur];
9321 }
Tim Petersced69f82003-09-16 20:30:58 +00009322
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009323 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009324 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009325 return result;
9326 }
9327 } else {
9328 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9329 return NULL;
9330 }
9331}
9332
9333static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009334 (lenfunc)unicode_length, /* mp_length */
9335 (binaryfunc)unicode_subscript, /* mp_subscript */
9336 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009337};
9338
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340/* Helpers for PyUnicode_Format() */
9341
9342static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009343getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009345 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 (*p_argidx)++;
9348 if (arglen < 0)
9349 return args;
9350 else
9351 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 }
9353 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355 return NULL;
9356}
9357
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009358/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009360static PyObject *
9361formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009363 char *p;
9364 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009366
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367 x = PyFloat_AsDouble(v);
9368 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009369 return NULL;
9370
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009373
Eric Smith0923d1d2009-04-16 20:16:10 +00009374 p = PyOS_double_to_string(x, type, prec,
9375 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009376 if (p == NULL)
9377 return NULL;
9378 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009379 PyMem_Free(p);
9380 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381}
9382
Tim Peters38fd5b62000-09-21 05:43:11 +00009383static PyObject*
9384formatlong(PyObject *val, int flags, int prec, int type)
9385{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009386 char *buf;
9387 int len;
9388 PyObject *str; /* temporary string object. */
9389 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009390
Benjamin Peterson14339b62009-01-31 16:36:08 +00009391 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9392 if (!str)
9393 return NULL;
9394 result = PyUnicode_FromStringAndSize(buf, len);
9395 Py_DECREF(str);
9396 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009397}
9398
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399static int
9400formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009401 size_t buflen,
9402 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009404 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009405 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 if (PyUnicode_GET_SIZE(v) == 1) {
9407 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9408 buf[1] = '\0';
9409 return 1;
9410 }
9411#ifndef Py_UNICODE_WIDE
9412 if (PyUnicode_GET_SIZE(v) == 2) {
9413 /* Decode a valid surrogate pair */
9414 int c0 = PyUnicode_AS_UNICODE(v)[0];
9415 int c1 = PyUnicode_AS_UNICODE(v)[1];
9416 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9417 0xDC00 <= c1 && c1 <= 0xDFFF) {
9418 buf[0] = c0;
9419 buf[1] = c1;
9420 buf[2] = '\0';
9421 return 2;
9422 }
9423 }
9424#endif
9425 goto onError;
9426 }
9427 else {
9428 /* Integer input truncated to a character */
9429 long x;
9430 x = PyLong_AsLong(v);
9431 if (x == -1 && PyErr_Occurred())
9432 goto onError;
9433
9434 if (x < 0 || x > 0x10ffff) {
9435 PyErr_SetString(PyExc_OverflowError,
9436 "%c arg not in range(0x110000)");
9437 return -1;
9438 }
9439
9440#ifndef Py_UNICODE_WIDE
9441 if (x > 0xffff) {
9442 x -= 0x10000;
9443 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9444 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9445 return 2;
9446 }
9447#endif
9448 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009449 buf[1] = '\0';
9450 return 1;
9451 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009452
Benjamin Peterson29060642009-01-31 22:14:21 +00009453 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009454 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009455 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009456 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457}
9458
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009459/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009460 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009461*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009462#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009463
Alexander Belopolsky40018472011-02-26 01:02:56 +00009464PyObject *
9465PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466{
9467 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009468 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 int args_owned = 0;
9470 PyUnicodeObject *result = NULL;
9471 PyObject *dict = NULL;
9472 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009473
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 PyErr_BadInternalCall();
9476 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 }
9478 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009479 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 fmt = PyUnicode_AS_UNICODE(uformat);
9482 fmtcnt = PyUnicode_GET_SIZE(uformat);
9483
9484 reslen = rescnt = fmtcnt + 100;
9485 result = _PyUnicode_New(reslen);
9486 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 res = PyUnicode_AS_UNICODE(result);
9489
9490 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009491 arglen = PyTuple_Size(args);
9492 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493 }
9494 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 arglen = -1;
9496 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009498 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009499 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501
9502 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 if (*fmt != '%') {
9504 if (--rescnt < 0) {
9505 rescnt = fmtcnt + 100;
9506 reslen += rescnt;
9507 if (_PyUnicode_Resize(&result, reslen) < 0)
9508 goto onError;
9509 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9510 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009511 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009513 }
9514 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009515 /* Got a format specifier */
9516 int flags = 0;
9517 Py_ssize_t width = -1;
9518 int prec = -1;
9519 Py_UNICODE c = '\0';
9520 Py_UNICODE fill;
9521 int isnumok;
9522 PyObject *v = NULL;
9523 PyObject *temp = NULL;
9524 Py_UNICODE *pbuf;
9525 Py_UNICODE sign;
9526 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009527 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 fmt++;
9530 if (*fmt == '(') {
9531 Py_UNICODE *keystart;
9532 Py_ssize_t keylen;
9533 PyObject *key;
9534 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009535
Benjamin Peterson29060642009-01-31 22:14:21 +00009536 if (dict == NULL) {
9537 PyErr_SetString(PyExc_TypeError,
9538 "format requires a mapping");
9539 goto onError;
9540 }
9541 ++fmt;
9542 --fmtcnt;
9543 keystart = fmt;
9544 /* Skip over balanced parentheses */
9545 while (pcount > 0 && --fmtcnt >= 0) {
9546 if (*fmt == ')')
9547 --pcount;
9548 else if (*fmt == '(')
9549 ++pcount;
9550 fmt++;
9551 }
9552 keylen = fmt - keystart - 1;
9553 if (fmtcnt < 0 || pcount > 0) {
9554 PyErr_SetString(PyExc_ValueError,
9555 "incomplete format key");
9556 goto onError;
9557 }
9558#if 0
9559 /* keys are converted to strings using UTF-8 and
9560 then looked up since Python uses strings to hold
9561 variables names etc. in its namespaces and we
9562 wouldn't want to break common idioms. */
9563 key = PyUnicode_EncodeUTF8(keystart,
9564 keylen,
9565 NULL);
9566#else
9567 key = PyUnicode_FromUnicode(keystart, keylen);
9568#endif
9569 if (key == NULL)
9570 goto onError;
9571 if (args_owned) {
9572 Py_DECREF(args);
9573 args_owned = 0;
9574 }
9575 args = PyObject_GetItem(dict, key);
9576 Py_DECREF(key);
9577 if (args == NULL) {
9578 goto onError;
9579 }
9580 args_owned = 1;
9581 arglen = -1;
9582 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009583 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009584 while (--fmtcnt >= 0) {
9585 switch (c = *fmt++) {
9586 case '-': flags |= F_LJUST; continue;
9587 case '+': flags |= F_SIGN; continue;
9588 case ' ': flags |= F_BLANK; continue;
9589 case '#': flags |= F_ALT; continue;
9590 case '0': flags |= F_ZERO; continue;
9591 }
9592 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 if (c == '*') {
9595 v = getnextarg(args, arglen, &argidx);
9596 if (v == NULL)
9597 goto onError;
9598 if (!PyLong_Check(v)) {
9599 PyErr_SetString(PyExc_TypeError,
9600 "* wants int");
9601 goto onError;
9602 }
9603 width = PyLong_AsLong(v);
9604 if (width == -1 && PyErr_Occurred())
9605 goto onError;
9606 if (width < 0) {
9607 flags |= F_LJUST;
9608 width = -width;
9609 }
9610 if (--fmtcnt >= 0)
9611 c = *fmt++;
9612 }
9613 else if (c >= '0' && c <= '9') {
9614 width = c - '0';
9615 while (--fmtcnt >= 0) {
9616 c = *fmt++;
9617 if (c < '0' || c > '9')
9618 break;
9619 if ((width*10) / 10 != width) {
9620 PyErr_SetString(PyExc_ValueError,
9621 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009622 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009623 }
9624 width = width*10 + (c - '0');
9625 }
9626 }
9627 if (c == '.') {
9628 prec = 0;
9629 if (--fmtcnt >= 0)
9630 c = *fmt++;
9631 if (c == '*') {
9632 v = getnextarg(args, arglen, &argidx);
9633 if (v == NULL)
9634 goto onError;
9635 if (!PyLong_Check(v)) {
9636 PyErr_SetString(PyExc_TypeError,
9637 "* wants int");
9638 goto onError;
9639 }
9640 prec = PyLong_AsLong(v);
9641 if (prec == -1 && PyErr_Occurred())
9642 goto onError;
9643 if (prec < 0)
9644 prec = 0;
9645 if (--fmtcnt >= 0)
9646 c = *fmt++;
9647 }
9648 else if (c >= '0' && c <= '9') {
9649 prec = c - '0';
9650 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009651 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 if (c < '0' || c > '9')
9653 break;
9654 if ((prec*10) / 10 != prec) {
9655 PyErr_SetString(PyExc_ValueError,
9656 "prec too big");
9657 goto onError;
9658 }
9659 prec = prec*10 + (c - '0');
9660 }
9661 }
9662 } /* prec */
9663 if (fmtcnt >= 0) {
9664 if (c == 'h' || c == 'l' || c == 'L') {
9665 if (--fmtcnt >= 0)
9666 c = *fmt++;
9667 }
9668 }
9669 if (fmtcnt < 0) {
9670 PyErr_SetString(PyExc_ValueError,
9671 "incomplete format");
9672 goto onError;
9673 }
9674 if (c != '%') {
9675 v = getnextarg(args, arglen, &argidx);
9676 if (v == NULL)
9677 goto onError;
9678 }
9679 sign = 0;
9680 fill = ' ';
9681 switch (c) {
9682
9683 case '%':
9684 pbuf = formatbuf;
9685 /* presume that buffer length is at least 1 */
9686 pbuf[0] = '%';
9687 len = 1;
9688 break;
9689
9690 case 's':
9691 case 'r':
9692 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009693 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 temp = v;
9695 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009696 }
9697 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009698 if (c == 's')
9699 temp = PyObject_Str(v);
9700 else if (c == 'r')
9701 temp = PyObject_Repr(v);
9702 else
9703 temp = PyObject_ASCII(v);
9704 if (temp == NULL)
9705 goto onError;
9706 if (PyUnicode_Check(temp))
9707 /* nothing to do */;
9708 else {
9709 Py_DECREF(temp);
9710 PyErr_SetString(PyExc_TypeError,
9711 "%s argument has non-string str()");
9712 goto onError;
9713 }
9714 }
9715 pbuf = PyUnicode_AS_UNICODE(temp);
9716 len = PyUnicode_GET_SIZE(temp);
9717 if (prec >= 0 && len > prec)
9718 len = prec;
9719 break;
9720
9721 case 'i':
9722 case 'd':
9723 case 'u':
9724 case 'o':
9725 case 'x':
9726 case 'X':
9727 if (c == 'i')
9728 c = 'd';
9729 isnumok = 0;
9730 if (PyNumber_Check(v)) {
9731 PyObject *iobj=NULL;
9732
9733 if (PyLong_Check(v)) {
9734 iobj = v;
9735 Py_INCREF(iobj);
9736 }
9737 else {
9738 iobj = PyNumber_Long(v);
9739 }
9740 if (iobj!=NULL) {
9741 if (PyLong_Check(iobj)) {
9742 isnumok = 1;
9743 temp = formatlong(iobj, flags, prec, c);
9744 Py_DECREF(iobj);
9745 if (!temp)
9746 goto onError;
9747 pbuf = PyUnicode_AS_UNICODE(temp);
9748 len = PyUnicode_GET_SIZE(temp);
9749 sign = 1;
9750 }
9751 else {
9752 Py_DECREF(iobj);
9753 }
9754 }
9755 }
9756 if (!isnumok) {
9757 PyErr_Format(PyExc_TypeError,
9758 "%%%c format: a number is required, "
9759 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9760 goto onError;
9761 }
9762 if (flags & F_ZERO)
9763 fill = '0';
9764 break;
9765
9766 case 'e':
9767 case 'E':
9768 case 'f':
9769 case 'F':
9770 case 'g':
9771 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009772 temp = formatfloat(v, flags, prec, c);
9773 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009774 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009775 pbuf = PyUnicode_AS_UNICODE(temp);
9776 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009777 sign = 1;
9778 if (flags & F_ZERO)
9779 fill = '0';
9780 break;
9781
9782 case 'c':
9783 pbuf = formatbuf;
9784 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9785 if (len < 0)
9786 goto onError;
9787 break;
9788
9789 default:
9790 PyErr_Format(PyExc_ValueError,
9791 "unsupported format character '%c' (0x%x) "
9792 "at index %zd",
9793 (31<=c && c<=126) ? (char)c : '?',
9794 (int)c,
9795 (Py_ssize_t)(fmt - 1 -
9796 PyUnicode_AS_UNICODE(uformat)));
9797 goto onError;
9798 }
9799 if (sign) {
9800 if (*pbuf == '-' || *pbuf == '+') {
9801 sign = *pbuf++;
9802 len--;
9803 }
9804 else if (flags & F_SIGN)
9805 sign = '+';
9806 else if (flags & F_BLANK)
9807 sign = ' ';
9808 else
9809 sign = 0;
9810 }
9811 if (width < len)
9812 width = len;
9813 if (rescnt - (sign != 0) < width) {
9814 reslen -= rescnt;
9815 rescnt = width + fmtcnt + 100;
9816 reslen += rescnt;
9817 if (reslen < 0) {
9818 Py_XDECREF(temp);
9819 PyErr_NoMemory();
9820 goto onError;
9821 }
9822 if (_PyUnicode_Resize(&result, reslen) < 0) {
9823 Py_XDECREF(temp);
9824 goto onError;
9825 }
9826 res = PyUnicode_AS_UNICODE(result)
9827 + reslen - rescnt;
9828 }
9829 if (sign) {
9830 if (fill != ' ')
9831 *res++ = sign;
9832 rescnt--;
9833 if (width > len)
9834 width--;
9835 }
9836 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9837 assert(pbuf[0] == '0');
9838 assert(pbuf[1] == c);
9839 if (fill != ' ') {
9840 *res++ = *pbuf++;
9841 *res++ = *pbuf++;
9842 }
9843 rescnt -= 2;
9844 width -= 2;
9845 if (width < 0)
9846 width = 0;
9847 len -= 2;
9848 }
9849 if (width > len && !(flags & F_LJUST)) {
9850 do {
9851 --rescnt;
9852 *res++ = fill;
9853 } while (--width > len);
9854 }
9855 if (fill == ' ') {
9856 if (sign)
9857 *res++ = sign;
9858 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9859 assert(pbuf[0] == '0');
9860 assert(pbuf[1] == c);
9861 *res++ = *pbuf++;
9862 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009863 }
9864 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009865 Py_UNICODE_COPY(res, pbuf, len);
9866 res += len;
9867 rescnt -= len;
9868 while (--width >= len) {
9869 --rescnt;
9870 *res++ = ' ';
9871 }
9872 if (dict && (argidx < arglen) && c != '%') {
9873 PyErr_SetString(PyExc_TypeError,
9874 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009875 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009876 goto onError;
9877 }
9878 Py_XDECREF(temp);
9879 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880 } /* until end */
9881 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009882 PyErr_SetString(PyExc_TypeError,
9883 "not all arguments converted during string formatting");
9884 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885 }
9886
Thomas Woutersa96affe2006-03-12 00:29:36 +00009887 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009890 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891 }
9892 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893 return (PyObject *)result;
9894
Benjamin Peterson29060642009-01-31 22:14:21 +00009895 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896 Py_XDECREF(result);
9897 Py_DECREF(uformat);
9898 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009899 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900 }
9901 return NULL;
9902}
9903
Jeremy Hylton938ace62002-07-17 16:30:39 +00009904static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009905unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9906
Tim Peters6d6c1a32001-08-02 04:15:00 +00009907static PyObject *
9908unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9909{
Benjamin Peterson29060642009-01-31 22:14:21 +00009910 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009911 static char *kwlist[] = {"object", "encoding", "errors", 0};
9912 char *encoding = NULL;
9913 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009914
Benjamin Peterson14339b62009-01-31 16:36:08 +00009915 if (type != &PyUnicode_Type)
9916 return unicode_subtype_new(type, args, kwds);
9917 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009918 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009919 return NULL;
9920 if (x == NULL)
9921 return (PyObject *)_PyUnicode_New(0);
9922 if (encoding == NULL && errors == NULL)
9923 return PyObject_Str(x);
9924 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009925 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009926}
9927
Guido van Rossume023fe02001-08-30 03:12:59 +00009928static PyObject *
9929unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009931 PyUnicodeObject *tmp, *pnew;
9932 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009933
Benjamin Peterson14339b62009-01-31 16:36:08 +00009934 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9935 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9936 if (tmp == NULL)
9937 return NULL;
9938 assert(PyUnicode_Check(tmp));
9939 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9940 if (pnew == NULL) {
9941 Py_DECREF(tmp);
9942 return NULL;
9943 }
9944 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9945 if (pnew->str == NULL) {
9946 _Py_ForgetReference((PyObject *)pnew);
9947 PyObject_Del(pnew);
9948 Py_DECREF(tmp);
9949 return PyErr_NoMemory();
9950 }
9951 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9952 pnew->length = n;
9953 pnew->hash = tmp->hash;
9954 Py_DECREF(tmp);
9955 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009956}
9957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009958PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009959 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009960\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009961Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009962encoding defaults to the current default string encoding.\n\
9963errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009964
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009965static PyObject *unicode_iter(PyObject *seq);
9966
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009968 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009969 "str", /* tp_name */
9970 sizeof(PyUnicodeObject), /* tp_size */
9971 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009973 (destructor)unicode_dealloc, /* tp_dealloc */
9974 0, /* tp_print */
9975 0, /* tp_getattr */
9976 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009977 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009978 unicode_repr, /* tp_repr */
9979 &unicode_as_number, /* tp_as_number */
9980 &unicode_as_sequence, /* tp_as_sequence */
9981 &unicode_as_mapping, /* tp_as_mapping */
9982 (hashfunc) unicode_hash, /* tp_hash*/
9983 0, /* tp_call*/
9984 (reprfunc) unicode_str, /* tp_str */
9985 PyObject_GenericGetAttr, /* tp_getattro */
9986 0, /* tp_setattro */
9987 0, /* tp_as_buffer */
9988 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009990 unicode_doc, /* tp_doc */
9991 0, /* tp_traverse */
9992 0, /* tp_clear */
9993 PyUnicode_RichCompare, /* tp_richcompare */
9994 0, /* tp_weaklistoffset */
9995 unicode_iter, /* tp_iter */
9996 0, /* tp_iternext */
9997 unicode_methods, /* tp_methods */
9998 0, /* tp_members */
9999 0, /* tp_getset */
10000 &PyBaseObject_Type, /* tp_base */
10001 0, /* tp_dict */
10002 0, /* tp_descr_get */
10003 0, /* tp_descr_set */
10004 0, /* tp_dictoffset */
10005 0, /* tp_init */
10006 0, /* tp_alloc */
10007 unicode_new, /* tp_new */
10008 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009};
10010
10011/* Initialize the Unicode implementation */
10012
Thomas Wouters78890102000-07-22 19:25:51 +000010013void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010015 int i;
10016
Thomas Wouters477c8d52006-05-27 19:21:47 +000010017 /* XXX - move this array to unicodectype.c ? */
10018 Py_UNICODE linebreak[] = {
10019 0x000A, /* LINE FEED */
10020 0x000D, /* CARRIAGE RETURN */
10021 0x001C, /* FILE SEPARATOR */
10022 0x001D, /* GROUP SEPARATOR */
10023 0x001E, /* RECORD SEPARATOR */
10024 0x0085, /* NEXT LINE */
10025 0x2028, /* LINE SEPARATOR */
10026 0x2029, /* PARAGRAPH SEPARATOR */
10027 };
10028
Fred Drakee4315f52000-05-09 19:53:39 +000010029 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010030 free_list = NULL;
10031 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010033 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010034 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010035
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010036 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010037 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010038 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010040
10041 /* initialize the linebreak bloom filter */
10042 bloom_linebreak = make_bloom_mask(
10043 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10044 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010045
10046 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047}
10048
10049/* Finalize the Unicode implementation */
10050
Christian Heimesa156e092008-02-16 07:38:31 +000010051int
10052PyUnicode_ClearFreeList(void)
10053{
10054 int freelist_size = numfree;
10055 PyUnicodeObject *u;
10056
10057 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010058 PyUnicodeObject *v = u;
10059 u = *(PyUnicodeObject **)u;
10060 if (v->str)
10061 PyObject_DEL(v->str);
10062 Py_XDECREF(v->defenc);
10063 PyObject_Del(v);
10064 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010065 }
10066 free_list = NULL;
10067 assert(numfree == 0);
10068 return freelist_size;
10069}
10070
Guido van Rossumd57fd912000-03-10 22:53:23 +000010071void
Thomas Wouters78890102000-07-22 19:25:51 +000010072_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010073{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010074 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010076 Py_XDECREF(unicode_empty);
10077 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010078
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010079 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010080 if (unicode_latin1[i]) {
10081 Py_DECREF(unicode_latin1[i]);
10082 unicode_latin1[i] = NULL;
10083 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010084 }
Christian Heimesa156e092008-02-16 07:38:31 +000010085 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010087
Walter Dörwald16807132007-05-25 13:52:07 +000010088void
10089PyUnicode_InternInPlace(PyObject **p)
10090{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010091 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10092 PyObject *t;
10093 if (s == NULL || !PyUnicode_Check(s))
10094 Py_FatalError(
10095 "PyUnicode_InternInPlace: unicode strings only please!");
10096 /* If it's a subclass, we don't really know what putting
10097 it in the interned dict might do. */
10098 if (!PyUnicode_CheckExact(s))
10099 return;
10100 if (PyUnicode_CHECK_INTERNED(s))
10101 return;
10102 if (interned == NULL) {
10103 interned = PyDict_New();
10104 if (interned == NULL) {
10105 PyErr_Clear(); /* Don't leave an exception */
10106 return;
10107 }
10108 }
10109 /* It might be that the GetItem call fails even
10110 though the key is present in the dictionary,
10111 namely when this happens during a stack overflow. */
10112 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010113 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010114 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010115
Benjamin Peterson29060642009-01-31 22:14:21 +000010116 if (t) {
10117 Py_INCREF(t);
10118 Py_DECREF(*p);
10119 *p = t;
10120 return;
10121 }
Walter Dörwald16807132007-05-25 13:52:07 +000010122
Benjamin Peterson14339b62009-01-31 16:36:08 +000010123 PyThreadState_GET()->recursion_critical = 1;
10124 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10125 PyErr_Clear();
10126 PyThreadState_GET()->recursion_critical = 0;
10127 return;
10128 }
10129 PyThreadState_GET()->recursion_critical = 0;
10130 /* The two references in interned are not counted by refcnt.
10131 The deallocator will take care of this */
10132 Py_REFCNT(s) -= 2;
10133 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010134}
10135
10136void
10137PyUnicode_InternImmortal(PyObject **p)
10138{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010139 PyUnicode_InternInPlace(p);
10140 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10141 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10142 Py_INCREF(*p);
10143 }
Walter Dörwald16807132007-05-25 13:52:07 +000010144}
10145
10146PyObject *
10147PyUnicode_InternFromString(const char *cp)
10148{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010149 PyObject *s = PyUnicode_FromString(cp);
10150 if (s == NULL)
10151 return NULL;
10152 PyUnicode_InternInPlace(&s);
10153 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010154}
10155
Alexander Belopolsky40018472011-02-26 01:02:56 +000010156void
10157_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010158{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010159 PyObject *keys;
10160 PyUnicodeObject *s;
10161 Py_ssize_t i, n;
10162 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010163
Benjamin Peterson14339b62009-01-31 16:36:08 +000010164 if (interned == NULL || !PyDict_Check(interned))
10165 return;
10166 keys = PyDict_Keys(interned);
10167 if (keys == NULL || !PyList_Check(keys)) {
10168 PyErr_Clear();
10169 return;
10170 }
Walter Dörwald16807132007-05-25 13:52:07 +000010171
Benjamin Peterson14339b62009-01-31 16:36:08 +000010172 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10173 detector, interned unicode strings are not forcibly deallocated;
10174 rather, we give them their stolen references back, and then clear
10175 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010176
Benjamin Peterson14339b62009-01-31 16:36:08 +000010177 n = PyList_GET_SIZE(keys);
10178 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010179 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010180 for (i = 0; i < n; i++) {
10181 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10182 switch (s->state) {
10183 case SSTATE_NOT_INTERNED:
10184 /* XXX Shouldn't happen */
10185 break;
10186 case SSTATE_INTERNED_IMMORTAL:
10187 Py_REFCNT(s) += 1;
10188 immortal_size += s->length;
10189 break;
10190 case SSTATE_INTERNED_MORTAL:
10191 Py_REFCNT(s) += 2;
10192 mortal_size += s->length;
10193 break;
10194 default:
10195 Py_FatalError("Inconsistent interned string state.");
10196 }
10197 s->state = SSTATE_NOT_INTERNED;
10198 }
10199 fprintf(stderr, "total size of all interned strings: "
10200 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10201 "mortal/immortal\n", mortal_size, immortal_size);
10202 Py_DECREF(keys);
10203 PyDict_Clear(interned);
10204 Py_DECREF(interned);
10205 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010206}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010207
10208
10209/********************* Unicode Iterator **************************/
10210
10211typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010212 PyObject_HEAD
10213 Py_ssize_t it_index;
10214 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010215} unicodeiterobject;
10216
10217static void
10218unicodeiter_dealloc(unicodeiterobject *it)
10219{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010220 _PyObject_GC_UNTRACK(it);
10221 Py_XDECREF(it->it_seq);
10222 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010223}
10224
10225static int
10226unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10227{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010228 Py_VISIT(it->it_seq);
10229 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010230}
10231
10232static PyObject *
10233unicodeiter_next(unicodeiterobject *it)
10234{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010235 PyUnicodeObject *seq;
10236 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010237
Benjamin Peterson14339b62009-01-31 16:36:08 +000010238 assert(it != NULL);
10239 seq = it->it_seq;
10240 if (seq == NULL)
10241 return NULL;
10242 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010243
Benjamin Peterson14339b62009-01-31 16:36:08 +000010244 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10245 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010246 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010247 if (item != NULL)
10248 ++it->it_index;
10249 return item;
10250 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010251
Benjamin Peterson14339b62009-01-31 16:36:08 +000010252 Py_DECREF(seq);
10253 it->it_seq = NULL;
10254 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010255}
10256
10257static PyObject *
10258unicodeiter_len(unicodeiterobject *it)
10259{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010260 Py_ssize_t len = 0;
10261 if (it->it_seq)
10262 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10263 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010264}
10265
10266PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10267
10268static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010269 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010270 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010271 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010272};
10273
10274PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010275 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10276 "str_iterator", /* tp_name */
10277 sizeof(unicodeiterobject), /* tp_basicsize */
10278 0, /* tp_itemsize */
10279 /* methods */
10280 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10281 0, /* tp_print */
10282 0, /* tp_getattr */
10283 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010284 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010285 0, /* tp_repr */
10286 0, /* tp_as_number */
10287 0, /* tp_as_sequence */
10288 0, /* tp_as_mapping */
10289 0, /* tp_hash */
10290 0, /* tp_call */
10291 0, /* tp_str */
10292 PyObject_GenericGetAttr, /* tp_getattro */
10293 0, /* tp_setattro */
10294 0, /* tp_as_buffer */
10295 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10296 0, /* tp_doc */
10297 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10298 0, /* tp_clear */
10299 0, /* tp_richcompare */
10300 0, /* tp_weaklistoffset */
10301 PyObject_SelfIter, /* tp_iter */
10302 (iternextfunc)unicodeiter_next, /* tp_iternext */
10303 unicodeiter_methods, /* tp_methods */
10304 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010305};
10306
10307static PyObject *
10308unicode_iter(PyObject *seq)
10309{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010310 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010311
Benjamin Peterson14339b62009-01-31 16:36:08 +000010312 if (!PyUnicode_Check(seq)) {
10313 PyErr_BadInternalCall();
10314 return NULL;
10315 }
10316 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10317 if (it == NULL)
10318 return NULL;
10319 it->it_index = 0;
10320 Py_INCREF(seq);
10321 it->it_seq = (PyUnicodeObject *)seq;
10322 _PyObject_GC_TRACK(it);
10323 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010324}
10325
Martin v. Löwis5b222132007-06-10 09:51:05 +000010326size_t
10327Py_UNICODE_strlen(const Py_UNICODE *u)
10328{
10329 int res = 0;
10330 while(*u++)
10331 res++;
10332 return res;
10333}
10334
10335Py_UNICODE*
10336Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10337{
10338 Py_UNICODE *u = s1;
10339 while ((*u++ = *s2++));
10340 return s1;
10341}
10342
10343Py_UNICODE*
10344Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10345{
10346 Py_UNICODE *u = s1;
10347 while ((*u++ = *s2++))
10348 if (n-- == 0)
10349 break;
10350 return s1;
10351}
10352
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010353Py_UNICODE*
10354Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10355{
10356 Py_UNICODE *u1 = s1;
10357 u1 += Py_UNICODE_strlen(u1);
10358 Py_UNICODE_strcpy(u1, s2);
10359 return s1;
10360}
10361
Martin v. Löwis5b222132007-06-10 09:51:05 +000010362int
10363Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10364{
10365 while (*s1 && *s2 && *s1 == *s2)
10366 s1++, s2++;
10367 if (*s1 && *s2)
10368 return (*s1 < *s2) ? -1 : +1;
10369 if (*s1)
10370 return 1;
10371 if (*s2)
10372 return -1;
10373 return 0;
10374}
10375
Victor Stinneref8d95c2010-08-16 22:03:11 +000010376int
10377Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10378{
10379 register Py_UNICODE u1, u2;
10380 for (; n != 0; n--) {
10381 u1 = *s1;
10382 u2 = *s2;
10383 if (u1 != u2)
10384 return (u1 < u2) ? -1 : +1;
10385 if (u1 == '\0')
10386 return 0;
10387 s1++;
10388 s2++;
10389 }
10390 return 0;
10391}
10392
Martin v. Löwis5b222132007-06-10 09:51:05 +000010393Py_UNICODE*
10394Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10395{
10396 const Py_UNICODE *p;
10397 for (p = s; *p; p++)
10398 if (*p == c)
10399 return (Py_UNICODE*)p;
10400 return NULL;
10401}
10402
Victor Stinner331ea922010-08-10 16:37:20 +000010403Py_UNICODE*
10404Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10405{
10406 const Py_UNICODE *p;
10407 p = s + Py_UNICODE_strlen(s);
10408 while (p != s) {
10409 p--;
10410 if (*p == c)
10411 return (Py_UNICODE*)p;
10412 }
10413 return NULL;
10414}
10415
Victor Stinner71133ff2010-09-01 23:43:53 +000010416Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010417PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010418{
10419 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10420 Py_UNICODE *copy;
10421 Py_ssize_t size;
10422
10423 /* Ensure we won't overflow the size. */
10424 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10425 PyErr_NoMemory();
10426 return NULL;
10427 }
10428 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10429 size *= sizeof(Py_UNICODE);
10430 copy = PyMem_Malloc(size);
10431 if (copy == NULL) {
10432 PyErr_NoMemory();
10433 return NULL;
10434 }
10435 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10436 return copy;
10437}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010438
Georg Brandl66c221e2010-10-14 07:04:07 +000010439/* A _string module, to export formatter_parser and formatter_field_name_split
10440 to the string.Formatter class implemented in Python. */
10441
10442static PyMethodDef _string_methods[] = {
10443 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10444 METH_O, PyDoc_STR("split the argument as a field name")},
10445 {"formatter_parser", (PyCFunction) formatter_parser,
10446 METH_O, PyDoc_STR("parse the argument as a format string")},
10447 {NULL, NULL}
10448};
10449
10450static struct PyModuleDef _string_module = {
10451 PyModuleDef_HEAD_INIT,
10452 "_string",
10453 PyDoc_STR("string helper module"),
10454 0,
10455 _string_methods,
10456 NULL,
10457 NULL,
10458 NULL,
10459 NULL
10460};
10461
10462PyMODINIT_FUNC
10463PyInit__string(void)
10464{
10465 return PyModule_Create(&_string_module);
10466}
10467
10468
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010469#ifdef __cplusplus
10470}
10471#endif