blob: 4f1177e1ad5f56bef80bb2e1dcfe019bc253298f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.

   One entry per ASCII code point (0x00..0x7F), 16 entries per row;
   an entry is 1 when the code point counts as whitespace:

     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Same for linebreaks.

   One entry per ASCII code point (0x00..0x7F), 16 entries per row;
   an entry is 1 when the code point is a line break:

     0x0A LINE FEED          0x0B LINE TABULATION
     0x0C FORM FEED          0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR     0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR

   The table is only ever read (see BLOOM_LINEBREAK), so it is declared
   const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Non-zero when `chr` is a member of the `setlen`-character array `set`.
   The bloom mask is tested first so that unicode_member() is only called
   when the mask bit for `chr` is set.  The whole expansion is
   parenthesised so it can be negated or combined with other operators;
   note that `chr` is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
309/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000310 Ux0000 terminated; some code (e.g. new_identifier)
311 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
Walter Dörwald346737f2007-05-31 10:44:43 +0000681static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000684{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 *fmt++ = '%';
686 if (width) {
687 if (zeropad)
688 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width);
690 }
691 if (precision)
692 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag)
694 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702#else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706#endif
707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000708 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T;
710 while (*f)
711 *fmt++ = *f++;
712 }
713 *fmt++ = c;
714 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000715}
716
/* Helper for PyUnicode_FromFormatV(): parse the flags of one conversion.

   f must point at the '%' that introduces the conversion.  The optional
   "width" or "width.precision" digits are consumed, followed by an optional
   length modifier: "l", "ll" (only when HAVE_LONG_LONG is defined) or "z".
   A length modifier is recognised only when it is directly followed by 'd'
   or 'u'; otherwise it is left unconsumed.

   Every p_* argument may be NULL, in which case that value is not reported.
   Otherwise *p_width and *p_precision receive the parsed numbers (0 when
   absent) and *p_longflag, *p_longlongflag and *p_size_tflag receive 0 or 1.

   Return the updated position in the format string: normally the conversion
   character the caller should dispatch on.  For the bogus inputs handled
   below the position is moved back by one character. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width, precision, longflag, longlongflag, size_tflag;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    width = 0;
    while (Py_ISDIGIT((unsigned)*f))
        width = (width*10) + *f++ - '0';
    precision = 0;
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f))
            precision = (precision*10) + *f++ - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    /* size_tflag is stored through p_size_tflag for every conversion, so it
       must start out cleared; leaving it unset would hand an indeterminate
       value to the caller whenever no "z" modifier is present. */
    size_tflag = 0;

    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
        size_tflag = 1;
        ++f;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
780
Walter Dörwaldd2034312007-05-18 16:29:38 +0000781#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
782
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000783/* size of fixed-size buffer for formatting single arguments */
784#define ITEM_BUFFER_LEN 21
785/* maximum number of characters required for output of %ld. 21 characters
786 allows for 64-bit integers (in decimal) and an optional sign. */
787#define MAX_LONG_CHARS 21
788/* maximum number of characters required for output of %lld.
789 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
790 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
791#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
792
Walter Dörwaldd2034312007-05-18 16:29:38 +0000793PyObject *
794PyUnicode_FromFormatV(const char *format, va_list vargs)
795{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000796 va_list count;
797 Py_ssize_t callcount = 0;
798 PyObject **callresults = NULL;
799 PyObject **callresult = NULL;
800 Py_ssize_t n = 0;
801 int width = 0;
802 int precision = 0;
803 int zeropad;
804 const char* f;
805 Py_UNICODE *s;
806 PyObject *string;
807 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000808 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000809 /* use abuffer instead of buffer, if we need more space
810 * (which can happen if there's a format specifier with width). */
811 char *abuffer = NULL;
812 char *realbuffer;
813 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000814 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000815 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000817 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000818 /* step 1: count the number of %S/%R/%A/%s format specifications
819 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
820 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
821 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000822 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000823 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +0000824 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
825 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
826 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000827 ++callcount;
828 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000829 else if (128 <= (unsigned char)*f) {
830 PyErr_Format(PyExc_ValueError,
831 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000832 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000833 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000834 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000835 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000836 }
837 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000838 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000839 if (callcount) {
840 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
841 if (!callresults) {
842 PyErr_NoMemory();
843 return NULL;
844 }
845 callresult = callresults;
846 }
847 /* step 3: figure out how large a buffer we need */
848 for (f = format; *f; f++) {
849 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000850#ifdef HAVE_LONG_LONG
Victor Stinner96865452011-03-01 23:44:09 +0000851 int longlongflag;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000852#endif
Victor Stinner96865452011-03-01 23:44:09 +0000853 const char* p;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000854
Victor Stinner96865452011-03-01 23:44:09 +0000855 p = f;
856 f = parse_format_flags(f, &width, NULL,
857 NULL, &longlongflag, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000858
Benjamin Peterson14339b62009-01-31 16:36:08 +0000859 switch (*f) {
860 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000861 {
862#ifndef Py_UNICODE_WIDE
863 int ordinal = va_arg(count, int);
864 if (ordinal > 0xffff)
865 n += 2;
866 else
867 n++;
868#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000869 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000870 n++;
871#endif
872 break;
873 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000874 case '%':
875 n++;
876 break;
877 case 'd': case 'u': case 'i': case 'x':
878 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000879#ifdef HAVE_LONG_LONG
880 if (longlongflag) {
881 if (width < MAX_LONG_LONG_CHARS)
882 width = MAX_LONG_LONG_CHARS;
883 }
884 else
885#endif
886 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
887 including sign. Decimal takes the most space. This
888 isn't enough for octal. If a width is specified we
889 need more (which we allocate later). */
890 if (width < MAX_LONG_CHARS)
891 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000892 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000893 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000894 if (abuffersize < width)
895 abuffersize = width;
896 break;
897 case 's':
898 {
899 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000900 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000901 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
902 if (!str)
903 goto fail;
904 n += PyUnicode_GET_SIZE(str);
905 /* Remember the str and switch to the next slot */
906 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000907 break;
908 }
909 case 'U':
910 {
911 PyObject *obj = va_arg(count, PyObject *);
912 assert(obj && PyUnicode_Check(obj));
913 n += PyUnicode_GET_SIZE(obj);
914 break;
915 }
916 case 'V':
917 {
918 PyObject *obj = va_arg(count, PyObject *);
919 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000920 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000921 assert(obj || str);
922 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000923 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000924 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000925 *callresult++ = NULL;
926 }
927 else {
928 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
929 if (!str_obj)
930 goto fail;
931 n += PyUnicode_GET_SIZE(str_obj);
932 *callresult++ = str_obj;
933 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000934 break;
935 }
936 case 'S':
937 {
938 PyObject *obj = va_arg(count, PyObject *);
939 PyObject *str;
940 assert(obj);
941 str = PyObject_Str(obj);
942 if (!str)
943 goto fail;
944 n += PyUnicode_GET_SIZE(str);
945 /* Remember the str and switch to the next slot */
946 *callresult++ = str;
947 break;
948 }
949 case 'R':
950 {
951 PyObject *obj = va_arg(count, PyObject *);
952 PyObject *repr;
953 assert(obj);
954 repr = PyObject_Repr(obj);
955 if (!repr)
956 goto fail;
957 n += PyUnicode_GET_SIZE(repr);
958 /* Remember the repr and switch to the next slot */
959 *callresult++ = repr;
960 break;
961 }
962 case 'A':
963 {
964 PyObject *obj = va_arg(count, PyObject *);
965 PyObject *ascii;
966 assert(obj);
967 ascii = PyObject_ASCII(obj);
968 if (!ascii)
969 goto fail;
970 n += PyUnicode_GET_SIZE(ascii);
971 /* Remember the repr and switch to the next slot */
972 *callresult++ = ascii;
973 break;
974 }
975 case 'p':
976 (void) va_arg(count, int);
977 /* maximum 64-bit pointer representation:
978 * 0xffffffffffffffff
979 * so 19 characters is enough.
980 * XXX I count 18 -- what's the extra for?
981 */
982 n += 19;
983 break;
984 default:
985 /* if we stumble upon an unknown
986 formatting code, copy the rest of
987 the format string to the output
988 string. (we cannot just skip the
989 code, since there's no way to know
990 what's in the argument list) */
991 n += strlen(p);
992 goto expand;
993 }
994 } else
995 n++;
996 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000997 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000998 if (abuffersize > ITEM_BUFFER_LEN) {
999 /* add 1 for sprintf's trailing null byte */
1000 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001001 if (!abuffer) {
1002 PyErr_NoMemory();
1003 goto fail;
1004 }
1005 realbuffer = abuffer;
1006 }
1007 else
1008 realbuffer = buffer;
1009 /* step 4: fill the buffer */
1010 /* Since we've analyzed how much space we need for the worst case,
1011 we don't have to resize the string.
1012 There can be no errors beyond this point. */
1013 string = PyUnicode_FromUnicode(NULL, n);
1014 if (!string)
1015 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001016
Benjamin Peterson14339b62009-01-31 16:36:08 +00001017 s = PyUnicode_AS_UNICODE(string);
1018 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001019
Benjamin Peterson14339b62009-01-31 16:36:08 +00001020 for (f = format; *f; f++) {
1021 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001022 const char* p;
1023 int longflag;
1024 int longlongflag;
1025 int size_tflag;
1026
1027 p = f;
1028 zeropad = (f[1] == '0');
1029 f = parse_format_flags(f, &width, &precision,
1030 &longflag, &longlongflag, &size_tflag);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001031
Benjamin Peterson14339b62009-01-31 16:36:08 +00001032 switch (*f) {
1033 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001034 {
1035 int ordinal = va_arg(vargs, int);
1036#ifndef Py_UNICODE_WIDE
1037 if (ordinal > 0xffff) {
1038 ordinal -= 0x10000;
1039 *s++ = 0xD800 | (ordinal >> 10);
1040 *s++ = 0xDC00 | (ordinal & 0x3FF);
1041 } else
1042#endif
1043 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001045 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001046 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001047 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1048 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001049 if (longflag)
1050 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001051#ifdef HAVE_LONG_LONG
1052 else if (longlongflag)
1053 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1054#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001055 else if (size_tflag)
1056 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1057 else
1058 sprintf(realbuffer, fmt, va_arg(vargs, int));
1059 appendstring(realbuffer);
1060 break;
1061 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001062 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1063 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 if (longflag)
1065 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001066#ifdef HAVE_LONG_LONG
1067 else if (longlongflag)
1068 sprintf(realbuffer, fmt, va_arg(vargs,
1069 unsigned PY_LONG_LONG));
1070#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001071 else if (size_tflag)
1072 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1073 else
1074 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1075 appendstring(realbuffer);
1076 break;
1077 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001078 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 sprintf(realbuffer, fmt, va_arg(vargs, int));
1080 appendstring(realbuffer);
1081 break;
1082 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001083 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001084 sprintf(realbuffer, fmt, va_arg(vargs, int));
1085 appendstring(realbuffer);
1086 break;
1087 case 's':
1088 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001089 /* unused, since we already have the result */
1090 (void) va_arg(vargs, char *);
1091 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1092 PyUnicode_GET_SIZE(*callresult));
1093 s += PyUnicode_GET_SIZE(*callresult);
1094 /* We're done with the unicode()/repr() => forget it */
1095 Py_DECREF(*callresult);
1096 /* switch to next unicode()/repr() result */
1097 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001098 break;
1099 }
1100 case 'U':
1101 {
1102 PyObject *obj = va_arg(vargs, PyObject *);
1103 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1104 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1105 s += size;
1106 break;
1107 }
1108 case 'V':
1109 {
1110 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001111 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001112 if (obj) {
1113 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1115 s += size;
1116 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001117 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1118 PyUnicode_GET_SIZE(*callresult));
1119 s += PyUnicode_GET_SIZE(*callresult);
1120 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001121 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001122 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001123 break;
1124 }
1125 case 'S':
1126 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001127 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001128 {
1129 Py_UNICODE *ucopy;
1130 Py_ssize_t usize;
1131 Py_ssize_t upos;
1132 /* unused, since we already have the result */
1133 (void) va_arg(vargs, PyObject *);
1134 ucopy = PyUnicode_AS_UNICODE(*callresult);
1135 usize = PyUnicode_GET_SIZE(*callresult);
1136 for (upos = 0; upos<usize;)
1137 *s++ = ucopy[upos++];
1138 /* We're done with the unicode()/repr() => forget it */
1139 Py_DECREF(*callresult);
1140 /* switch to next unicode()/repr() result */
1141 ++callresult;
1142 break;
1143 }
1144 case 'p':
1145 sprintf(buffer, "%p", va_arg(vargs, void*));
1146 /* %p is ill-defined: ensure leading 0x. */
1147 if (buffer[1] == 'X')
1148 buffer[1] = 'x';
1149 else if (buffer[1] != 'x') {
1150 memmove(buffer+2, buffer, strlen(buffer)+1);
1151 buffer[0] = '0';
1152 buffer[1] = 'x';
1153 }
1154 appendstring(buffer);
1155 break;
1156 case '%':
1157 *s++ = '%';
1158 break;
1159 default:
1160 appendstring(p);
1161 goto end;
1162 }
Victor Stinner1205f272010-09-11 00:54:47 +00001163 }
Victor Stinner1205f272010-09-11 00:54:47 +00001164 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001165 *s++ = *f;
1166 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001169 if (callresults)
1170 PyObject_Free(callresults);
1171 if (abuffer)
1172 PyObject_Free(abuffer);
1173 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1174 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001175 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001176 if (callresults) {
1177 PyObject **callresult2 = callresults;
1178 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001179 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001180 ++callresult2;
1181 }
1182 PyObject_Free(callresults);
1183 }
1184 if (abuffer)
1185 PyObject_Free(abuffer);
1186 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001187}
1188
1189#undef appendstring
1190
1191PyObject *
1192PyUnicode_FromFormat(const char *format, ...)
1193{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001194 PyObject* ret;
1195 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001196
1197#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001198 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001199#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001200 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001201#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001202 ret = PyUnicode_FromFormatV(format, vargs);
1203 va_end(vargs);
1204 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001205}
1206
Victor Stinner5593d8a2010-10-02 11:11:27 +00001207/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1208 convert a Unicode object to a wide character string.
1209
1210 - If w is NULL: return the number of wide characters (including the nul
1211 character) required to convert the unicode object. Ignore size argument.
1212
1213 - Otherwise: return the number of wide characters (excluding the nul
1214 character) written into w. Write at most size wide characters (including
1215 the nul character). */
1216static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001217unicode_aswidechar(PyUnicodeObject *unicode,
1218 wchar_t *w,
1219 Py_ssize_t size)
1220{
1221#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001222 Py_ssize_t res;
1223 if (w != NULL) {
1224 res = PyUnicode_GET_SIZE(unicode);
1225 if (size > res)
1226 size = res + 1;
1227 else
1228 res = size;
1229 memcpy(w, unicode->str, size * sizeof(wchar_t));
1230 return res;
1231 }
1232 else
1233 return PyUnicode_GET_SIZE(unicode) + 1;
1234#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1235 register const Py_UNICODE *u;
1236 const Py_UNICODE *uend;
1237 const wchar_t *worig, *wend;
1238 Py_ssize_t nchar;
1239
Victor Stinner137c34c2010-09-29 10:25:54 +00001240 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001241 uend = u + PyUnicode_GET_SIZE(unicode);
1242 if (w != NULL) {
1243 worig = w;
1244 wend = w + size;
1245 while (u != uend && w != wend) {
1246 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1247 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1248 {
1249 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1250 u += 2;
1251 }
1252 else {
1253 *w = *u;
1254 u++;
1255 }
1256 w++;
1257 }
1258 if (w != wend)
1259 *w = L'\0';
1260 return w - worig;
1261 }
1262 else {
1263 nchar = 1; /* nul character at the end */
1264 while (u != uend) {
1265 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1266 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1267 u += 2;
1268 else
1269 u++;
1270 nchar++;
1271 }
1272 }
1273 return nchar;
1274#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1275 register Py_UNICODE *u, *uend, ordinal;
1276 register Py_ssize_t i;
1277 wchar_t *worig, *wend;
1278 Py_ssize_t nchar;
1279
1280 u = PyUnicode_AS_UNICODE(unicode);
1281 uend = u + PyUnicode_GET_SIZE(u);
1282 if (w != NULL) {
1283 worig = w;
1284 wend = w + size;
1285 while (u != uend && w != wend) {
1286 ordinal = *u;
1287 if (ordinal > 0xffff) {
1288 ordinal -= 0x10000;
1289 *w++ = 0xD800 | (ordinal >> 10);
1290 *w++ = 0xDC00 | (ordinal & 0x3FF);
1291 }
1292 else
1293 *w++ = ordinal;
1294 u++;
1295 }
1296 if (w != wend)
1297 *w = 0;
1298 return w - worig;
1299 }
1300 else {
1301 nchar = 1; /* nul character */
1302 while (u != uend) {
1303 if (*u > 0xffff)
1304 nchar += 2;
1305 else
1306 nchar++;
1307 u++;
1308 }
1309 return nchar;
1310 }
1311#else
1312# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001313#endif
1314}
1315
1316Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001317PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001318 wchar_t *w,
1319 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320{
1321 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001322 PyErr_BadInternalCall();
1323 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001325 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326}
1327
Victor Stinner137c34c2010-09-29 10:25:54 +00001328wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001329PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001330 Py_ssize_t *size)
1331{
1332 wchar_t* buffer;
1333 Py_ssize_t buflen;
1334
1335 if (unicode == NULL) {
1336 PyErr_BadInternalCall();
1337 return NULL;
1338 }
1339
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001340 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001341 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001342 PyErr_NoMemory();
1343 return NULL;
1344 }
1345
Victor Stinner137c34c2010-09-29 10:25:54 +00001346 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1347 if (buffer == NULL) {
1348 PyErr_NoMemory();
1349 return NULL;
1350 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001351 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001352 if (size != NULL)
1353 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001354 return buffer;
1355}
1356
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357#endif
1358
Alexander Belopolsky40018472011-02-26 01:02:56 +00001359PyObject *
1360PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001361{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001362 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001363
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001364 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 PyErr_SetString(PyExc_ValueError,
1366 "chr() arg not in range(0x110000)");
1367 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001368 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001369
1370#ifndef Py_UNICODE_WIDE
1371 if (ordinal > 0xffff) {
1372 ordinal -= 0x10000;
1373 s[0] = 0xD800 | (ordinal >> 10);
1374 s[1] = 0xDC00 | (ordinal & 0x3FF);
1375 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001376 }
1377#endif
1378
Hye-Shik Chang40574832004-04-06 07:24:51 +00001379 s[0] = (Py_UNICODE)ordinal;
1380 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001381}
1382
Alexander Belopolsky40018472011-02-26 01:02:56 +00001383PyObject *
1384PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001386 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001387 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001388 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001389 Py_INCREF(obj);
1390 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001391 }
1392 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001393 /* For a Unicode subtype that's not a Unicode object,
1394 return a true Unicode object with the same data. */
1395 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1396 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001397 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001398 PyErr_Format(PyExc_TypeError,
1399 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001400 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001401 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001402}
1403
Alexander Belopolsky40018472011-02-26 01:02:56 +00001404PyObject *
1405PyUnicode_FromEncodedObject(register PyObject *obj,
1406 const char *encoding,
1407 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001408{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001409 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001410 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001411
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001413 PyErr_BadInternalCall();
1414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001416
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001417 /* Decoding bytes objects is the most common case and should be fast */
1418 if (PyBytes_Check(obj)) {
1419 if (PyBytes_GET_SIZE(obj) == 0) {
1420 Py_INCREF(unicode_empty);
1421 v = (PyObject *) unicode_empty;
1422 }
1423 else {
1424 v = PyUnicode_Decode(
1425 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1426 encoding, errors);
1427 }
1428 return v;
1429 }
1430
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001431 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001432 PyErr_SetString(PyExc_TypeError,
1433 "decoding str is not supported");
1434 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001435 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001436
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001437 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1438 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1439 PyErr_Format(PyExc_TypeError,
1440 "coercing to str: need bytes, bytearray "
1441 "or buffer-like object, %.80s found",
1442 Py_TYPE(obj)->tp_name);
1443 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001444 }
Tim Petersced69f82003-09-16 20:30:58 +00001445
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001446 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001447 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001448 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001449 }
Tim Petersced69f82003-09-16 20:30:58 +00001450 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001451 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001452
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001453 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001454 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455}
1456
/* Copy encoding into lower, turning upper case letters into lower case and
   '_' into '-', so that spellings such as UTF_8 are caught.

   lower has room for lower_len chars including the terminating nul.
   Return 1 on success; return 0 when encoding is longer than lower_len-1
   characters, in which case lower is left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last slot of the output, reserved for the terminator */
    char *const dst_last = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == dst_last)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
1489
Alexander Belopolsky40018472011-02-26 01:02:56 +00001490PyObject *
1491PyUnicode_Decode(const char *s,
1492 Py_ssize_t size,
1493 const char *encoding,
1494 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001495{
1496 PyObject *buffer = NULL, *unicode;
1497 Py_buffer info;
1498 char lower[11]; /* Enough for any encoding shortcut */
1499
1500 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001501 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001502
1503 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001504 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001505 if ((strcmp(lower, "utf-8") == 0) ||
1506 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001507 return PyUnicode_DecodeUTF8(s, size, errors);
1508 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001509 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001510 (strcmp(lower, "iso-8859-1") == 0))
1511 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "mbcs") == 0)
1514 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001515#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001516 else if (strcmp(lower, "ascii") == 0)
1517 return PyUnicode_DecodeASCII(s, size, errors);
1518 else if (strcmp(lower, "utf-16") == 0)
1519 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1520 else if (strcmp(lower, "utf-32") == 0)
1521 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523
1524 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001525 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001526 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001527 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001528 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 if (buffer == NULL)
1530 goto onError;
1531 unicode = PyCodec_Decode(buffer, encoding, errors);
1532 if (unicode == NULL)
1533 goto onError;
1534 if (!PyUnicode_Check(unicode)) {
1535 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001536 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001537 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 Py_DECREF(unicode);
1539 goto onError;
1540 }
1541 Py_DECREF(buffer);
1542 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001543
Benjamin Peterson29060642009-01-31 22:14:21 +00001544 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 Py_XDECREF(buffer);
1546 return NULL;
1547}
1548
Alexander Belopolsky40018472011-02-26 01:02:56 +00001549PyObject *
1550PyUnicode_AsDecodedObject(PyObject *unicode,
1551 const char *encoding,
1552 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001553{
1554 PyObject *v;
1555
1556 if (!PyUnicode_Check(unicode)) {
1557 PyErr_BadArgument();
1558 goto onError;
1559 }
1560
1561 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001562 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001563
1564 /* Decode via the codec registry */
1565 v = PyCodec_Decode(unicode, encoding, errors);
1566 if (v == NULL)
1567 goto onError;
1568 return v;
1569
Benjamin Peterson29060642009-01-31 22:14:21 +00001570 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001571 return NULL;
1572}
1573
Alexander Belopolsky40018472011-02-26 01:02:56 +00001574PyObject *
1575PyUnicode_AsDecodedUnicode(PyObject *unicode,
1576 const char *encoding,
1577 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001578{
1579 PyObject *v;
1580
1581 if (!PyUnicode_Check(unicode)) {
1582 PyErr_BadArgument();
1583 goto onError;
1584 }
1585
1586 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001588
1589 /* Decode via the codec registry */
1590 v = PyCodec_Decode(unicode, encoding, errors);
1591 if (v == NULL)
1592 goto onError;
1593 if (!PyUnicode_Check(v)) {
1594 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001595 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001596 Py_TYPE(v)->tp_name);
1597 Py_DECREF(v);
1598 goto onError;
1599 }
1600 return v;
1601
Benjamin Peterson29060642009-01-31 22:14:21 +00001602 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001603 return NULL;
1604}
1605
Alexander Belopolsky40018472011-02-26 01:02:56 +00001606PyObject *
1607PyUnicode_Encode(const Py_UNICODE *s,
1608 Py_ssize_t size,
1609 const char *encoding,
1610 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611{
1612 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001613
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 unicode = PyUnicode_FromUnicode(s, size);
1615 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1618 Py_DECREF(unicode);
1619 return v;
1620}
1621
Alexander Belopolsky40018472011-02-26 01:02:56 +00001622PyObject *
1623PyUnicode_AsEncodedObject(PyObject *unicode,
1624 const char *encoding,
1625 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001626{
1627 PyObject *v;
1628
1629 if (!PyUnicode_Check(unicode)) {
1630 PyErr_BadArgument();
1631 goto onError;
1632 }
1633
1634 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001635 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001636
1637 /* Encode via the codec registry */
1638 v = PyCodec_Encode(unicode, encoding, errors);
1639 if (v == NULL)
1640 goto onError;
1641 return v;
1642
Benjamin Peterson29060642009-01-31 22:14:21 +00001643 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001644 return NULL;
1645}
1646
Victor Stinnerad158722010-10-27 00:25:46 +00001647PyObject *
1648PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001649{
Victor Stinner313a1202010-06-11 23:56:51 +00001650#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001651 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1652 PyUnicode_GET_SIZE(unicode),
1653 NULL);
1654#elif defined(__APPLE__)
1655 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1656 PyUnicode_GET_SIZE(unicode),
1657 "surrogateescape");
1658#else
1659 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001660 return PyUnicode_AsEncodedString(unicode,
1661 Py_FileSystemDefaultEncoding,
1662 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001663 }
1664 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001665 /* locale encoding with surrogateescape */
1666 wchar_t *wchar;
1667 char *bytes;
1668 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001669 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001670
1671 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1672 if (wchar == NULL)
1673 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001674 bytes = _Py_wchar2char(wchar, &error_pos);
1675 if (bytes == NULL) {
1676 if (error_pos != (size_t)-1) {
1677 char *errmsg = strerror(errno);
1678 PyObject *exc = NULL;
1679 if (errmsg == NULL)
1680 errmsg = "Py_wchar2char() failed";
1681 raise_encode_exception(&exc,
1682 "filesystemencoding",
1683 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1684 error_pos, error_pos+1,
1685 errmsg);
1686 Py_XDECREF(exc);
1687 }
1688 else
1689 PyErr_NoMemory();
1690 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001691 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001692 }
1693 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001694
1695 bytes_obj = PyBytes_FromString(bytes);
1696 PyMem_Free(bytes);
1697 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001698 }
Victor Stinnerad158722010-10-27 00:25:46 +00001699#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001700}
1701
Alexander Belopolsky40018472011-02-26 01:02:56 +00001702PyObject *
1703PyUnicode_AsEncodedString(PyObject *unicode,
1704 const char *encoding,
1705 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706{
1707 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001708 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001709
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 if (!PyUnicode_Check(unicode)) {
1711 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 }
Fred Drakee4315f52000-05-09 19:53:39 +00001714
Tim Petersced69f82003-09-16 20:30:58 +00001715 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001716 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1717 PyUnicode_GET_SIZE(unicode),
1718 errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001719
1720 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001721 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001722 if ((strcmp(lower, "utf-8") == 0) ||
1723 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001724 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1725 PyUnicode_GET_SIZE(unicode),
1726 errors);
1727 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001728 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001729 (strcmp(lower, "iso-8859-1") == 0))
1730 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1731 PyUnicode_GET_SIZE(unicode),
1732 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001733#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001734 else if (strcmp(lower, "mbcs") == 0)
1735 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1736 PyUnicode_GET_SIZE(unicode),
1737 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001738#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001739 else if (strcmp(lower, "ascii") == 0)
1740 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1741 PyUnicode_GET_SIZE(unicode),
1742 errors);
1743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744
1745 /* Encode via the codec registry */
1746 v = PyCodec_Encode(unicode, encoding, errors);
1747 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001748 return NULL;
1749
1750 /* The normal path */
1751 if (PyBytes_Check(v))
1752 return v;
1753
1754 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001755 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001756 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001757 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001758
1759 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1760 "encoder %s returned bytearray instead of bytes",
1761 encoding);
1762 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001763 Py_DECREF(v);
1764 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001765 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001766
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001767 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1768 Py_DECREF(v);
1769 return b;
1770 }
1771
1772 PyErr_Format(PyExc_TypeError,
1773 "encoder did not return a bytes object (type=%.400s)",
1774 Py_TYPE(v)->tp_name);
1775 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001776 return NULL;
1777}
1778
Alexander Belopolsky40018472011-02-26 01:02:56 +00001779PyObject *
1780PyUnicode_AsEncodedUnicode(PyObject *unicode,
1781 const char *encoding,
1782 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001783{
1784 PyObject *v;
1785
1786 if (!PyUnicode_Check(unicode)) {
1787 PyErr_BadArgument();
1788 goto onError;
1789 }
1790
1791 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001792 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001793
1794 /* Encode via the codec registry */
1795 v = PyCodec_Encode(unicode, encoding, errors);
1796 if (v == NULL)
1797 goto onError;
1798 if (!PyUnicode_Check(v)) {
1799 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001800 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001801 Py_TYPE(v)->tp_name);
1802 Py_DECREF(v);
1803 goto onError;
1804 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001806
Benjamin Peterson29060642009-01-31 22:14:21 +00001807 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 return NULL;
1809}
1810
Alexander Belopolsky40018472011-02-26 01:02:56 +00001811PyObject *
1812_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1813 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001814{
1815 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001816 if (v)
1817 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001818 if (errors != NULL)
1819 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001820 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001821 PyUnicode_GET_SIZE(unicode),
1822 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001823 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001824 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001825 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001826 return v;
1827}
1828
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001829PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001830PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001831 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001832 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1833}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001834
Christian Heimes5894ba72007-11-04 11:43:14 +00001835PyObject*
1836PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1837{
Victor Stinnerad158722010-10-27 00:25:46 +00001838#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1839 return PyUnicode_DecodeMBCS(s, size, NULL);
1840#elif defined(__APPLE__)
1841 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1842#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001843 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1844 can be undefined. If it is case, decode using UTF-8. The following assumes
1845 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1846 bootstrapping process where the codecs aren't ready yet.
1847 */
1848 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001849 return PyUnicode_Decode(s, size,
1850 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001851 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001852 }
1853 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001854 /* locale encoding with surrogateescape */
1855 wchar_t *wchar;
1856 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001857 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001858
1859 if (s[size] != '\0' || size != strlen(s)) {
1860 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1861 return NULL;
1862 }
1863
Victor Stinner168e1172010-10-16 23:16:16 +00001864 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001865 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001866 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001867
Victor Stinner168e1172010-10-16 23:16:16 +00001868 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001869 PyMem_Free(wchar);
1870 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001871 }
Victor Stinnerad158722010-10-27 00:25:46 +00001872#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001873}
1874
Martin v. Löwis011e8422009-05-05 04:43:17 +00001875
1876int
1877PyUnicode_FSConverter(PyObject* arg, void* addr)
1878{
1879 PyObject *output = NULL;
1880 Py_ssize_t size;
1881 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001882 if (arg == NULL) {
1883 Py_DECREF(*(PyObject**)addr);
1884 return 1;
1885 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001886 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001887 output = arg;
1888 Py_INCREF(output);
1889 }
1890 else {
1891 arg = PyUnicode_FromObject(arg);
1892 if (!arg)
1893 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001894 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001895 Py_DECREF(arg);
1896 if (!output)
1897 return 0;
1898 if (!PyBytes_Check(output)) {
1899 Py_DECREF(output);
1900 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1901 return 0;
1902 }
1903 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001904 size = PyBytes_GET_SIZE(output);
1905 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001906 if (size != strlen(data)) {
1907 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1908 Py_DECREF(output);
1909 return 0;
1910 }
1911 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001912 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001913}
1914
1915
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001916int
1917PyUnicode_FSDecoder(PyObject* arg, void* addr)
1918{
1919 PyObject *output = NULL;
1920 Py_ssize_t size;
1921 void *data;
1922 if (arg == NULL) {
1923 Py_DECREF(*(PyObject**)addr);
1924 return 1;
1925 }
1926 if (PyUnicode_Check(arg)) {
1927 output = arg;
1928 Py_INCREF(output);
1929 }
1930 else {
1931 arg = PyBytes_FromObject(arg);
1932 if (!arg)
1933 return 0;
1934 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1935 PyBytes_GET_SIZE(arg));
1936 Py_DECREF(arg);
1937 if (!output)
1938 return 0;
1939 if (!PyUnicode_Check(output)) {
1940 Py_DECREF(output);
1941 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1942 return 0;
1943 }
1944 }
1945 size = PyUnicode_GET_SIZE(output);
1946 data = PyUnicode_AS_UNICODE(output);
1947 if (size != Py_UNICODE_strlen(data)) {
1948 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1949 Py_DECREF(output);
1950 return 0;
1951 }
1952 *(PyObject**)addr = output;
1953 return Py_CLEANUP_SUPPORTED;
1954}
1955
1956
Martin v. Löwis5b222132007-06-10 09:51:05 +00001957char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001958_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001959{
Christian Heimesf3863112007-11-22 07:46:41 +00001960 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001961 if (!PyUnicode_Check(unicode)) {
1962 PyErr_BadArgument();
1963 return NULL;
1964 }
Christian Heimesf3863112007-11-22 07:46:41 +00001965 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1966 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001967 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001968 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001969 *psize = PyBytes_GET_SIZE(bytes);
1970 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001971}
1972
1973char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001974_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001975{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001976 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001977}
1978
Alexander Belopolsky40018472011-02-26 01:02:56 +00001979Py_UNICODE *
1980PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981{
1982 if (!PyUnicode_Check(unicode)) {
1983 PyErr_BadArgument();
1984 goto onError;
1985 }
1986 return PyUnicode_AS_UNICODE(unicode);
1987
Benjamin Peterson29060642009-01-31 22:14:21 +00001988 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 return NULL;
1990}
1991
Alexander Belopolsky40018472011-02-26 01:02:56 +00001992Py_ssize_t
1993PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994{
1995 if (!PyUnicode_Check(unicode)) {
1996 PyErr_BadArgument();
1997 goto onError;
1998 }
1999 return PyUnicode_GET_SIZE(unicode);
2000
Benjamin Peterson29060642009-01-31 22:14:21 +00002001 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002002 return -1;
2003}
2004
/* Name of the default encoding; this is always the constant "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2010
Victor Stinner554f3f02010-06-16 23:33:54 +00002011/* create or adjust a UnicodeDecodeError */
2012static void
2013make_decode_exception(PyObject **exceptionObject,
2014 const char *encoding,
2015 const char *input, Py_ssize_t length,
2016 Py_ssize_t startpos, Py_ssize_t endpos,
2017 const char *reason)
2018{
2019 if (*exceptionObject == NULL) {
2020 *exceptionObject = PyUnicodeDecodeError_Create(
2021 encoding, input, length, startpos, endpos, reason);
2022 }
2023 else {
2024 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2025 goto onError;
2026 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2027 goto onError;
2028 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2029 goto onError;
2030 }
2031 return;
2032
2033onError:
2034 Py_DECREF(*exceptionObject);
2035 *exceptionObject = NULL;
2036}
2037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038/* error handling callback helper:
2039 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002040 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041 and adjust various state variables.
2042 return 0 on success, -1 on error
2043*/
2044
Alexander Belopolsky40018472011-02-26 01:02:56 +00002045static int
2046unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2047 const char *encoding, const char *reason,
2048 const char **input, const char **inend, Py_ssize_t *startinpos,
2049 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2050 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002051{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002052 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053
2054 PyObject *restuple = NULL;
2055 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002056 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002057 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002058 Py_ssize_t requiredsize;
2059 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002061 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002062 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 int res = -1;
2064
2065 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002066 *errorHandler = PyCodec_LookupError(errors);
2067 if (*errorHandler == NULL)
2068 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 }
2070
Victor Stinner554f3f02010-06-16 23:33:54 +00002071 make_decode_exception(exceptionObject,
2072 encoding,
2073 *input, *inend - *input,
2074 *startinpos, *endinpos,
2075 reason);
2076 if (*exceptionObject == NULL)
2077 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002078
2079 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2080 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002081 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002083 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085 }
2086 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002088
2089 /* Copy back the bytes variables, which might have been modified by the
2090 callback */
2091 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2092 if (!inputobj)
2093 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002094 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002095 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002096 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002097 *input = PyBytes_AS_STRING(inputobj);
2098 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002099 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002100 /* we can DECREF safely, as the exception has another reference,
2101 so the object won't go away. */
2102 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002103
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002104 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002105 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002106 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002107 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2108 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002109 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002110
2111 /* need more space? (at least enough for what we
2112 have+the replacement+the rest of the string (starting
2113 at the new input position), so we won't have to check space
2114 when there are no errors in the rest of the string) */
2115 repptr = PyUnicode_AS_UNICODE(repunicode);
2116 repsize = PyUnicode_GET_SIZE(repunicode);
2117 requiredsize = *outpos + repsize + insize-newpos;
2118 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002119 if (requiredsize<2*outsize)
2120 requiredsize = 2*outsize;
2121 if (_PyUnicode_Resize(output, requiredsize) < 0)
2122 goto onError;
2123 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 }
2125 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002126 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 Py_UNICODE_COPY(*outptr, repptr, repsize);
2128 *outptr += repsize;
2129 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002130
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 /* we made it! */
2132 res = 0;
2133
Benjamin Peterson29060642009-01-31 22:14:21 +00002134 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 Py_XDECREF(restuple);
2136 return res;
2137}
2138
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that IS_BASE64 and
   FROM_BASE64 evaluate their argument more than once, so it must not
   have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code point (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The flag arguments are parenthesized so that compound expressions
 * such as `a || b` keep their meaning when combined with && below. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002219
Alexander Belopolsky40018472011-02-26 01:02:56 +00002220PyObject *
2221PyUnicode_DecodeUTF7(const char *s,
2222 Py_ssize_t size,
2223 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002224{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002225 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2226}
2227
Antoine Pitrou244651a2009-05-04 18:56:13 +00002228/* The decoder. The only state we preserve is our read position,
2229 * i.e. how many characters we have consumed. So if we end in the
2230 * middle of a shift sequence we have to back off the read position
2231 * and the output to the beginning of the sequence, otherwise we lose
2232 * all the shift state (seen bits, number of bits seen, high
2233 * surrogate). */
2234
Alexander Belopolsky40018472011-02-26 01:02:56 +00002235PyObject *
2236PyUnicode_DecodeUTF7Stateful(const char *s,
2237 Py_ssize_t size,
2238 const char *errors,
2239 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002240{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002241 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002242 Py_ssize_t startinpos;
2243 Py_ssize_t endinpos;
2244 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002245 const char *e;
2246 PyUnicodeObject *unicode;
2247 Py_UNICODE *p;
2248 const char *errmsg = "";
2249 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002250 Py_UNICODE *shiftOutStart;
2251 unsigned int base64bits = 0;
2252 unsigned long base64buffer = 0;
2253 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002254 PyObject *errorHandler = NULL;
2255 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002256
2257 unicode = _PyUnicode_New(size);
2258 if (!unicode)
2259 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002260 if (size == 0) {
2261 if (consumed)
2262 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002263 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002264 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002265
2266 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002267 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268 e = s + size;
2269
2270 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002272 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002273 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002274
Antoine Pitrou244651a2009-05-04 18:56:13 +00002275 if (inShift) { /* in a base-64 section */
2276 if (IS_BASE64(ch)) { /* consume a base-64 character */
2277 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2278 base64bits += 6;
2279 s++;
2280 if (base64bits >= 16) {
2281 /* we have enough bits for a UTF-16 value */
2282 Py_UNICODE outCh = (Py_UNICODE)
2283 (base64buffer >> (base64bits-16));
2284 base64bits -= 16;
2285 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2286 if (surrogate) {
2287 /* expecting a second surrogate */
2288 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2289#ifdef Py_UNICODE_WIDE
2290 *p++ = (((surrogate & 0x3FF)<<10)
2291 | (outCh & 0x3FF)) + 0x10000;
2292#else
2293 *p++ = surrogate;
2294 *p++ = outCh;
2295#endif
2296 surrogate = 0;
2297 }
2298 else {
2299 surrogate = 0;
2300 errmsg = "second surrogate missing";
2301 goto utf7Error;
2302 }
2303 }
2304 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2305 /* first surrogate */
2306 surrogate = outCh;
2307 }
2308 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2309 errmsg = "unexpected second surrogate";
2310 goto utf7Error;
2311 }
2312 else {
2313 *p++ = outCh;
2314 }
2315 }
2316 }
2317 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002318 inShift = 0;
2319 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002320 if (surrogate) {
2321 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002322 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002323 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002324 if (base64bits > 0) { /* left-over bits */
2325 if (base64bits >= 6) {
2326 /* We've seen at least one base-64 character */
2327 errmsg = "partial character in shift sequence";
2328 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002329 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002330 else {
2331 /* Some bits remain; they should be zero */
2332 if (base64buffer != 0) {
2333 errmsg = "non-zero padding bits in shift sequence";
2334 goto utf7Error;
2335 }
2336 }
2337 }
2338 if (ch != '-') {
2339 /* '-' is absorbed; other terminating
2340 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002341 *p++ = ch;
2342 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002343 }
2344 }
2345 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002346 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002347 s++; /* consume '+' */
2348 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002349 s++;
2350 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002351 }
2352 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002353 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002354 shiftOutStart = p;
2355 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356 }
2357 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002358 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002359 *p++ = ch;
2360 s++;
2361 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002362 else {
2363 startinpos = s-starts;
2364 s++;
2365 errmsg = "unexpected special character";
2366 goto utf7Error;
2367 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002368 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002369utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002370 outpos = p-PyUnicode_AS_UNICODE(unicode);
2371 endinpos = s-starts;
2372 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002373 errors, &errorHandler,
2374 "utf7", errmsg,
2375 &starts, &e, &startinpos, &endinpos, &exc, &s,
2376 &unicode, &outpos, &p))
2377 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002378 }
2379
Antoine Pitrou244651a2009-05-04 18:56:13 +00002380 /* end of string */
2381
2382 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2383 /* if we're in an inconsistent state, that's an error */
2384 if (surrogate ||
2385 (base64bits >= 6) ||
2386 (base64bits > 0 && base64buffer != 0)) {
2387 outpos = p-PyUnicode_AS_UNICODE(unicode);
2388 endinpos = size;
2389 if (unicode_decode_call_errorhandler(
2390 errors, &errorHandler,
2391 "utf7", "unterminated shift sequence",
2392 &starts, &e, &startinpos, &endinpos, &exc, &s,
2393 &unicode, &outpos, &p))
2394 goto onError;
2395 if (s < e)
2396 goto restart;
2397 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002398 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002399
2400 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002401 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002402 if (inShift) {
2403 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002404 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002405 }
2406 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002407 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002408 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002409 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002410
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002411 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002412 goto onError;
2413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002414 Py_XDECREF(errorHandler);
2415 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002416 return (PyObject *)unicode;
2417
Benjamin Peterson29060642009-01-31 22:14:21 +00002418 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002419 Py_XDECREF(errorHandler);
2420 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002421 Py_DECREF(unicode);
2422 return NULL;
2423}
2424
2425
Alexander Belopolsky40018472011-02-26 01:02:56 +00002426PyObject *
2427PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2428 Py_ssize_t size,
2429 int base64SetO,
2430 int base64WhiteSpace,
2431 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002432{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002433 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002434 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002435 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002436 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002437 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002438 unsigned int base64bits = 0;
2439 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002440 char * out;
2441 char * start;
2442
2443 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002444 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002445
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002446 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002447 return PyErr_NoMemory();
2448
Antoine Pitrou244651a2009-05-04 18:56:13 +00002449 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002450 if (v == NULL)
2451 return NULL;
2452
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002453 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002454 for (;i < size; ++i) {
2455 Py_UNICODE ch = s[i];
2456
Antoine Pitrou244651a2009-05-04 18:56:13 +00002457 if (inShift) {
2458 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2459 /* shifting out */
2460 if (base64bits) { /* output remaining bits */
2461 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2462 base64buffer = 0;
2463 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002464 }
2465 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002466 /* Characters not in the BASE64 set implicitly unshift the sequence
2467 so no '-' is required, except if the character is itself a '-' */
2468 if (IS_BASE64(ch) || ch == '-') {
2469 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002471 *out++ = (char) ch;
2472 }
2473 else {
2474 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002475 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002476 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002477 else { /* not in a shift sequence */
2478 if (ch == '+') {
2479 *out++ = '+';
2480 *out++ = '-';
2481 }
2482 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2483 *out++ = (char) ch;
2484 }
2485 else {
2486 *out++ = '+';
2487 inShift = 1;
2488 goto encode_char;
2489 }
2490 }
2491 continue;
2492encode_char:
2493#ifdef Py_UNICODE_WIDE
2494 if (ch >= 0x10000) {
2495 /* code first surrogate */
2496 base64bits += 16;
2497 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2498 while (base64bits >= 6) {
2499 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2500 base64bits -= 6;
2501 }
2502 /* prepare second surrogate */
2503 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2504 }
2505#endif
2506 base64bits += 16;
2507 base64buffer = (base64buffer << 16) | ch;
2508 while (base64bits >= 6) {
2509 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2510 base64bits -= 6;
2511 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002512 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002513 if (base64bits)
2514 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2515 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002516 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002517 if (_PyBytes_Resize(&v, out - start) < 0)
2518 return NULL;
2519 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002520}
2521
Antoine Pitrou244651a2009-05-04 18:56:13 +00002522#undef IS_BASE64
2523#undef FROM_BASE64
2524#undef TO_BASE64
2525#undef DECODE_DIRECT
2526#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002527
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528/* --- UTF-8 Codec -------------------------------------------------------- */
2529
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.  An entry
   of zero marks a byte that cannot start a sequence.  See RFC 3629. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F: ASCII */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F: continuation bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 illegal, C2-CF: 2 bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF: 2 bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF: 3 bytes */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4: 4 bytes, F5-FF illegal */
};
2551
Alexander Belopolsky40018472011-02-26 01:02:56 +00002552PyObject *
2553PyUnicode_DecodeUTF8(const char *s,
2554 Py_ssize_t size,
2555 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556{
Walter Dörwald69652032004-09-07 20:24:22 +00002557 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2558}
2559
Antoine Pitrouab868312009-01-10 15:40:25 +00002560/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2561#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2562
2563/* Mask to quickly check whether a C 'long' contains a
2564 non-ASCII, UTF8-encoded char. */
2565#if (SIZEOF_LONG == 8)
2566# define ASCII_CHAR_MASK 0x8080808080808080L
2567#elif (SIZEOF_LONG == 4)
2568# define ASCII_CHAR_MASK 0x80808080L
2569#else
2570# error C 'long' size should be either 4 or 8!
2571#endif
2572
Alexander Belopolsky40018472011-02-26 01:02:56 +00002573PyObject *
2574PyUnicode_DecodeUTF8Stateful(const char *s,
2575 Py_ssize_t size,
2576 const char *errors,
2577 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002578{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002579 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002581 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002582 Py_ssize_t startinpos;
2583 Py_ssize_t endinpos;
2584 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002585 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002586 PyUnicodeObject *unicode;
2587 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002588 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589 PyObject *errorHandler = NULL;
2590 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591
2592 /* Note: size will always be longer than the resulting Unicode
2593 character count */
2594 unicode = _PyUnicode_New(size);
2595 if (!unicode)
2596 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002597 if (size == 0) {
2598 if (consumed)
2599 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602
2603 /* Unpack UTF-8 encoded data */
2604 p = unicode->str;
2605 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002606 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607
2608 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002609 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002612 /* Fast path for runs of ASCII characters. Given that common UTF-8
2613 input will consist of an overwhelming majority of ASCII
2614 characters, we try to optimize for this case by checking
2615 as many characters as a C 'long' can contain.
2616 First, check if we can do an aligned read, as most CPUs have
2617 a penalty for unaligned reads.
2618 */
2619 if (!((size_t) s & LONG_PTR_MASK)) {
2620 /* Help register allocation */
2621 register const char *_s = s;
2622 register Py_UNICODE *_p = p;
2623 while (_s < aligned_end) {
2624 /* Read a whole long at a time (either 4 or 8 bytes),
2625 and do a fast unrolled copy if it only contains ASCII
2626 characters. */
2627 unsigned long data = *(unsigned long *) _s;
2628 if (data & ASCII_CHAR_MASK)
2629 break;
2630 _p[0] = (unsigned char) _s[0];
2631 _p[1] = (unsigned char) _s[1];
2632 _p[2] = (unsigned char) _s[2];
2633 _p[3] = (unsigned char) _s[3];
2634#if (SIZEOF_LONG == 8)
2635 _p[4] = (unsigned char) _s[4];
2636 _p[5] = (unsigned char) _s[5];
2637 _p[6] = (unsigned char) _s[6];
2638 _p[7] = (unsigned char) _s[7];
2639#endif
2640 _s += SIZEOF_LONG;
2641 _p += SIZEOF_LONG;
2642 }
2643 s = _s;
2644 p = _p;
2645 if (s == e)
2646 break;
2647 ch = (unsigned char)*s;
2648 }
2649 }
2650
2651 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002652 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 s++;
2654 continue;
2655 }
2656
2657 n = utf8_code_length[ch];
2658
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002659 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002660 if (consumed)
2661 break;
2662 else {
2663 errmsg = "unexpected end of data";
2664 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002665 endinpos = startinpos+1;
2666 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2667 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002668 goto utf8Error;
2669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671
2672 switch (n) {
2673
2674 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002675 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 startinpos = s-starts;
2677 endinpos = startinpos+1;
2678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679
2680 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002681 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 startinpos = s-starts;
2683 endinpos = startinpos+1;
2684 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685
2686 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002687 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002688 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002690 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002691 goto utf8Error;
2692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002694 assert ((ch > 0x007F) && (ch <= 0x07FF));
2695 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 break;
2697
2698 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002699 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2700 will result in surrogates in range d800-dfff. Surrogates are
2701 not valid UTF-8 so they are rejected.
2702 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2703 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002704 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002705 (s[2] & 0xc0) != 0x80 ||
2706 ((unsigned char)s[0] == 0xE0 &&
2707 (unsigned char)s[1] < 0xA0) ||
2708 ((unsigned char)s[0] == 0xED &&
2709 (unsigned char)s[1] > 0x9F)) {
2710 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002712 endinpos = startinpos + 1;
2713
2714 /* if s[1] first two bits are 1 and 0, then the invalid
2715 continuation byte is s[2], so increment endinpos by 1,
2716 if not, s[1] is invalid and endinpos doesn't need to
2717 be incremented. */
2718 if ((s[1] & 0xC0) == 0x80)
2719 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002720 goto utf8Error;
2721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002723 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2724 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002725 break;
2726
2727 case 4:
2728 if ((s[1] & 0xc0) != 0x80 ||
2729 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002730 (s[3] & 0xc0) != 0x80 ||
2731 ((unsigned char)s[0] == 0xF0 &&
2732 (unsigned char)s[1] < 0x90) ||
2733 ((unsigned char)s[0] == 0xF4 &&
2734 (unsigned char)s[1] > 0x8F)) {
2735 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002736 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002737 endinpos = startinpos + 1;
2738 if ((s[1] & 0xC0) == 0x80) {
2739 endinpos++;
2740 if ((s[2] & 0xC0) == 0x80)
2741 endinpos++;
2742 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002743 goto utf8Error;
2744 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002745 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002746 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2747 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2748
Fredrik Lundh8f455852001-06-27 18:59:43 +00002749#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002750 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002751#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002752 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002753
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002754 /* translate from 10000..10FFFF to 0..FFFF */
2755 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002756
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002757 /* high surrogate = top 10 bits added to D800 */
2758 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002759
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002760 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002761 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002762#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
2765 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002767
Benjamin Peterson29060642009-01-31 22:14:21 +00002768 utf8Error:
2769 outpos = p-PyUnicode_AS_UNICODE(unicode);
2770 if (unicode_decode_call_errorhandler(
2771 errors, &errorHandler,
2772 "utf8", errmsg,
2773 &starts, &e, &startinpos, &endinpos, &exc, &s,
2774 &unicode, &outpos, &p))
2775 goto onError;
2776 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 }
Walter Dörwald69652032004-09-07 20:24:22 +00002778 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002779 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780
2781 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002782 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 goto onError;
2784
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002785 Py_XDECREF(errorHandler);
2786 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 return (PyObject *)unicode;
2788
Benjamin Peterson29060642009-01-31 22:14:21 +00002789 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002790 Py_XDECREF(errorHandler);
2791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 Py_DECREF(unicode);
2793 return NULL;
2794}
2795
Antoine Pitrouab868312009-01-10 15:40:25 +00002796#undef ASCII_CHAR_MASK
2797
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002798#ifdef __APPLE__
2799
2800/* Simplified UTF-8 decoder using surrogateescape error handler,
2801 used to decode the command line arguments on Mac OS X. */
2802
2803wchar_t*
2804_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2805{
2806 int n;
2807 const char *e;
2808 wchar_t *unicode, *p;
2809
2810 /* Note: size will always be longer than the resulting Unicode
2811 character count */
2812 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2813 PyErr_NoMemory();
2814 return NULL;
2815 }
2816 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2817 if (!unicode)
2818 return NULL;
2819
2820 /* Unpack UTF-8 encoded data */
2821 p = unicode;
2822 e = s + size;
2823 while (s < e) {
2824 Py_UCS4 ch = (unsigned char)*s;
2825
2826 if (ch < 0x80) {
2827 *p++ = (wchar_t)ch;
2828 s++;
2829 continue;
2830 }
2831
2832 n = utf8_code_length[ch];
2833 if (s + n > e) {
2834 goto surrogateescape;
2835 }
2836
2837 switch (n) {
2838 case 0:
2839 case 1:
2840 goto surrogateescape;
2841
2842 case 2:
2843 if ((s[1] & 0xc0) != 0x80)
2844 goto surrogateescape;
2845 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2846 assert ((ch > 0x007F) && (ch <= 0x07FF));
2847 *p++ = (wchar_t)ch;
2848 break;
2849
2850 case 3:
2851 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2852 will result in surrogates in range d800-dfff. Surrogates are
2853 not valid UTF-8 so they are rejected.
2854 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2855 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2856 if ((s[1] & 0xc0) != 0x80 ||
2857 (s[2] & 0xc0) != 0x80 ||
2858 ((unsigned char)s[0] == 0xE0 &&
2859 (unsigned char)s[1] < 0xA0) ||
2860 ((unsigned char)s[0] == 0xED &&
2861 (unsigned char)s[1] > 0x9F)) {
2862
2863 goto surrogateescape;
2864 }
2865 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2866 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2867 *p++ = (Py_UNICODE)ch;
2868 break;
2869
2870 case 4:
2871 if ((s[1] & 0xc0) != 0x80 ||
2872 (s[2] & 0xc0) != 0x80 ||
2873 (s[3] & 0xc0) != 0x80 ||
2874 ((unsigned char)s[0] == 0xF0 &&
2875 (unsigned char)s[1] < 0x90) ||
2876 ((unsigned char)s[0] == 0xF4 &&
2877 (unsigned char)s[1] > 0x8F)) {
2878 goto surrogateescape;
2879 }
2880 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2881 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2882 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2883
2884#if SIZEOF_WCHAR_T == 4
2885 *p++ = (wchar_t)ch;
2886#else
2887 /* compute and append the two surrogates: */
2888
2889 /* translate from 10000..10FFFF to 0..FFFF */
2890 ch -= 0x10000;
2891
2892 /* high surrogate = top 10 bits added to D800 */
2893 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2894
2895 /* low surrogate = bottom 10 bits added to DC00 */
2896 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2897#endif
2898 break;
2899 }
2900 s += n;
2901 continue;
2902
2903 surrogateescape:
2904 *p++ = 0xDC00 + ch;
2905 s++;
2906 }
2907 *p = L'\0';
2908 return unicode;
2909}
2910
2911#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002912
Tim Peters602f7402002-04-27 18:03:26 +00002913/* Allocation strategy: if the string is short, convert into a stack buffer
2914 and allocate exactly as much space needed at the end. Else allocate the
2915 maximum possible needed (4 result bytes per Unicode character), and return
2916 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002917*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002918PyObject *
2919PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 Py_ssize_t size,
2921 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922{
Tim Peters602f7402002-04-27 18:03:26 +00002923#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002924
Guido van Rossum98297ee2007-11-06 21:34:58 +00002925 Py_ssize_t i; /* index into s of next input byte */
2926 PyObject *result; /* result string object */
2927 char *p; /* next free byte in output buffer */
2928 Py_ssize_t nallocated; /* number of result bytes allocated */
2929 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002930 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002931 PyObject *errorHandler = NULL;
2932 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002933
Tim Peters602f7402002-04-27 18:03:26 +00002934 assert(s != NULL);
2935 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936
Tim Peters602f7402002-04-27 18:03:26 +00002937 if (size <= MAX_SHORT_UNICHARS) {
2938 /* Write into the stack buffer; nallocated can't overflow.
2939 * At the end, we'll allocate exactly as much heap space as it
2940 * turns out we need.
2941 */
2942 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002943 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002944 p = stackbuf;
2945 }
2946 else {
2947 /* Overallocate on the heap, and give the excess back at the end. */
2948 nallocated = size * 4;
2949 if (nallocated / 4 != size) /* overflow! */
2950 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002951 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002952 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002953 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002954 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002955 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002956
Tim Peters602f7402002-04-27 18:03:26 +00002957 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002958 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002959
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002960 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002961 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002963
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002965 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002966 *p++ = (char)(0xc0 | (ch >> 6));
2967 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002968 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002969#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002970 /* Special case: check for high and low surrogate */
2971 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2972 Py_UCS4 ch2 = s[i];
2973 /* Combine the two surrogates to form a UCS4 value */
2974 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2975 i++;
2976
2977 /* Encode UCS4 Unicode ordinals */
2978 *p++ = (char)(0xf0 | (ch >> 18));
2979 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002980 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2981 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002982 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002983#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002984 Py_ssize_t newpos;
2985 PyObject *rep;
2986 Py_ssize_t repsize, k;
2987 rep = unicode_encode_call_errorhandler
2988 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2989 s, size, &exc, i-1, i, &newpos);
2990 if (!rep)
2991 goto error;
2992
2993 if (PyBytes_Check(rep))
2994 repsize = PyBytes_GET_SIZE(rep);
2995 else
2996 repsize = PyUnicode_GET_SIZE(rep);
2997
2998 if (repsize > 4) {
2999 Py_ssize_t offset;
3000
3001 if (result == NULL)
3002 offset = p - stackbuf;
3003 else
3004 offset = p - PyBytes_AS_STRING(result);
3005
3006 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3007 /* integer overflow */
3008 PyErr_NoMemory();
3009 goto error;
3010 }
3011 nallocated += repsize - 4;
3012 if (result != NULL) {
3013 if (_PyBytes_Resize(&result, nallocated) < 0)
3014 goto error;
3015 } else {
3016 result = PyBytes_FromStringAndSize(NULL, nallocated);
3017 if (result == NULL)
3018 goto error;
3019 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3020 }
3021 p = PyBytes_AS_STRING(result) + offset;
3022 }
3023
3024 if (PyBytes_Check(rep)) {
3025 char *prep = PyBytes_AS_STRING(rep);
3026 for(k = repsize; k > 0; k--)
3027 *p++ = *prep++;
3028 } else /* rep is unicode */ {
3029 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3030 Py_UNICODE c;
3031
3032 for(k=0; k<repsize; k++) {
3033 c = prep[k];
3034 if (0x80 <= c) {
3035 raise_encode_exception(&exc, "utf-8", s, size,
3036 i-1, i, "surrogates not allowed");
3037 goto error;
3038 }
3039 *p++ = (char)prep[k];
3040 }
3041 }
3042 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003043#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003044 }
Victor Stinner445a6232010-04-22 20:01:57 +00003045#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003046 } else if (ch < 0x10000) {
3047 *p++ = (char)(0xe0 | (ch >> 12));
3048 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3049 *p++ = (char)(0x80 | (ch & 0x3f));
3050 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003051 /* Encode UCS4 Unicode ordinals */
3052 *p++ = (char)(0xf0 | (ch >> 18));
3053 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3054 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3055 *p++ = (char)(0x80 | (ch & 0x3f));
3056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003058
Guido van Rossum98297ee2007-11-06 21:34:58 +00003059 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003060 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003061 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003062 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003063 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003064 }
3065 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003066 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003067 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003068 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003069 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003070 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003071 Py_XDECREF(errorHandler);
3072 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003073 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003074 error:
3075 Py_XDECREF(errorHandler);
3076 Py_XDECREF(exc);
3077 Py_XDECREF(result);
3078 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003079
Tim Peters602f7402002-04-27 18:03:26 +00003080#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081}
3082
Alexander Belopolsky40018472011-02-26 01:02:56 +00003083PyObject *
3084PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 if (!PyUnicode_Check(unicode)) {
3087 PyErr_BadArgument();
3088 return NULL;
3089 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003090 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 PyUnicode_GET_SIZE(unicode),
3092 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093}
3094
Walter Dörwald41980ca2007-08-16 21:55:45 +00003095/* --- UTF-32 Codec ------------------------------------------------------- */
3096
3097PyObject *
3098PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003099 Py_ssize_t size,
3100 const char *errors,
3101 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003102{
3103 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3104}
3105
3106PyObject *
3107PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003108 Py_ssize_t size,
3109 const char *errors,
3110 int *byteorder,
3111 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003112{
3113 const char *starts = s;
3114 Py_ssize_t startinpos;
3115 Py_ssize_t endinpos;
3116 Py_ssize_t outpos;
3117 PyUnicodeObject *unicode;
3118 Py_UNICODE *p;
3119#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003120 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003121 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003122#else
3123 const int pairs = 0;
3124#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003125 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003126 int bo = 0; /* assume native ordering by default */
3127 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003128 /* Offsets from q for retrieving bytes in the right order. */
3129#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3130 int iorder[] = {0, 1, 2, 3};
3131#else
3132 int iorder[] = {3, 2, 1, 0};
3133#endif
3134 PyObject *errorHandler = NULL;
3135 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003136
Walter Dörwald41980ca2007-08-16 21:55:45 +00003137 q = (unsigned char *)s;
3138 e = q + size;
3139
3140 if (byteorder)
3141 bo = *byteorder;
3142
3143 /* Check for BOM marks (U+FEFF) in the input and adjust current
3144 byte order setting accordingly. In native mode, the leading BOM
3145 mark is skipped, in all other modes, it is copied to the output
3146 stream as-is (giving a ZWNBSP character). */
3147 if (bo == 0) {
3148 if (size >= 4) {
3149 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003151#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003152 if (bom == 0x0000FEFF) {
3153 q += 4;
3154 bo = -1;
3155 }
3156 else if (bom == 0xFFFE0000) {
3157 q += 4;
3158 bo = 1;
3159 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003160#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 if (bom == 0x0000FEFF) {
3162 q += 4;
3163 bo = 1;
3164 }
3165 else if (bom == 0xFFFE0000) {
3166 q += 4;
3167 bo = -1;
3168 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003169#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003170 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003171 }
3172
3173 if (bo == -1) {
3174 /* force LE */
3175 iorder[0] = 0;
3176 iorder[1] = 1;
3177 iorder[2] = 2;
3178 iorder[3] = 3;
3179 }
3180 else if (bo == 1) {
3181 /* force BE */
3182 iorder[0] = 3;
3183 iorder[1] = 2;
3184 iorder[2] = 1;
3185 iorder[3] = 0;
3186 }
3187
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003188 /* On narrow builds we split characters outside the BMP into two
3189 codepoints => count how much extra space we need. */
3190#ifndef Py_UNICODE_WIDE
3191 for (qq = q; qq < e; qq += 4)
3192 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3193 pairs++;
3194#endif
3195
3196 /* This might be one to much, because of a BOM */
3197 unicode = _PyUnicode_New((size+3)/4+pairs);
3198 if (!unicode)
3199 return NULL;
3200 if (size == 0)
3201 return (PyObject *)unicode;
3202
3203 /* Unpack UTF-32 encoded data */
3204 p = unicode->str;
3205
Walter Dörwald41980ca2007-08-16 21:55:45 +00003206 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003207 Py_UCS4 ch;
3208 /* remaining bytes at the end? (size should be divisible by 4) */
3209 if (e-q<4) {
3210 if (consumed)
3211 break;
3212 errmsg = "truncated data";
3213 startinpos = ((const char *)q)-starts;
3214 endinpos = ((const char *)e)-starts;
3215 goto utf32Error;
3216 /* The remaining input chars are ignored if the callback
3217 chooses to skip the input */
3218 }
3219 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3220 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003221
Benjamin Peterson29060642009-01-31 22:14:21 +00003222 if (ch >= 0x110000)
3223 {
3224 errmsg = "codepoint not in range(0x110000)";
3225 startinpos = ((const char *)q)-starts;
3226 endinpos = startinpos+4;
3227 goto utf32Error;
3228 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003229#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 if (ch >= 0x10000)
3231 {
3232 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3233 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3234 }
3235 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003236#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003237 *p++ = ch;
3238 q += 4;
3239 continue;
3240 utf32Error:
3241 outpos = p-PyUnicode_AS_UNICODE(unicode);
3242 if (unicode_decode_call_errorhandler(
3243 errors, &errorHandler,
3244 "utf32", errmsg,
3245 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3246 &unicode, &outpos, &p))
3247 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003248 }
3249
3250 if (byteorder)
3251 *byteorder = bo;
3252
3253 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003254 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003255
3256 /* Adjust length */
3257 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3258 goto onError;
3259
3260 Py_XDECREF(errorHandler);
3261 Py_XDECREF(exc);
3262 return (PyObject *)unicode;
3263
Benjamin Peterson29060642009-01-31 22:14:21 +00003264 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003265 Py_DECREF(unicode);
3266 Py_XDECREF(errorHandler);
3267 Py_XDECREF(exc);
3268 return NULL;
3269}
3270
3271PyObject *
3272PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 Py_ssize_t size,
3274 const char *errors,
3275 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003276{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003277 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003278 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003279 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003280#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003281 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003282#else
3283 const int pairs = 0;
3284#endif
3285 /* Offsets from p for storing byte pairs in the right order. */
3286#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3287 int iorder[] = {0, 1, 2, 3};
3288#else
3289 int iorder[] = {3, 2, 1, 0};
3290#endif
3291
Benjamin Peterson29060642009-01-31 22:14:21 +00003292#define STORECHAR(CH) \
3293 do { \
3294 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3295 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3296 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3297 p[iorder[0]] = (CH) & 0xff; \
3298 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003299 } while(0)
3300
3301 /* In narrow builds we can output surrogate pairs as one codepoint,
3302 so we need less space. */
3303#ifndef Py_UNICODE_WIDE
3304 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003305 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3306 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3307 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003308#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003309 nsize = (size - pairs + (byteorder == 0));
3310 bytesize = nsize * 4;
3311 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003313 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003314 if (v == NULL)
3315 return NULL;
3316
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003317 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003318 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003319 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003321 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003322
3323 if (byteorder == -1) {
3324 /* force LE */
3325 iorder[0] = 0;
3326 iorder[1] = 1;
3327 iorder[2] = 2;
3328 iorder[3] = 3;
3329 }
3330 else if (byteorder == 1) {
3331 /* force BE */
3332 iorder[0] = 3;
3333 iorder[1] = 2;
3334 iorder[2] = 1;
3335 iorder[3] = 0;
3336 }
3337
3338 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003339 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003340#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3342 Py_UCS4 ch2 = *s;
3343 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3344 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3345 s++;
3346 size--;
3347 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003348 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003349#endif
3350 STORECHAR(ch);
3351 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003352
3353 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003354 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003355#undef STORECHAR
3356}
3357
Alexander Belopolsky40018472011-02-26 01:02:56 +00003358PyObject *
3359PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003360{
3361 if (!PyUnicode_Check(unicode)) {
3362 PyErr_BadArgument();
3363 return NULL;
3364 }
3365 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003366 PyUnicode_GET_SIZE(unicode),
3367 NULL,
3368 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003369}
3370
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371/* --- UTF-16 Codec ------------------------------------------------------- */
3372
Tim Peters772747b2001-08-09 22:21:55 +00003373PyObject *
3374PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003375 Py_ssize_t size,
3376 const char *errors,
3377 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378{
Walter Dörwald69652032004-09-07 20:24:22 +00003379 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3380}
3381
Antoine Pitrouab868312009-01-10 15:40:25 +00003382/* Two masks for fast checking of whether a C 'long' may contain
3383 UTF16-encoded surrogate characters. This is an efficient heuristic,
3384 assuming that non-surrogate characters with a code point >= 0x8000 are
3385 rare in most input.
3386 FAST_CHAR_MASK is used when the input is in native byte ordering,
3387 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003388*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003389#if (SIZEOF_LONG == 8)
3390# define FAST_CHAR_MASK 0x8000800080008000L
3391# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3392#elif (SIZEOF_LONG == 4)
3393# define FAST_CHAR_MASK 0x80008000L
3394# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3395#else
3396# error C 'long' size should be either 4 or 8!
3397#endif
3398
Walter Dörwald69652032004-09-07 20:24:22 +00003399PyObject *
3400PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003401 Py_ssize_t size,
3402 const char *errors,
3403 int *byteorder,
3404 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003405{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003406 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003407 Py_ssize_t startinpos;
3408 Py_ssize_t endinpos;
3409 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 PyUnicodeObject *unicode;
3411 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003412 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003413 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003414 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003415 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003416 /* Offsets from q for retrieving byte pairs in the right order. */
3417#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3418 int ihi = 1, ilo = 0;
3419#else
3420 int ihi = 0, ilo = 1;
3421#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 PyObject *errorHandler = NULL;
3423 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424
3425 /* Note: size will always be longer than the resulting Unicode
3426 character count */
3427 unicode = _PyUnicode_New(size);
3428 if (!unicode)
3429 return NULL;
3430 if (size == 0)
3431 return (PyObject *)unicode;
3432
3433 /* Unpack UTF-16 encoded data */
3434 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003435 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003436 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437
3438 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003439 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003441 /* Check for BOM marks (U+FEFF) in the input and adjust current
3442 byte order setting accordingly. In native mode, the leading BOM
3443 mark is skipped, in all other modes, it is copied to the output
3444 stream as-is (giving a ZWNBSP character). */
3445 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003446 if (size >= 2) {
3447 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003448#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003449 if (bom == 0xFEFF) {
3450 q += 2;
3451 bo = -1;
3452 }
3453 else if (bom == 0xFFFE) {
3454 q += 2;
3455 bo = 1;
3456 }
Tim Petersced69f82003-09-16 20:30:58 +00003457#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003458 if (bom == 0xFEFF) {
3459 q += 2;
3460 bo = 1;
3461 }
3462 else if (bom == 0xFFFE) {
3463 q += 2;
3464 bo = -1;
3465 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003466#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469
Tim Peters772747b2001-08-09 22:21:55 +00003470 if (bo == -1) {
3471 /* force LE */
3472 ihi = 1;
3473 ilo = 0;
3474 }
3475 else if (bo == 1) {
3476 /* force BE */
3477 ihi = 0;
3478 ilo = 1;
3479 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003480#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3481 native_ordering = ilo < ihi;
3482#else
3483 native_ordering = ilo > ihi;
3484#endif
Tim Peters772747b2001-08-09 22:21:55 +00003485
Antoine Pitrouab868312009-01-10 15:40:25 +00003486 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003487 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003488 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003489 /* First check for possible aligned read of a C 'long'. Unaligned
3490 reads are more expensive, better to defer to another iteration. */
3491 if (!((size_t) q & LONG_PTR_MASK)) {
3492 /* Fast path for runs of non-surrogate chars. */
3493 register const unsigned char *_q = q;
3494 Py_UNICODE *_p = p;
3495 if (native_ordering) {
3496 /* Native ordering is simple: as long as the input cannot
3497 possibly contain a surrogate char, do an unrolled copy
3498 of several 16-bit code points to the target object.
3499 The non-surrogate check is done on several input bytes
3500 at a time (as many as a C 'long' can contain). */
3501 while (_q < aligned_end) {
3502 unsigned long data = * (unsigned long *) _q;
3503 if (data & FAST_CHAR_MASK)
3504 break;
3505 _p[0] = ((unsigned short *) _q)[0];
3506 _p[1] = ((unsigned short *) _q)[1];
3507#if (SIZEOF_LONG == 8)
3508 _p[2] = ((unsigned short *) _q)[2];
3509 _p[3] = ((unsigned short *) _q)[3];
3510#endif
3511 _q += SIZEOF_LONG;
3512 _p += SIZEOF_LONG / 2;
3513 }
3514 }
3515 else {
3516 /* Byteswapped ordering is similar, but we must decompose
3517 the copy bytewise, and take care of zero'ing out the
3518 upper bytes if the target object is in 32-bit units
3519 (that is, in UCS-4 builds). */
3520 while (_q < aligned_end) {
3521 unsigned long data = * (unsigned long *) _q;
3522 if (data & SWAPPED_FAST_CHAR_MASK)
3523 break;
3524 /* Zero upper bytes in UCS-4 builds */
3525#if (Py_UNICODE_SIZE > 2)
3526 _p[0] = 0;
3527 _p[1] = 0;
3528#if (SIZEOF_LONG == 8)
3529 _p[2] = 0;
3530 _p[3] = 0;
3531#endif
3532#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003533 /* Issue #4916; UCS-4 builds on big endian machines must
3534 fill the two last bytes of each 4-byte unit. */
3535#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3536# define OFF 2
3537#else
3538# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003539#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003540 ((unsigned char *) _p)[OFF + 1] = _q[0];
3541 ((unsigned char *) _p)[OFF + 0] = _q[1];
3542 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3543 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3544#if (SIZEOF_LONG == 8)
3545 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3546 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3547 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3548 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3549#endif
3550#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003551 _q += SIZEOF_LONG;
3552 _p += SIZEOF_LONG / 2;
3553 }
3554 }
3555 p = _p;
3556 q = _q;
3557 if (q >= e)
3558 break;
3559 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561
Benjamin Peterson14339b62009-01-31 16:36:08 +00003562 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003563
3564 if (ch < 0xD800 || ch > 0xDFFF) {
3565 *p++ = ch;
3566 continue;
3567 }
3568
3569 /* UTF-16 code pair: */
3570 if (q > e) {
3571 errmsg = "unexpected end of data";
3572 startinpos = (((const char *)q) - 2) - starts;
3573 endinpos = ((const char *)e) + 1 - starts;
3574 goto utf16Error;
3575 }
3576 if (0xD800 <= ch && ch <= 0xDBFF) {
3577 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3578 q += 2;
3579 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003580#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003581 *p++ = ch;
3582 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003583#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003585#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003586 continue;
3587 }
3588 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003589 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 startinpos = (((const char *)q)-4)-starts;
3591 endinpos = startinpos+2;
3592 goto utf16Error;
3593 }
3594
Benjamin Peterson14339b62009-01-31 16:36:08 +00003595 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 errmsg = "illegal encoding";
3597 startinpos = (((const char *)q)-2)-starts;
3598 endinpos = startinpos+2;
3599 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003600
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 utf16Error:
3602 outpos = p - PyUnicode_AS_UNICODE(unicode);
3603 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003604 errors,
3605 &errorHandler,
3606 "utf16", errmsg,
3607 &starts,
3608 (const char **)&e,
3609 &startinpos,
3610 &endinpos,
3611 &exc,
3612 (const char **)&q,
3613 &unicode,
3614 &outpos,
3615 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003618 /* remaining byte at the end? (size should be even) */
3619 if (e == q) {
3620 if (!consumed) {
3621 errmsg = "truncated data";
3622 startinpos = ((const char *)q) - starts;
3623 endinpos = ((const char *)e) + 1 - starts;
3624 outpos = p - PyUnicode_AS_UNICODE(unicode);
3625 if (unicode_decode_call_errorhandler(
3626 errors,
3627 &errorHandler,
3628 "utf16", errmsg,
3629 &starts,
3630 (const char **)&e,
3631 &startinpos,
3632 &endinpos,
3633 &exc,
3634 (const char **)&q,
3635 &unicode,
3636 &outpos,
3637 &p))
3638 goto onError;
3639 /* The remaining input chars are ignored if the callback
3640 chooses to skip the input */
3641 }
3642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643
3644 if (byteorder)
3645 *byteorder = bo;
3646
Walter Dörwald69652032004-09-07 20:24:22 +00003647 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003649
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003651 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 goto onError;
3653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 Py_XDECREF(errorHandler);
3655 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 return (PyObject *)unicode;
3657
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_XDECREF(errorHandler);
3661 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 return NULL;
3663}
3664
Antoine Pitrouab868312009-01-10 15:40:25 +00003665#undef FAST_CHAR_MASK
3666#undef SWAPPED_FAST_CHAR_MASK
3667
Tim Peters772747b2001-08-09 22:21:55 +00003668PyObject *
3669PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 Py_ssize_t size,
3671 const char *errors,
3672 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003674 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003675 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003676 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003677#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003678 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003679#else
3680 const int pairs = 0;
3681#endif
Tim Peters772747b2001-08-09 22:21:55 +00003682 /* Offsets from p for storing byte pairs in the right order. */
3683#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3684 int ihi = 1, ilo = 0;
3685#else
3686 int ihi = 0, ilo = 1;
3687#endif
3688
Benjamin Peterson29060642009-01-31 22:14:21 +00003689#define STORECHAR(CH) \
3690 do { \
3691 p[ihi] = ((CH) >> 8) & 0xff; \
3692 p[ilo] = (CH) & 0xff; \
3693 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003694 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003696#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003697 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 if (s[i] >= 0x10000)
3699 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003700#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003701 /* 2 * (size + pairs + (byteorder == 0)) */
3702 if (size > PY_SSIZE_T_MAX ||
3703 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003705 nsize = size + pairs + (byteorder == 0);
3706 bytesize = nsize * 2;
3707 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003708 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003709 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 if (v == NULL)
3711 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003713 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003715 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003716 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003717 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003718
3719 if (byteorder == -1) {
3720 /* force LE */
3721 ihi = 1;
3722 ilo = 0;
3723 }
3724 else if (byteorder == 1) {
3725 /* force BE */
3726 ihi = 0;
3727 ilo = 1;
3728 }
3729
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003730 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 Py_UNICODE ch = *s++;
3732 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003733#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003734 if (ch >= 0x10000) {
3735 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3736 ch = 0xD800 | ((ch-0x10000) >> 10);
3737 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003738#endif
Tim Peters772747b2001-08-09 22:21:55 +00003739 STORECHAR(ch);
3740 if (ch2)
3741 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003742 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003743
3744 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003745 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003746#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747}
3748
Alexander Belopolsky40018472011-02-26 01:02:56 +00003749PyObject *
3750PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751{
3752 if (!PyUnicode_Check(unicode)) {
3753 PyErr_BadArgument();
3754 return NULL;
3755 }
3756 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 PyUnicode_GET_SIZE(unicode),
3758 NULL,
3759 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760}
3761
3762/* --- Unicode Escape Codec ----------------------------------------------- */
3763
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in lazily the first time a \N{...}
   escape has to be resolved -- confirm against the code that assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003764static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003765
Alexander Belopolsky40018472011-02-26 01:02:56 +00003766PyObject *
3767PyUnicode_DecodeUnicodeEscape(const char *s,
3768 Py_ssize_t size,
3769 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003772 Py_ssize_t startinpos;
3773 Py_ssize_t endinpos;
3774 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003779 char* message;
3780 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 PyObject *errorHandler = NULL;
3782 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003783
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 /* Escaped strings will always be longer than the resulting
3785 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 length after conversion to the true value.
3787 (but if the error callback returns a long replacement string
3788 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 v = _PyUnicode_New(size);
3790 if (v == NULL)
3791 goto onError;
3792 if (size == 0)
3793 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003794
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003797
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 while (s < end) {
3799 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003800 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802
3803 /* Non-escape characters are interpreted as Unicode ordinals */
3804 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003805 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 continue;
3807 }
3808
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 /* \ - Escapes */
3811 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003812 c = *s++;
3813 if (s > end)
3814 c = '\0'; /* Invalid after \ */
3815 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 case '\n': break;
3819 case '\\': *p++ = '\\'; break;
3820 case '\'': *p++ = '\''; break;
3821 case '\"': *p++ = '\"'; break;
3822 case 'b': *p++ = '\b'; break;
3823 case 'f': *p++ = '\014'; break; /* FF */
3824 case 't': *p++ = '\t'; break;
3825 case 'n': *p++ = '\n'; break;
3826 case 'r': *p++ = '\r'; break;
3827 case 'v': *p++ = '\013'; break; /* VT */
3828 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3829
Benjamin Peterson29060642009-01-31 22:14:21 +00003830 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 case '0': case '1': case '2': case '3':
3832 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003833 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003834 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003835 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003836 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003837 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003839 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 break;
3841
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 /* hex escapes */
3843 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003845 digits = 2;
3846 message = "truncated \\xXX escape";
3847 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003848
Benjamin Peterson29060642009-01-31 22:14:21 +00003849 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003851 digits = 4;
3852 message = "truncated \\uXXXX escape";
3853 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003856 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003857 digits = 8;
3858 message = "truncated \\UXXXXXXXX escape";
3859 hexescape:
3860 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 outpos = p-PyUnicode_AS_UNICODE(v);
3862 if (s+digits>end) {
3863 endinpos = size;
3864 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 errors, &errorHandler,
3866 "unicodeescape", "end of string in escape sequence",
3867 &starts, &end, &startinpos, &endinpos, &exc, &s,
3868 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003869 goto onError;
3870 goto nextByte;
3871 }
3872 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003873 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003874 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 endinpos = (s+i+1)-starts;
3876 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003877 errors, &errorHandler,
3878 "unicodeescape", message,
3879 &starts, &end, &startinpos, &endinpos, &exc, &s,
3880 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003881 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003883 }
3884 chr = (chr<<4) & ~0xF;
3885 if (c >= '0' && c <= '9')
3886 chr += c - '0';
3887 else if (c >= 'a' && c <= 'f')
3888 chr += 10 + c - 'a';
3889 else
3890 chr += 10 + c - 'A';
3891 }
3892 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003893 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003894 /* _decoding_error will have already written into the
3895 target buffer. */
3896 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003897 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003898 /* when we get here, chr is a 32-bit unicode character */
3899 if (chr <= 0xffff)
3900 /* UCS-2 character */
3901 *p++ = (Py_UNICODE) chr;
3902 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003903 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003904 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003905#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003906 *p++ = chr;
3907#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003908 chr -= 0x10000L;
3909 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003910 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003911#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003912 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 endinpos = s-starts;
3914 outpos = p-PyUnicode_AS_UNICODE(v);
3915 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003916 errors, &errorHandler,
3917 "unicodeescape", "illegal Unicode character",
3918 &starts, &end, &startinpos, &endinpos, &exc, &s,
3919 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003920 goto onError;
3921 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003922 break;
3923
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003925 case 'N':
3926 message = "malformed \\N character escape";
3927 if (ucnhash_CAPI == NULL) {
3928 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003929 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003930 if (ucnhash_CAPI == NULL)
3931 goto ucnhashError;
3932 }
3933 if (*s == '{') {
3934 const char *start = s+1;
3935 /* look for the closing brace */
3936 while (*s != '}' && s < end)
3937 s++;
3938 if (s > start && s < end && *s == '}') {
3939 /* found a name. look it up in the unicode database */
3940 message = "unknown Unicode character name";
3941 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003942 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003943 goto store;
3944 }
3945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003946 endinpos = s-starts;
3947 outpos = p-PyUnicode_AS_UNICODE(v);
3948 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003949 errors, &errorHandler,
3950 "unicodeescape", message,
3951 &starts, &end, &startinpos, &endinpos, &exc, &s,
3952 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003953 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003954 break;
3955
3956 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003957 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 message = "\\ at end of string";
3959 s--;
3960 endinpos = s-starts;
3961 outpos = p-PyUnicode_AS_UNICODE(v);
3962 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 errors, &errorHandler,
3964 "unicodeescape", message,
3965 &starts, &end, &startinpos, &endinpos, &exc, &s,
3966 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003967 goto onError;
3968 }
3969 else {
3970 *p++ = '\\';
3971 *p++ = (unsigned char)s[-1];
3972 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003973 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003978 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003980 Py_XDECREF(errorHandler);
3981 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003983
Benjamin Peterson29060642009-01-31 22:14:21 +00003984 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003985 PyErr_SetString(
3986 PyExc_UnicodeError,
3987 "\\N escapes not supported (can't load unicodedata module)"
3988 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003989 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 Py_XDECREF(errorHandler);
3991 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003992 return NULL;
3993
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996 Py_XDECREF(errorHandler);
3997 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 return NULL;
3999}
4000
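/* Return a Unicode-Escape string version of the Unicode object.

   NOTE(review): this comment used to continue with "If quotes is true,
   the string is enclosed in u"" or u'' quotes as appropriate", but
   PyUnicode_EncodeUnicodeEscape() below takes no `quotes` argument and
   writes no surrounding quotes, so that remark appears to be stale.

*/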
4007
Thomas Wouters477c8d52006-05-27 19:21:47 +00004008Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004009 Py_ssize_t size,
4010 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004011{
4012 /* like wcschr, but doesn't stop at NULL characters */
4013
4014 while (size-- > 0) {
4015 if (*s == ch)
4016 return s;
4017 s++;
4018 }
4019
4020 return NULL;
4021}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004022
/* Lowercase hexadecimal digits, indexed by nibble value (0-15).  The
   escape encoders below index it with `(ch >> n) & 0xf` to emit the
   digits of \xhh, \uxxxx and \Uxxxxxxxx escapes. */
static const char *hexdigits = "0123456789abcdef";
4024
Alexander Belopolsky40018472011-02-26 01:02:56 +00004025PyObject *
4026PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4027 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004029 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004032#ifdef Py_UNICODE_WIDE
4033 const Py_ssize_t expandsize = 10;
4034#else
4035 const Py_ssize_t expandsize = 6;
4036#endif
4037
Thomas Wouters89f507f2006-12-13 04:49:30 +00004038 /* XXX(nnorwitz): rather than over-allocating, it would be
4039 better to choose a different scheme. Perhaps scan the
4040 first N-chars of the string and allocate based on that size.
4041 */
4042 /* Initial allocation is based on the longest-possible unichr
4043 escape.
4044
4045 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4046 unichr, so in this case it's the longest unichr escape. In
4047 narrow (UTF-16) builds this is five chars per source unichr
4048 since there are two unichrs in the surrogate pair, so in narrow
4049 (UTF-16) builds it's not the longest unichr escape.
4050
4051 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4052 so in the narrow (UTF-16) build case it's the longest unichr
4053 escape.
4054 */
4055
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004056 if (size == 0)
4057 return PyBytes_FromStringAndSize(NULL, 0);
4058
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004059 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004061
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004062 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 2
4064 + expandsize*size
4065 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 if (repr == NULL)
4067 return NULL;
4068
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004069 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 while (size-- > 0) {
4072 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004073
Walter Dörwald79e913e2007-05-12 11:08:06 +00004074 /* Escape backslashes */
4075 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 *p++ = '\\';
4077 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004078 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004079 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004080
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004081#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004082 /* Map 21-bit characters to '\U00xxxxxx' */
4083 else if (ch >= 0x10000) {
4084 *p++ = '\\';
4085 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004086 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4087 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4088 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4089 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4090 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4091 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4092 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4093 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004094 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004095 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004096#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4098 else if (ch >= 0xD800 && ch < 0xDC00) {
4099 Py_UNICODE ch2;
4100 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004101
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 ch2 = *s++;
4103 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004104 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4106 *p++ = '\\';
4107 *p++ = 'U';
4108 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4109 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4110 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4111 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4112 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4113 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4114 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4115 *p++ = hexdigits[ucs & 0x0000000F];
4116 continue;
4117 }
4118 /* Fall through: isolated surrogates are copied as-is */
4119 s--;
4120 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004121 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004122#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004123
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004125 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 *p++ = '\\';
4127 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004128 *p++ = hexdigits[(ch >> 12) & 0x000F];
4129 *p++ = hexdigits[(ch >> 8) & 0x000F];
4130 *p++ = hexdigits[(ch >> 4) & 0x000F];
4131 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004133
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004134 /* Map special whitespace to '\t', \n', '\r' */
4135 else if (ch == '\t') {
4136 *p++ = '\\';
4137 *p++ = 't';
4138 }
4139 else if (ch == '\n') {
4140 *p++ = '\\';
4141 *p++ = 'n';
4142 }
4143 else if (ch == '\r') {
4144 *p++ = '\\';
4145 *p++ = 'r';
4146 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004147
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004148 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004149 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004151 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004152 *p++ = hexdigits[(ch >> 4) & 0x000F];
4153 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004154 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004155
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 /* Copy everything else as-is */
4157 else
4158 *p++ = (char) ch;
4159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004161 assert(p - PyBytes_AS_STRING(repr) > 0);
4162 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4163 return NULL;
4164 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165}
4166
Alexander Belopolsky40018472011-02-26 01:02:56 +00004167PyObject *
4168PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004170 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 if (!PyUnicode_Check(unicode)) {
4172 PyErr_BadArgument();
4173 return NULL;
4174 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004175 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4176 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004177 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178}
4179
4180/* --- Raw Unicode Escape Codec ------------------------------------------- */
4181
Alexander Belopolsky40018472011-02-26 01:02:56 +00004182PyObject *
4183PyUnicode_DecodeRawUnicodeEscape(const char *s,
4184 Py_ssize_t size,
4185 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004188 Py_ssize_t startinpos;
4189 Py_ssize_t endinpos;
4190 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 const char *end;
4194 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 PyObject *errorHandler = NULL;
4196 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004197
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 /* Escaped strings will always be longer than the resulting
4199 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 length after conversion to the true value. (But decoding error
4201 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 v = _PyUnicode_New(size);
4203 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 end = s + size;
4209 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 unsigned char c;
4211 Py_UCS4 x;
4212 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004213 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214
Benjamin Peterson29060642009-01-31 22:14:21 +00004215 /* Non-escape characters are interpreted as Unicode ordinals */
4216 if (*s != '\\') {
4217 *p++ = (unsigned char)*s++;
4218 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004219 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 startinpos = s-starts;
4221
4222 /* \u-escapes are only interpreted iff the number of leading
4223 backslashes if odd */
4224 bs = s;
4225 for (;s < end;) {
4226 if (*s != '\\')
4227 break;
4228 *p++ = (unsigned char)*s++;
4229 }
4230 if (((s - bs) & 1) == 0 ||
4231 s >= end ||
4232 (*s != 'u' && *s != 'U')) {
4233 continue;
4234 }
4235 p--;
4236 count = *s=='u' ? 4 : 8;
4237 s++;
4238
4239 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4240 outpos = p-PyUnicode_AS_UNICODE(v);
4241 for (x = 0, i = 0; i < count; ++i, ++s) {
4242 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004243 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 endinpos = s-starts;
4245 if (unicode_decode_call_errorhandler(
4246 errors, &errorHandler,
4247 "rawunicodeescape", "truncated \\uXXXX",
4248 &starts, &end, &startinpos, &endinpos, &exc, &s,
4249 &v, &outpos, &p))
4250 goto onError;
4251 goto nextByte;
4252 }
4253 x = (x<<4) & ~0xF;
4254 if (c >= '0' && c <= '9')
4255 x += c - '0';
4256 else if (c >= 'a' && c <= 'f')
4257 x += 10 + c - 'a';
4258 else
4259 x += 10 + c - 'A';
4260 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004261 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 /* UCS-2 character */
4263 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004264 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 /* UCS-4 character. Either store directly, or as
4266 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004267#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004269#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004270 x -= 0x10000L;
4271 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4272 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004273#endif
4274 } else {
4275 endinpos = s-starts;
4276 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004277 if (unicode_decode_call_errorhandler(
4278 errors, &errorHandler,
4279 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 &starts, &end, &startinpos, &endinpos, &exc, &s,
4281 &v, &outpos, &p))
4282 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 nextByte:
4285 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004287 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 Py_XDECREF(errorHandler);
4290 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004292
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 return NULL;
4298}
4299
Alexander Belopolsky40018472011-02-26 01:02:56 +00004300PyObject *
4301PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4302 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004304 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 char *p;
4306 char *q;
4307
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004308#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004309 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004310#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004311 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004312#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004313
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004314 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004316
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004317 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 if (repr == NULL)
4319 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004320 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004321 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004323 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 while (size-- > 0) {
4325 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004326#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 /* Map 32-bit characters to '\Uxxxxxxxx' */
4328 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004329 *p++ = '\\';
4330 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004331 *p++ = hexdigits[(ch >> 28) & 0xf];
4332 *p++ = hexdigits[(ch >> 24) & 0xf];
4333 *p++ = hexdigits[(ch >> 20) & 0xf];
4334 *p++ = hexdigits[(ch >> 16) & 0xf];
4335 *p++ = hexdigits[(ch >> 12) & 0xf];
4336 *p++ = hexdigits[(ch >> 8) & 0xf];
4337 *p++ = hexdigits[(ch >> 4) & 0xf];
4338 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004339 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004340 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004341#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4343 if (ch >= 0xD800 && ch < 0xDC00) {
4344 Py_UNICODE ch2;
4345 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004346
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 ch2 = *s++;
4348 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004349 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004350 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4351 *p++ = '\\';
4352 *p++ = 'U';
4353 *p++ = hexdigits[(ucs >> 28) & 0xf];
4354 *p++ = hexdigits[(ucs >> 24) & 0xf];
4355 *p++ = hexdigits[(ucs >> 20) & 0xf];
4356 *p++ = hexdigits[(ucs >> 16) & 0xf];
4357 *p++ = hexdigits[(ucs >> 12) & 0xf];
4358 *p++ = hexdigits[(ucs >> 8) & 0xf];
4359 *p++ = hexdigits[(ucs >> 4) & 0xf];
4360 *p++ = hexdigits[ucs & 0xf];
4361 continue;
4362 }
4363 /* Fall through: isolated surrogates are copied as-is */
4364 s--;
4365 size++;
4366 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004367#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 /* Map 16-bit characters to '\uxxxx' */
4369 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 *p++ = '\\';
4371 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004372 *p++ = hexdigits[(ch >> 12) & 0xf];
4373 *p++ = hexdigits[(ch >> 8) & 0xf];
4374 *p++ = hexdigits[(ch >> 4) & 0xf];
4375 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 /* Copy everything else as-is */
4378 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 *p++ = (char) ch;
4380 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004381 size = p - q;
4382
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004383 assert(size > 0);
4384 if (_PyBytes_Resize(&repr, size) < 0)
4385 return NULL;
4386 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387}
4388
Alexander Belopolsky40018472011-02-26 01:02:56 +00004389PyObject *
4390PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004392 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004394 PyErr_BadArgument();
4395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004397 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4398 PyUnicode_GET_SIZE(unicode));
4399
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004400 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401}
4402
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004403/* --- Unicode Internal Codec ------------------------------------------- */
4404
Alexander Belopolsky40018472011-02-26 01:02:56 +00004405PyObject *
4406_PyUnicode_DecodeUnicodeInternal(const char *s,
4407 Py_ssize_t size,
4408 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004409{
4410 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004411 Py_ssize_t startinpos;
4412 Py_ssize_t endinpos;
4413 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004414 PyUnicodeObject *v;
4415 Py_UNICODE *p;
4416 const char *end;
4417 const char *reason;
4418 PyObject *errorHandler = NULL;
4419 PyObject *exc = NULL;
4420
Neal Norwitzd43069c2006-01-08 01:12:10 +00004421#ifdef Py_UNICODE_WIDE
4422 Py_UNICODE unimax = PyUnicode_GetMax();
4423#endif
4424
Thomas Wouters89f507f2006-12-13 04:49:30 +00004425 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004426 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4427 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004429 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004431 p = PyUnicode_AS_UNICODE(v);
4432 end = s + size;
4433
4434 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004435 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004436 /* We have to sanity check the raw data, otherwise doom looms for
4437 some malformed UCS-4 data. */
4438 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004439#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004440 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004441#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004442 end-s < Py_UNICODE_SIZE
4443 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004445 startinpos = s - starts;
4446 if (end-s < Py_UNICODE_SIZE) {
4447 endinpos = end-starts;
4448 reason = "truncated input";
4449 }
4450 else {
4451 endinpos = s - starts + Py_UNICODE_SIZE;
4452 reason = "illegal code point (> 0x10FFFF)";
4453 }
4454 outpos = p - PyUnicode_AS_UNICODE(v);
4455 if (unicode_decode_call_errorhandler(
4456 errors, &errorHandler,
4457 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004458 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004459 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004460 goto onError;
4461 }
4462 }
4463 else {
4464 p++;
4465 s += Py_UNICODE_SIZE;
4466 }
4467 }
4468
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004469 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004470 goto onError;
4471 Py_XDECREF(errorHandler);
4472 Py_XDECREF(exc);
4473 return (PyObject *)v;
4474
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004476 Py_XDECREF(v);
4477 Py_XDECREF(errorHandler);
4478 Py_XDECREF(exc);
4479 return NULL;
4480}
4481
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482/* --- Latin-1 Codec ------------------------------------------------------ */
4483
Alexander Belopolsky40018472011-02-26 01:02:56 +00004484PyObject *
4485PyUnicode_DecodeLatin1(const char *s,
4486 Py_ssize_t size,
4487 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488{
4489 PyUnicodeObject *v;
4490 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004491 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004492
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004494 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 Py_UNICODE r = *(unsigned char*)s;
4496 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004497 }
4498
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 v = _PyUnicode_New(size);
4500 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004505 e = s + size;
4506 /* Unrolling the copy makes it much faster by reducing the looping
4507 overhead. This is similar to what many memcpy() implementations do. */
4508 unrolled_end = e - 4;
4509 while (s < unrolled_end) {
4510 p[0] = (unsigned char) s[0];
4511 p[1] = (unsigned char) s[1];
4512 p[2] = (unsigned char) s[2];
4513 p[3] = (unsigned char) s[3];
4514 s += 4;
4515 p += 4;
4516 }
4517 while (s < e)
4518 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004520
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 Py_XDECREF(v);
4523 return NULL;
4524}
4525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004527static void
4528make_encode_exception(PyObject **exceptionObject,
4529 const char *encoding,
4530 const Py_UNICODE *unicode, Py_ssize_t size,
4531 Py_ssize_t startpos, Py_ssize_t endpos,
4532 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 *exceptionObject = PyUnicodeEncodeError_Create(
4536 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 }
4538 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4540 goto onError;
4541 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4542 goto onError;
4543 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4544 goto onError;
4545 return;
4546 onError:
4547 Py_DECREF(*exceptionObject);
4548 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549 }
4550}
4551
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004553static void
4554raise_encode_exception(PyObject **exceptionObject,
4555 const char *encoding,
4556 const Py_UNICODE *unicode, Py_ssize_t size,
4557 Py_ssize_t startpos, Py_ssize_t endpos,
4558 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559{
4560 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564}
4565
4566/* error handling callback helper:
4567 build arguments, call the callback and check the arguments,
4568 put the result into newpos and return the replacement string, which
4569 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004570static PyObject *
4571unicode_encode_call_errorhandler(const char *errors,
4572 PyObject **errorHandler,
4573 const char *encoding, const char *reason,
4574 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4575 Py_ssize_t startpos, Py_ssize_t endpos,
4576 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004578 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579
4580 PyObject *restuple;
4581 PyObject *resunicode;
4582
4583 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 }
4588
4589 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593
4594 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004599 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004600 Py_DECREF(restuple);
4601 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004603 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 &resunicode, newpos)) {
4605 Py_DECREF(restuple);
4606 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004608 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4609 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4610 Py_DECREF(restuple);
4611 return NULL;
4612 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004615 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4617 Py_DECREF(restuple);
4618 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004619 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 Py_INCREF(resunicode);
4621 Py_DECREF(restuple);
4622 return resunicode;
4623}
4624
Alexander Belopolsky40018472011-02-26 01:02:56 +00004625static PyObject *
4626unicode_encode_ucs1(const Py_UNICODE *p,
4627 Py_ssize_t size,
4628 const char *errors,
4629 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630{
4631 /* output object */
4632 PyObject *res;
4633 /* pointers to the beginning and end+1 of input */
4634 const Py_UNICODE *startp = p;
4635 const Py_UNICODE *endp = p + size;
4636 /* pointer to the beginning of the unencodable characters */
4637 /* const Py_UNICODE *badp = NULL; */
4638 /* pointer into the output */
4639 char *str;
4640 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004641 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004642 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4643 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 PyObject *errorHandler = NULL;
4645 PyObject *exc = NULL;
4646 /* the following variable is used for caching string comparisons
4647 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4648 int known_errorHandler = -1;
4649
4650 /* allocate enough for a simple encoding without
4651 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004652 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004653 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004654 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004656 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004657 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 ressize = size;
4659
4660 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004661 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 /* can we encode this? */
4664 if (c<limit) {
4665 /* no overflow check, because we know that the space is enough */
4666 *str++ = (char)c;
4667 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 else {
4670 Py_ssize_t unicodepos = p-startp;
4671 Py_ssize_t requiredsize;
4672 PyObject *repunicode;
4673 Py_ssize_t repsize;
4674 Py_ssize_t newpos;
4675 Py_ssize_t respos;
4676 Py_UNICODE *uni2;
4677 /* startpos for collecting unencodable chars */
4678 const Py_UNICODE *collstart = p;
4679 const Py_UNICODE *collend = p;
4680 /* find all unecodable characters */
4681 while ((collend < endp) && ((*collend)>=limit))
4682 ++collend;
4683 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4684 if (known_errorHandler==-1) {
4685 if ((errors==NULL) || (!strcmp(errors, "strict")))
4686 known_errorHandler = 1;
4687 else if (!strcmp(errors, "replace"))
4688 known_errorHandler = 2;
4689 else if (!strcmp(errors, "ignore"))
4690 known_errorHandler = 3;
4691 else if (!strcmp(errors, "xmlcharrefreplace"))
4692 known_errorHandler = 4;
4693 else
4694 known_errorHandler = 0;
4695 }
4696 switch (known_errorHandler) {
4697 case 1: /* strict */
4698 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4699 goto onError;
4700 case 2: /* replace */
4701 while (collstart++<collend)
4702 *str++ = '?'; /* fall through */
4703 case 3: /* ignore */
4704 p = collend;
4705 break;
4706 case 4: /* xmlcharrefreplace */
4707 respos = str - PyBytes_AS_STRING(res);
4708 /* determine replacement size (temporarily (mis)uses p) */
4709 for (p = collstart, repsize = 0; p < collend; ++p) {
4710 if (*p<10)
4711 repsize += 2+1+1;
4712 else if (*p<100)
4713 repsize += 2+2+1;
4714 else if (*p<1000)
4715 repsize += 2+3+1;
4716 else if (*p<10000)
4717 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004718#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004719 else
4720 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004721#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 else if (*p<100000)
4723 repsize += 2+5+1;
4724 else if (*p<1000000)
4725 repsize += 2+6+1;
4726 else
4727 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004728#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 }
4730 requiredsize = respos+repsize+(endp-collend);
4731 if (requiredsize > ressize) {
4732 if (requiredsize<2*ressize)
4733 requiredsize = 2*ressize;
4734 if (_PyBytes_Resize(&res, requiredsize))
4735 goto onError;
4736 str = PyBytes_AS_STRING(res) + respos;
4737 ressize = requiredsize;
4738 }
4739 /* generate replacement (temporarily (mis)uses p) */
4740 for (p = collstart; p < collend; ++p) {
4741 str += sprintf(str, "&#%d;", (int)*p);
4742 }
4743 p = collend;
4744 break;
4745 default:
4746 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4747 encoding, reason, startp, size, &exc,
4748 collstart-startp, collend-startp, &newpos);
4749 if (repunicode == NULL)
4750 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004751 if (PyBytes_Check(repunicode)) {
4752 /* Directly copy bytes result to output. */
4753 repsize = PyBytes_Size(repunicode);
4754 if (repsize > 1) {
4755 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004756 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004757 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4758 Py_DECREF(repunicode);
4759 goto onError;
4760 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004761 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004762 ressize += repsize-1;
4763 }
4764 memcpy(str, PyBytes_AsString(repunicode), repsize);
4765 str += repsize;
4766 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004767 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004768 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004769 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 /* need more space? (at least enough for what we
4771 have+the replacement+the rest of the string, so
4772 we won't have to check space for encodable characters) */
4773 respos = str - PyBytes_AS_STRING(res);
4774 repsize = PyUnicode_GET_SIZE(repunicode);
4775 requiredsize = respos+repsize+(endp-collend);
4776 if (requiredsize > ressize) {
4777 if (requiredsize<2*ressize)
4778 requiredsize = 2*ressize;
4779 if (_PyBytes_Resize(&res, requiredsize)) {
4780 Py_DECREF(repunicode);
4781 goto onError;
4782 }
4783 str = PyBytes_AS_STRING(res) + respos;
4784 ressize = requiredsize;
4785 }
4786 /* check if there is anything unencodable in the replacement
4787 and copy it to the output */
4788 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4789 c = *uni2;
4790 if (c >= limit) {
4791 raise_encode_exception(&exc, encoding, startp, size,
4792 unicodepos, unicodepos+1, reason);
4793 Py_DECREF(repunicode);
4794 goto onError;
4795 }
4796 *str = (char)c;
4797 }
4798 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004799 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004800 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004801 }
4802 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004803 /* Resize if we allocated to much */
4804 size = str - PyBytes_AS_STRING(res);
4805 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004806 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004807 if (_PyBytes_Resize(&res, size) < 0)
4808 goto onError;
4809 }
4810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 Py_XDECREF(errorHandler);
4812 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004813 return res;
4814
4815 onError:
4816 Py_XDECREF(res);
4817 Py_XDECREF(errorHandler);
4818 Py_XDECREF(exc);
4819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820}
4821
Alexander Belopolsky40018472011-02-26 01:02:56 +00004822PyObject *
4823PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4824 Py_ssize_t size,
4825 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828}
4829
Alexander Belopolsky40018472011-02-26 01:02:56 +00004830PyObject *
4831PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832{
4833 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 PyErr_BadArgument();
4835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 }
4837 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 PyUnicode_GET_SIZE(unicode),
4839 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840}
4841
4842/* --- 7-bit ASCII Codec -------------------------------------------------- */
4843
Alexander Belopolsky40018472011-02-26 01:02:56 +00004844PyObject *
4845PyUnicode_DecodeASCII(const char *s,
4846 Py_ssize_t size,
4847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 PyUnicodeObject *v;
4851 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t startinpos;
4853 Py_ssize_t endinpos;
4854 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 const char *e;
4856 PyObject *errorHandler = NULL;
4857 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004858
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004860 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 Py_UNICODE r = *(unsigned char*)s;
4862 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004863 }
Tim Petersced69f82003-09-16 20:30:58 +00004864
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 v = _PyUnicode_New(size);
4866 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 e = s + size;
4872 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004873 register unsigned char c = (unsigned char)*s;
4874 if (c < 128) {
4875 *p++ = c;
4876 ++s;
4877 }
4878 else {
4879 startinpos = s-starts;
4880 endinpos = startinpos + 1;
4881 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4882 if (unicode_decode_call_errorhandler(
4883 errors, &errorHandler,
4884 "ascii", "ordinal not in range(128)",
4885 &starts, &e, &startinpos, &endinpos, &exc, &s,
4886 &v, &outpos, &p))
4887 goto onError;
4888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004890 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4892 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 Py_XDECREF(errorHandler);
4894 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004896
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 Py_XDECREF(errorHandler);
4900 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 return NULL;
4902}
4903
Alexander Belopolsky40018472011-02-26 01:02:56 +00004904PyObject *
4905PyUnicode_EncodeASCII(const Py_UNICODE *p,
4906 Py_ssize_t size,
4907 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004909 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910}
4911
Alexander Belopolsky40018472011-02-26 01:02:56 +00004912PyObject *
4913PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914{
4915 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 PyErr_BadArgument();
4917 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 }
4919 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 PyUnicode_GET_SIZE(unicode),
4921 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922}
4923
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004924#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004925
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004926/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004927
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004928#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004929#define NEED_RETRY
4930#endif
4931
4932/* XXX This code is limited to "true" double-byte encodings, as
4933 a) it assumes an incomplete character consists of a single byte, and
4934 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004936
/* Return 1 if the byte at s[offset] is a DBCS lead byte that is not itself
   the trail byte of a preceding character, 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev did not move: curr is at the start of the string. */
    if (prev == curr)
        return 1;
    /* The previous character does not start with a lead byte. */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* The previous character is a complete two-byte character. */
    return curr - prev == 2;
}
4948
4949/*
4950 * Decode MBCS string into unicode object. If 'final' is set, converts
4951 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4952 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004953static int
4954decode_mbcs(PyUnicodeObject **v,
4955 const char *s, /* MBCS string */
4956 int size, /* sizeof MBCS string */
4957 int final,
4958 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004959{
4960 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004961 Py_ssize_t n;
4962 DWORD usize;
4963 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004964
4965 assert(size >= 0);
4966
Victor Stinner554f3f02010-06-16 23:33:54 +00004967 /* check and handle 'errors' arg */
4968 if (errors==NULL || strcmp(errors, "strict")==0)
4969 flags = MB_ERR_INVALID_CHARS;
4970 else if (strcmp(errors, "ignore")==0)
4971 flags = 0;
4972 else {
4973 PyErr_Format(PyExc_ValueError,
4974 "mbcs encoding does not support errors='%s'",
4975 errors);
4976 return -1;
4977 }
4978
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004979 /* Skip trailing lead-byte unless 'final' is set */
4980 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004982
4983 /* First get the size of the result */
4984 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004985 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4986 if (usize==0)
4987 goto mbcs_decode_error;
4988 } else
4989 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004990
4991 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 /* Create unicode object */
4993 *v = _PyUnicode_New(usize);
4994 if (*v == NULL)
4995 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004996 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004997 }
4998 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 /* Extend unicode object */
5000 n = PyUnicode_GET_SIZE(*v);
5001 if (_PyUnicode_Resize(v, n + usize) < 0)
5002 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005003 }
5004
5005 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00005006 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005008 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5009 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005012 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005013
5014mbcs_decode_error:
5015 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5016 we raise a UnicodeDecodeError - else it is a 'generic'
5017 windows error
5018 */
5019 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5020 /* Ideally, we should get reason from FormatMessage - this
5021 is the Windows 2000 English version of the message
5022 */
5023 PyObject *exc = NULL;
5024 const char *reason = "No mapping for the Unicode character exists "
5025 "in the target multi-byte code page.";
5026 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5027 if (exc != NULL) {
5028 PyCodec_StrictErrors(exc);
5029 Py_DECREF(exc);
5030 }
5031 } else {
5032 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5033 }
5034 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005035}
5036
Alexander Belopolsky40018472011-02-26 01:02:56 +00005037PyObject *
5038PyUnicode_DecodeMBCSStateful(const char *s,
5039 Py_ssize_t size,
5040 const char *errors,
5041 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005042{
5043 PyUnicodeObject *v = NULL;
5044 int done;
5045
5046 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005048
5049#ifdef NEED_RETRY
5050 retry:
5051 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005052 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005053 else
5054#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005055 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005056
5057 if (done < 0) {
5058 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005060 }
5061
5062 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005064
5065#ifdef NEED_RETRY
5066 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 s += done;
5068 size -= done;
5069 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005070 }
5071#endif
5072
5073 return (PyObject *)v;
5074}
5075
Alexander Belopolsky40018472011-02-26 01:02:56 +00005076PyObject *
5077PyUnicode_DecodeMBCS(const char *s,
5078 Py_ssize_t size,
5079 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005080{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005081 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5082}
5083
5084/*
5085 * Convert unicode into string object (MBCS).
5086 * Returns 0 if succeed, -1 otherwise.
5087 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005088static int
5089encode_mbcs(PyObject **repr,
5090 const Py_UNICODE *p, /* unicode */
5091 int size, /* size of unicode */
5092 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005093{
Victor Stinner554f3f02010-06-16 23:33:54 +00005094 BOOL usedDefaultChar = FALSE;
5095 BOOL *pusedDefaultChar;
5096 int mbcssize;
5097 Py_ssize_t n;
5098 PyObject *exc = NULL;
5099 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005100
5101 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005102
Victor Stinner554f3f02010-06-16 23:33:54 +00005103 /* check and handle 'errors' arg */
5104 if (errors==NULL || strcmp(errors, "strict")==0) {
5105 flags = WC_NO_BEST_FIT_CHARS;
5106 pusedDefaultChar = &usedDefaultChar;
5107 } else if (strcmp(errors, "replace")==0) {
5108 flags = 0;
5109 pusedDefaultChar = NULL;
5110 } else {
5111 PyErr_Format(PyExc_ValueError,
5112 "mbcs encoding does not support errors='%s'",
5113 errors);
5114 return -1;
5115 }
5116
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005117 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005118 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005119 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5120 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 if (mbcssize == 0) {
5122 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5123 return -1;
5124 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005125 /* If we used a default char, then we failed! */
5126 if (pusedDefaultChar && *pusedDefaultChar)
5127 goto mbcs_encode_error;
5128 } else {
5129 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005130 }
5131
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005132 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 /* Create string object */
5134 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5135 if (*repr == NULL)
5136 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005137 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005138 }
5139 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 /* Extend string object */
5141 n = PyBytes_Size(*repr);
5142 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5143 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005144 }
5145
5146 /* Do the conversion */
5147 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005149 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5150 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5152 return -1;
5153 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005154 if (pusedDefaultChar && *pusedDefaultChar)
5155 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005156 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005157 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005158
5159mbcs_encode_error:
5160 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5161 Py_XDECREF(exc);
5162 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005163}
5164
Alexander Belopolsky40018472011-02-26 01:02:56 +00005165PyObject *
5166PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5167 Py_ssize_t size,
5168 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005169{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005170 PyObject *repr = NULL;
5171 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005172
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005173#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005175 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005176 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005177 else
5178#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005179 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005180
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005181 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 Py_XDECREF(repr);
5183 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005184 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005185
5186#ifdef NEED_RETRY
5187 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 p += INT_MAX;
5189 size -= INT_MAX;
5190 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005191 }
5192#endif
5193
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005194 return repr;
5195}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005196
Alexander Belopolsky40018472011-02-26 01:02:56 +00005197PyObject *
5198PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005199{
5200 if (!PyUnicode_Check(unicode)) {
5201 PyErr_BadArgument();
5202 return NULL;
5203 }
5204 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 PyUnicode_GET_SIZE(unicode),
5206 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005207}
5208
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005209#undef NEED_RETRY
5210
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005211#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005212
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213/* --- Character Mapping Codec -------------------------------------------- */
5214
Alexander Belopolsky40018472011-02-26 01:02:56 +00005215PyObject *
5216PyUnicode_DecodeCharmap(const char *s,
5217 Py_ssize_t size,
5218 PyObject *mapping,
5219 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005221 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005222 Py_ssize_t startinpos;
5223 Py_ssize_t endinpos;
5224 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 PyUnicodeObject *v;
5227 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005228 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 PyObject *errorHandler = NULL;
5230 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005231 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005233
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 /* Default to Latin-1 */
5235 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237
5238 v = _PyUnicode_New(size);
5239 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005245 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 mapstring = PyUnicode_AS_UNICODE(mapping);
5247 maplen = PyUnicode_GET_SIZE(mapping);
5248 while (s < e) {
5249 unsigned char ch = *s;
5250 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 if (ch < maplen)
5253 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 if (x == 0xfffe) {
5256 /* undefined mapping */
5257 outpos = p-PyUnicode_AS_UNICODE(v);
5258 startinpos = s-starts;
5259 endinpos = startinpos+1;
5260 if (unicode_decode_call_errorhandler(
5261 errors, &errorHandler,
5262 "charmap", "character maps to <undefined>",
5263 &starts, &e, &startinpos, &endinpos, &exc, &s,
5264 &v, &outpos, &p)) {
5265 goto onError;
5266 }
5267 continue;
5268 }
5269 *p++ = x;
5270 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005271 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005272 }
5273 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 while (s < e) {
5275 unsigned char ch = *s;
5276 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005277
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5279 w = PyLong_FromLong((long)ch);
5280 if (w == NULL)
5281 goto onError;
5282 x = PyObject_GetItem(mapping, w);
5283 Py_DECREF(w);
5284 if (x == NULL) {
5285 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5286 /* No mapping found means: mapping is undefined. */
5287 PyErr_Clear();
5288 x = Py_None;
5289 Py_INCREF(x);
5290 } else
5291 goto onError;
5292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005293
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 /* Apply mapping */
5295 if (PyLong_Check(x)) {
5296 long value = PyLong_AS_LONG(x);
5297 if (value < 0 || value > 65535) {
5298 PyErr_SetString(PyExc_TypeError,
5299 "character mapping must be in range(65536)");
5300 Py_DECREF(x);
5301 goto onError;
5302 }
5303 *p++ = (Py_UNICODE)value;
5304 }
5305 else if (x == Py_None) {
5306 /* undefined mapping */
5307 outpos = p-PyUnicode_AS_UNICODE(v);
5308 startinpos = s-starts;
5309 endinpos = startinpos+1;
5310 if (unicode_decode_call_errorhandler(
5311 errors, &errorHandler,
5312 "charmap", "character maps to <undefined>",
5313 &starts, &e, &startinpos, &endinpos, &exc, &s,
5314 &v, &outpos, &p)) {
5315 Py_DECREF(x);
5316 goto onError;
5317 }
5318 Py_DECREF(x);
5319 continue;
5320 }
5321 else if (PyUnicode_Check(x)) {
5322 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005323
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 if (targetsize == 1)
5325 /* 1-1 mapping */
5326 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005327
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 else if (targetsize > 1) {
5329 /* 1-n mapping */
5330 if (targetsize > extrachars) {
5331 /* resize first */
5332 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5333 Py_ssize_t needed = (targetsize - extrachars) + \
5334 (targetsize << 2);
5335 extrachars += needed;
5336 /* XXX overflow detection missing */
5337 if (_PyUnicode_Resize(&v,
5338 PyUnicode_GET_SIZE(v) + needed) < 0) {
5339 Py_DECREF(x);
5340 goto onError;
5341 }
5342 p = PyUnicode_AS_UNICODE(v) + oldpos;
5343 }
5344 Py_UNICODE_COPY(p,
5345 PyUnicode_AS_UNICODE(x),
5346 targetsize);
5347 p += targetsize;
5348 extrachars -= targetsize;
5349 }
5350 /* 1-0 mapping: skip the character */
5351 }
5352 else {
5353 /* wrong return value */
5354 PyErr_SetString(PyExc_TypeError,
5355 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005356 Py_DECREF(x);
5357 goto onError;
5358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 Py_DECREF(x);
5360 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005361 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 }
5363 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5365 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 Py_XDECREF(errorHandler);
5367 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005369
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 Py_XDECREF(errorHandler);
5372 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 Py_XDECREF(v);
5374 return NULL;
5375}
5376
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005377/* Charmap encoding: the lookup table */
5378
Alexander Belopolsky40018472011-02-26 01:02:56 +00005379struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 PyObject_HEAD
5381 unsigned char level1[32];
5382 int count2, count3;
5383 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005384};
5385
5386static PyObject*
5387encoding_map_size(PyObject *obj, PyObject* args)
5388{
5389 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005390 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005392}
5393
5394static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005395 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 PyDoc_STR("Return the size (in bytes) of this object") },
5397 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005398};
5399
5400static void
5401encoding_map_dealloc(PyObject* o)
5402{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005403 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005404}
5405
5406static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005407 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 "EncodingMap", /*tp_name*/
5409 sizeof(struct encoding_map), /*tp_basicsize*/
5410 0, /*tp_itemsize*/
5411 /* methods */
5412 encoding_map_dealloc, /*tp_dealloc*/
5413 0, /*tp_print*/
5414 0, /*tp_getattr*/
5415 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005416 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 0, /*tp_repr*/
5418 0, /*tp_as_number*/
5419 0, /*tp_as_sequence*/
5420 0, /*tp_as_mapping*/
5421 0, /*tp_hash*/
5422 0, /*tp_call*/
5423 0, /*tp_str*/
5424 0, /*tp_getattro*/
5425 0, /*tp_setattro*/
5426 0, /*tp_as_buffer*/
5427 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5428 0, /*tp_doc*/
5429 0, /*tp_traverse*/
5430 0, /*tp_clear*/
5431 0, /*tp_richcompare*/
5432 0, /*tp_weaklistoffset*/
5433 0, /*tp_iter*/
5434 0, /*tp_iternext*/
5435 encoding_map_methods, /*tp_methods*/
5436 0, /*tp_members*/
5437 0, /*tp_getset*/
5438 0, /*tp_base*/
5439 0, /*tp_dict*/
5440 0, /*tp_descr_get*/
5441 0, /*tp_descr_set*/
5442 0, /*tp_dictoffset*/
5443 0, /*tp_init*/
5444 0, /*tp_alloc*/
5445 0, /*tp_new*/
5446 0, /*tp_free*/
5447 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005448};
5449
5450PyObject*
5451PyUnicode_BuildEncodingMap(PyObject* string)
5452{
5453 Py_UNICODE *decode;
5454 PyObject *result;
5455 struct encoding_map *mresult;
5456 int i;
5457 int need_dict = 0;
5458 unsigned char level1[32];
5459 unsigned char level2[512];
5460 unsigned char *mlevel1, *mlevel2, *mlevel3;
5461 int count2 = 0, count3 = 0;
5462
5463 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5464 PyErr_BadArgument();
5465 return NULL;
5466 }
5467 decode = PyUnicode_AS_UNICODE(string);
5468 memset(level1, 0xFF, sizeof level1);
5469 memset(level2, 0xFF, sizeof level2);
5470
5471 /* If there isn't a one-to-one mapping of NULL to \0,
5472 or if there are non-BMP characters, we need to use
5473 a mapping dictionary. */
5474 if (decode[0] != 0)
5475 need_dict = 1;
5476 for (i = 1; i < 256; i++) {
5477 int l1, l2;
5478 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005479#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005480 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005481#endif
5482 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005483 need_dict = 1;
5484 break;
5485 }
5486 if (decode[i] == 0xFFFE)
5487 /* unmapped character */
5488 continue;
5489 l1 = decode[i] >> 11;
5490 l2 = decode[i] >> 7;
5491 if (level1[l1] == 0xFF)
5492 level1[l1] = count2++;
5493 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005494 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005495 }
5496
5497 if (count2 >= 0xFF || count3 >= 0xFF)
5498 need_dict = 1;
5499
5500 if (need_dict) {
5501 PyObject *result = PyDict_New();
5502 PyObject *key, *value;
5503 if (!result)
5504 return NULL;
5505 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005506 key = PyLong_FromLong(decode[i]);
5507 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005508 if (!key || !value)
5509 goto failed1;
5510 if (PyDict_SetItem(result, key, value) == -1)
5511 goto failed1;
5512 Py_DECREF(key);
5513 Py_DECREF(value);
5514 }
5515 return result;
5516 failed1:
5517 Py_XDECREF(key);
5518 Py_XDECREF(value);
5519 Py_DECREF(result);
5520 return NULL;
5521 }
5522
5523 /* Create a three-level trie */
5524 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5525 16*count2 + 128*count3 - 1);
5526 if (!result)
5527 return PyErr_NoMemory();
5528 PyObject_Init(result, &EncodingMapType);
5529 mresult = (struct encoding_map*)result;
5530 mresult->count2 = count2;
5531 mresult->count3 = count3;
5532 mlevel1 = mresult->level1;
5533 mlevel2 = mresult->level23;
5534 mlevel3 = mresult->level23 + 16*count2;
5535 memcpy(mlevel1, level1, 32);
5536 memset(mlevel2, 0xFF, 16*count2);
5537 memset(mlevel3, 0, 128*count3);
5538 count3 = 0;
5539 for (i = 1; i < 256; i++) {
5540 int o1, o2, o3, i2, i3;
5541 if (decode[i] == 0xFFFE)
5542 /* unmapped character */
5543 continue;
5544 o1 = decode[i]>>11;
5545 o2 = (decode[i]>>7) & 0xF;
5546 i2 = 16*mlevel1[o1] + o2;
5547 if (mlevel2[i2] == 0xFF)
5548 mlevel2[i2] = count3++;
5549 o3 = decode[i] & 0x7F;
5550 i3 = 128*mlevel2[i2] + o3;
5551 mlevel3[i3] = i;
5552 }
5553 return result;
5554}
5555
5556static int
5557encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5558{
5559 struct encoding_map *map = (struct encoding_map*)mapping;
5560 int l1 = c>>11;
5561 int l2 = (c>>7) & 0xF;
5562 int l3 = c & 0x7F;
5563 int i;
5564
5565#ifdef Py_UNICODE_WIDE
5566 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005568 }
5569#endif
5570 if (c == 0)
5571 return 0;
5572 /* level 1*/
5573 i = map->level1[l1];
5574 if (i == 0xFF) {
5575 return -1;
5576 }
5577 /* level 2*/
5578 i = map->level23[16*i+l2];
5579 if (i == 0xFF) {
5580 return -1;
5581 }
5582 /* level 3 */
5583 i = map->level23[16*map->count2 + 128*i + l3];
5584 if (i == 0) {
5585 return -1;
5586 }
5587 return i;
5588}
5589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590/* Lookup the character ch in the mapping. If the character
5591 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005592 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005593static PyObject *
5594charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595{
Christian Heimes217cfd12007-12-02 14:31:20 +00005596 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 PyObject *x;
5598
5599 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005601 x = PyObject_GetItem(mapping, w);
5602 Py_DECREF(w);
5603 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5605 /* No mapping found means: mapping is undefined. */
5606 PyErr_Clear();
5607 x = Py_None;
5608 Py_INCREF(x);
5609 return x;
5610 } else
5611 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005613 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005615 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 long value = PyLong_AS_LONG(x);
5617 if (value < 0 || value > 255) {
5618 PyErr_SetString(PyExc_TypeError,
5619 "character mapping must be in range(256)");
5620 Py_DECREF(x);
5621 return NULL;
5622 }
5623 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005625 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 /* wrong return value */
5629 PyErr_Format(PyExc_TypeError,
5630 "character mapping must return integer, bytes or None, not %.400s",
5631 x->ob_type->tp_name);
5632 Py_DECREF(x);
5633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 }
5635}
5636
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005637static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005638charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005639{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005640 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5641 /* exponentially overallocate to minimize reallocations */
5642 if (requiredsize < 2*outsize)
5643 requiredsize = 2*outsize;
5644 if (_PyBytes_Resize(outobj, requiredsize))
5645 return -1;
5646 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005647}
5648
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005652/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005653 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 space is available. Return a new reference to the object that
5655 was put in the output buffer, or Py_None, if the mapping was undefined
5656 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005657 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005658static charmapencode_result
5659charmapencode_output(Py_UNICODE c, PyObject *mapping,
5660 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005662 PyObject *rep;
5663 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005664 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665
Christian Heimes90aa7642007-12-19 02:45:37 +00005666 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005667 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005669 if (res == -1)
5670 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 if (outsize<requiredsize)
5672 if (charmapencode_resize(outobj, outpos, requiredsize))
5673 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005674 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 outstart[(*outpos)++] = (char)res;
5676 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005677 }
5678
5679 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005682 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 Py_DECREF(rep);
5684 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005685 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 if (PyLong_Check(rep)) {
5687 Py_ssize_t requiredsize = *outpos+1;
5688 if (outsize<requiredsize)
5689 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5690 Py_DECREF(rep);
5691 return enc_EXCEPTION;
5692 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005693 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005695 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 else {
5697 const char *repchars = PyBytes_AS_STRING(rep);
5698 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5699 Py_ssize_t requiredsize = *outpos+repsize;
5700 if (outsize<requiredsize)
5701 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5702 Py_DECREF(rep);
5703 return enc_EXCEPTION;
5704 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005705 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 memcpy(outstart + *outpos, repchars, repsize);
5707 *outpos += repsize;
5708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005710 Py_DECREF(rep);
5711 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712}
5713
5714/* handle an error in PyUnicode_EncodeCharmap
5715 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005716static int
5717charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005718 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005720 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005721 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722{
5723 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005724 Py_ssize_t repsize;
5725 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 Py_UNICODE *uni2;
5727 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005728 Py_ssize_t collstartpos = *inpos;
5729 Py_ssize_t collendpos = *inpos+1;
5730 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 char *encoding = "charmap";
5732 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005733 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 /* find all unencodable characters */
5736 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005737 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005738 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 int res = encoding_map_lookup(p[collendpos], mapping);
5740 if (res != -1)
5741 break;
5742 ++collendpos;
5743 continue;
5744 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 rep = charmapencode_lookup(p[collendpos], mapping);
5747 if (rep==NULL)
5748 return -1;
5749 else if (rep!=Py_None) {
5750 Py_DECREF(rep);
5751 break;
5752 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005753 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 }
5756 /* cache callback name lookup
5757 * (if not done yet, i.e. it's the first error) */
5758 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 if ((errors==NULL) || (!strcmp(errors, "strict")))
5760 *known_errorHandler = 1;
5761 else if (!strcmp(errors, "replace"))
5762 *known_errorHandler = 2;
5763 else if (!strcmp(errors, "ignore"))
5764 *known_errorHandler = 3;
5765 else if (!strcmp(errors, "xmlcharrefreplace"))
5766 *known_errorHandler = 4;
5767 else
5768 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 }
5770 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005771 case 1: /* strict */
5772 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5773 return -1;
5774 case 2: /* replace */
5775 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 x = charmapencode_output('?', mapping, res, respos);
5777 if (x==enc_EXCEPTION) {
5778 return -1;
5779 }
5780 else if (x==enc_FAILED) {
5781 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5782 return -1;
5783 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005784 }
5785 /* fall through */
5786 case 3: /* ignore */
5787 *inpos = collendpos;
5788 break;
5789 case 4: /* xmlcharrefreplace */
5790 /* generate replacement (temporarily (mis)uses p) */
5791 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 char buffer[2+29+1+1];
5793 char *cp;
5794 sprintf(buffer, "&#%d;", (int)p[collpos]);
5795 for (cp = buffer; *cp; ++cp) {
5796 x = charmapencode_output(*cp, mapping, res, respos);
5797 if (x==enc_EXCEPTION)
5798 return -1;
5799 else if (x==enc_FAILED) {
5800 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5801 return -1;
5802 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005803 }
5804 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005805 *inpos = collendpos;
5806 break;
5807 default:
5808 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 encoding, reason, p, size, exceptionObject,
5810 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005811 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005813 if (PyBytes_Check(repunicode)) {
5814 /* Directly copy bytes result to output. */
5815 Py_ssize_t outsize = PyBytes_Size(*res);
5816 Py_ssize_t requiredsize;
5817 repsize = PyBytes_Size(repunicode);
5818 requiredsize = *respos + repsize;
5819 if (requiredsize > outsize)
5820 /* Make room for all additional bytes. */
5821 if (charmapencode_resize(res, respos, requiredsize)) {
5822 Py_DECREF(repunicode);
5823 return -1;
5824 }
5825 memcpy(PyBytes_AsString(*res) + *respos,
5826 PyBytes_AsString(repunicode), repsize);
5827 *respos += repsize;
5828 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005829 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005830 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005831 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005832 /* generate replacement */
5833 repsize = PyUnicode_GET_SIZE(repunicode);
5834 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 x = charmapencode_output(*uni2, mapping, res, respos);
5836 if (x==enc_EXCEPTION) {
5837 return -1;
5838 }
5839 else if (x==enc_FAILED) {
5840 Py_DECREF(repunicode);
5841 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5842 return -1;
5843 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005844 }
5845 *inpos = newpos;
5846 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847 }
5848 return 0;
5849}
5850
Alexander Belopolsky40018472011-02-26 01:02:56 +00005851PyObject *
5852PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5853 Py_ssize_t size,
5854 PyObject *mapping,
5855 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 /* output object */
5858 PyObject *res = NULL;
5859 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005860 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005862 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 PyObject *errorHandler = NULL;
5864 PyObject *exc = NULL;
5865 /* the following variable is used for caching string comparisons
5866 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5867 * 3=ignore, 4=xmlcharrefreplace */
5868 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
5870 /* Default to Latin-1 */
5871 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 /* allocate enough for a simple encoding without
5875 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005876 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 if (res == NULL)
5878 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005879 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 /* try to encode it */
5884 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5885 if (x==enc_EXCEPTION) /* error */
5886 goto onError;
5887 if (x==enc_FAILED) { /* unencodable character */
5888 if (charmap_encoding_error(p, size, &inpos, mapping,
5889 &exc,
5890 &known_errorHandler, &errorHandler, errors,
5891 &res, &respos)) {
5892 goto onError;
5893 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005894 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 else
5896 /* done with this character => adjust input position */
5897 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005901 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005902 if (_PyBytes_Resize(&res, respos) < 0)
5903 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005904
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 Py_XDECREF(exc);
5906 Py_XDECREF(errorHandler);
5907 return res;
5908
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 Py_XDECREF(res);
5911 Py_XDECREF(exc);
5912 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 return NULL;
5914}
5915
Alexander Belopolsky40018472011-02-26 01:02:56 +00005916PyObject *
5917PyUnicode_AsCharmapString(PyObject *unicode,
5918 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919{
5920 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 PyErr_BadArgument();
5922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 }
5924 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 PyUnicode_GET_SIZE(unicode),
5926 mapping,
5927 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928}
5929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005931static void
5932make_translate_exception(PyObject **exceptionObject,
5933 const Py_UNICODE *unicode, Py_ssize_t size,
5934 Py_ssize_t startpos, Py_ssize_t endpos,
5935 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005937 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005938 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 }
5941 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5943 goto onError;
5944 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5945 goto onError;
5946 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5947 goto onError;
5948 return;
5949 onError:
5950 Py_DECREF(*exceptionObject);
5951 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 }
5953}
5954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005955/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005956static void
5957raise_translate_exception(PyObject **exceptionObject,
5958 const Py_UNICODE *unicode, Py_ssize_t size,
5959 Py_ssize_t startpos, Py_ssize_t endpos,
5960 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961{
5962 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005964 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966}
5967
5968/* error handling callback helper:
5969 build arguments, call the callback and check the arguments,
5970 put the result into newpos and return the replacement string, which
5971 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005972static PyObject *
5973unicode_translate_call_errorhandler(const char *errors,
5974 PyObject **errorHandler,
5975 const char *reason,
5976 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5977 Py_ssize_t startpos, Py_ssize_t endpos,
5978 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005979{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005980 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005981
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005982 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983 PyObject *restuple;
5984 PyObject *resunicode;
5985
5986 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005988 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005990 }
5991
5992 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996
5997 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005999 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006001 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00006002 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 Py_DECREF(restuple);
6004 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005 }
6006 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 &resunicode, &i_newpos)) {
6008 Py_DECREF(restuple);
6009 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006011 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006013 else
6014 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006015 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6017 Py_DECREF(restuple);
6018 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006019 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 Py_INCREF(resunicode);
6021 Py_DECREF(restuple);
6022 return resunicode;
6023}
6024
6025/* Lookup the character ch in the mapping and put the result in result,
6026 which must be decrefed by the caller.
6027 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006028static int
6029charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006030{
Christian Heimes217cfd12007-12-02 14:31:20 +00006031 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032 PyObject *x;
6033
6034 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036 x = PyObject_GetItem(mapping, w);
6037 Py_DECREF(w);
6038 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6040 /* No mapping found means: use 1:1 mapping. */
6041 PyErr_Clear();
6042 *result = NULL;
6043 return 0;
6044 } else
6045 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 }
6047 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 *result = x;
6049 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006051 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 long value = PyLong_AS_LONG(x);
6053 long max = PyUnicode_GetMax();
6054 if (value < 0 || value > max) {
6055 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006056 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 Py_DECREF(x);
6058 return -1;
6059 }
6060 *result = x;
6061 return 0;
6062 }
6063 else if (PyUnicode_Check(x)) {
6064 *result = x;
6065 return 0;
6066 }
6067 else {
6068 /* wrong return value */
6069 PyErr_SetString(PyExc_TypeError,
6070 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006071 Py_DECREF(x);
6072 return -1;
6073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074}
6075/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 if not reallocate and adjust various state variables.
6077 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006078static int
6079charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006082 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006083 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 /* remember old output position */
6085 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6086 /* exponentially overallocate to minimize reallocations */
6087 if (requiredsize < 2 * oldsize)
6088 requiredsize = 2 * oldsize;
6089 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6090 return -1;
6091 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 }
6093 return 0;
6094}
6095/* lookup the character, put the result in the output string and adjust
6096 various state variables. Return a new reference to the object that
6097 was put in the output buffer in *result, or Py_None, if the mapping was
6098 undefined (in which case no character was written).
6099 The called must decref result.
6100 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006101static int
6102charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6103 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6104 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006105{
Walter Dörwald4894c302003-10-24 14:25:28 +00006106 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 /* not found => default to 1:1 mapping */
6110 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 }
6112 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006114 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 /* no overflow check, because we know that the space is enough */
6116 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006117 }
6118 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6120 if (repsize==1) {
6121 /* no overflow check, because we know that the space is enough */
6122 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6123 }
6124 else if (repsize!=0) {
6125 /* more than one character */
6126 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6127 (insize - (curinp-startinp)) +
6128 repsize - 1;
6129 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6130 return -1;
6131 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6132 *outp += repsize;
6133 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 }
6135 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 return 0;
6138}
6139
/* Translate the `size` characters starting at `p` through `mapping` and
   return the result as a new unicode object, or NULL with an exception
   set.

   Each character is looked up with charmaptranslate_output().  A lookup
   result of None marks the character as untranslatable; runs of such
   characters are handled according to `errors`: NULL or "strict" raise,
   "replace" writes one '?' per character, "ignore" drops them,
   "xmlcharrefreplace" writes "&#N;" references, and any other name is
   resolved to an error-handler callback. */
PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                           Py_ssize_t size,
                           PyObject *mapping,
                           const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    /* handler callback and exception object, created lazily on the first
       error that needs them and released on every exit path */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is released here; below it is only compared by pointer
           value against Py_None, never dereferenced */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters: extend collend while
               the mapping keeps returning None */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* No need to check for space, this is a 1:1 replacement */
                for (coll = collstart; coll<collend; ++coll)
                    *str++ = '?';
                /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    char buffer[2+29+1+1];
                    char *cp;
                    sprintf(buffer, "&#%d;", (int)*p);
                    /* room for what is written so far, this reference,
                       and the input that follows the collected run */
                    if (charmaptranslate_makespace(&res, &str,
                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                        goto onError;
                    for (cp = buffer; *cp; ++cp)
                        *str++ = *cp;
                }
                p = collend;
                break;
            default:
                /* any other name: delegate to the error-handler callback */
                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                                 reason, startp, size, &exc,
                                                                 collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* generate replacement */
                repsize = PyUnicode_GET_SIZE(repunicode);
                if (charmaptranslate_makespace(&res, &str,
                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                    Py_DECREF(repunicode);
                    goto onError;
                }
                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                    *str++ = *uni2;
                /* continue at the position chosen by the handler */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
6282
Alexander Belopolsky40018472011-02-26 01:02:56 +00006283PyObject *
6284PyUnicode_Translate(PyObject *str,
6285 PyObject *mapping,
6286 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287{
6288 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006289
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 str = PyUnicode_FromObject(str);
6291 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 PyUnicode_GET_SIZE(str),
6295 mapping,
6296 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 Py_DECREF(str);
6298 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006299
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 Py_XDECREF(str);
6302 return NULL;
6303}
Tim Petersced69f82003-09-16 20:30:58 +00006304
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006305PyObject *
6306PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6307 Py_ssize_t length)
6308{
6309 PyObject *result;
6310 Py_UNICODE *p; /* write pointer into result */
6311 Py_ssize_t i;
6312 /* Copy to a new string */
6313 result = (PyObject *)_PyUnicode_New(length);
6314 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6315 if (result == NULL)
6316 return result;
6317 p = PyUnicode_AS_UNICODE(result);
6318 /* Iterate over code points */
6319 for (i = 0; i < length; i++) {
6320 Py_UNICODE ch =s[i];
6321 if (ch > 127) {
6322 int decimal = Py_UNICODE_TODECIMAL(ch);
6323 if (decimal >= 0)
6324 p[i] = '0' + decimal;
6325 }
6326 }
6327 return result;
6328}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006329/* --- Decimal Encoder ---------------------------------------------------- */
6330
Alexander Belopolsky40018472011-02-26 01:02:56 +00006331int
6332PyUnicode_EncodeDecimal(Py_UNICODE *s,
6333 Py_ssize_t length,
6334 char *output,
6335 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006336{
6337 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338 PyObject *errorHandler = NULL;
6339 PyObject *exc = NULL;
6340 const char *encoding = "decimal";
6341 const char *reason = "invalid decimal Unicode string";
6342 /* the following variable is used for caching string comparisons
6343 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6344 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006345
6346 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 PyErr_BadArgument();
6348 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006349 }
6350
6351 p = s;
6352 end = s + length;
6353 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 register Py_UNICODE ch = *p;
6355 int decimal;
6356 PyObject *repunicode;
6357 Py_ssize_t repsize;
6358 Py_ssize_t newpos;
6359 Py_UNICODE *uni2;
6360 Py_UNICODE *collstart;
6361 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006362
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006364 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 ++p;
6366 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006367 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 decimal = Py_UNICODE_TODECIMAL(ch);
6369 if (decimal >= 0) {
6370 *output++ = '0' + decimal;
6371 ++p;
6372 continue;
6373 }
6374 if (0 < ch && ch < 256) {
6375 *output++ = (char)ch;
6376 ++p;
6377 continue;
6378 }
6379 /* All other characters are considered unencodable */
6380 collstart = p;
6381 collend = p+1;
6382 while (collend < end) {
6383 if ((0 < *collend && *collend < 256) ||
6384 !Py_UNICODE_ISSPACE(*collend) ||
6385 Py_UNICODE_TODECIMAL(*collend))
6386 break;
6387 }
6388 /* cache callback name lookup
6389 * (if not done yet, i.e. it's the first error) */
6390 if (known_errorHandler==-1) {
6391 if ((errors==NULL) || (!strcmp(errors, "strict")))
6392 known_errorHandler = 1;
6393 else if (!strcmp(errors, "replace"))
6394 known_errorHandler = 2;
6395 else if (!strcmp(errors, "ignore"))
6396 known_errorHandler = 3;
6397 else if (!strcmp(errors, "xmlcharrefreplace"))
6398 known_errorHandler = 4;
6399 else
6400 known_errorHandler = 0;
6401 }
6402 switch (known_errorHandler) {
6403 case 1: /* strict */
6404 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6405 goto onError;
6406 case 2: /* replace */
6407 for (p = collstart; p < collend; ++p)
6408 *output++ = '?';
6409 /* fall through */
6410 case 3: /* ignore */
6411 p = collend;
6412 break;
6413 case 4: /* xmlcharrefreplace */
6414 /* generate replacement (temporarily (mis)uses p) */
6415 for (p = collstart; p < collend; ++p)
6416 output += sprintf(output, "&#%d;", (int)*p);
6417 p = collend;
6418 break;
6419 default:
6420 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6421 encoding, reason, s, length, &exc,
6422 collstart-s, collend-s, &newpos);
6423 if (repunicode == NULL)
6424 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006425 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006426 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006427 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6428 Py_DECREF(repunicode);
6429 goto onError;
6430 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 /* generate replacement */
6432 repsize = PyUnicode_GET_SIZE(repunicode);
6433 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6434 Py_UNICODE ch = *uni2;
6435 if (Py_UNICODE_ISSPACE(ch))
6436 *output++ = ' ';
6437 else {
6438 decimal = Py_UNICODE_TODECIMAL(ch);
6439 if (decimal >= 0)
6440 *output++ = '0' + decimal;
6441 else if (0 < ch && ch < 256)
6442 *output++ = (char)ch;
6443 else {
6444 Py_DECREF(repunicode);
6445 raise_encode_exception(&exc, encoding,
6446 s, length, collstart-s, collend-s, reason);
6447 goto onError;
6448 }
6449 }
6450 }
6451 p = s + newpos;
6452 Py_DECREF(repunicode);
6453 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006454 }
6455 /* 0-terminate the output string */
6456 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 Py_XDECREF(exc);
6458 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006459 return 0;
6460
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462 Py_XDECREF(exc);
6463 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006464 return -1;
6465}
6466
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467/* --- Helpers ------------------------------------------------------------ */
6468
Eric Smith8c663262007-08-25 02:26:07 +00006469#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006470#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006471
Thomas Wouters477c8d52006-05-27 19:21:47 +00006472#include "stringlib/count.h"
6473#include "stringlib/find.h"
6474#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006475#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006476
Eric Smith5807c412008-05-11 21:00:57 +00006477#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006478#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006479#include "stringlib/localeutil.h"
6480
/* helper macro to fixup start/end slice values

   Modifies `start` and `end` in place: `end` is capped at `len`; a
   negative `end` or `start` has `len` added to it and is then floored
   at 0.  `start` is not capped at `len`.

   The expansion is a pair of bare `if` statements (not wrapped in
   do { } while (0)) and mentions each argument several times, so pass
   plain lvalues without side effects and use it only as a full
   statement. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006495
Alexander Belopolsky40018472011-02-26 01:02:56 +00006496Py_ssize_t
6497PyUnicode_Count(PyObject *str,
6498 PyObject *substr,
6499 Py_ssize_t start,
6500 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006502 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006503 PyUnicodeObject* str_obj;
6504 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006505
Thomas Wouters477c8d52006-05-27 19:21:47 +00006506 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6507 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006509 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6510 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 Py_DECREF(str_obj);
6512 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 }
Tim Petersced69f82003-09-16 20:30:58 +00006514
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006515 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006517 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6518 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006519 );
6520
6521 Py_DECREF(sub_obj);
6522 Py_DECREF(str_obj);
6523
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524 return result;
6525}
6526
Alexander Belopolsky40018472011-02-26 01:02:56 +00006527Py_ssize_t
6528PyUnicode_Find(PyObject *str,
6529 PyObject *sub,
6530 Py_ssize_t start,
6531 Py_ssize_t end,
6532 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006534 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006535
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006537 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006539 sub = PyUnicode_FromObject(sub);
6540 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 Py_DECREF(str);
6542 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 }
Tim Petersced69f82003-09-16 20:30:58 +00006544
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545 if (direction > 0)
6546 result = stringlib_find_slice(
6547 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6548 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6549 start, end
6550 );
6551 else
6552 result = stringlib_rfind_slice(
6553 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6554 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6555 start, end
6556 );
6557
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006559 Py_DECREF(sub);
6560
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 return result;
6562}
6563
Alexander Belopolsky40018472011-02-26 01:02:56 +00006564static int
6565tailmatch(PyUnicodeObject *self,
6566 PyUnicodeObject *substring,
6567 Py_ssize_t start,
6568 Py_ssize_t end,
6569 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 if (substring->length == 0)
6572 return 1;
6573
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006574 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 end -= substring->length;
6576 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578
6579 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006580 if (Py_UNICODE_MATCH(self, end, substring))
6581 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 } else {
6583 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 }
6586
6587 return 0;
6588}
6589
Alexander Belopolsky40018472011-02-26 01:02:56 +00006590Py_ssize_t
6591PyUnicode_Tailmatch(PyObject *str,
6592 PyObject *substr,
6593 Py_ssize_t start,
6594 Py_ssize_t end,
6595 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006597 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 str = PyUnicode_FromObject(str);
6600 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 substr = PyUnicode_FromObject(substr);
6603 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 Py_DECREF(str);
6605 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 }
Tim Petersced69f82003-09-16 20:30:58 +00006607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 (PyUnicodeObject *)substr,
6610 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 Py_DECREF(str);
6612 Py_DECREF(substr);
6613 return result;
6614}
6615
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616/* Apply fixfct filter to the Unicode object self and return a
6617 reference to the modified object */
6618
Alexander Belopolsky40018472011-02-26 01:02:56 +00006619static PyObject *
6620fixup(PyUnicodeObject *self,
6621 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622{
6623
6624 PyUnicodeObject *u;
6625
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006626 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006629
6630 Py_UNICODE_COPY(u->str, self->str, self->length);
6631
Tim Peters7a29bd52001-09-12 03:03:31 +00006632 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 /* fixfct should return TRUE if it modified the buffer. If
6634 FALSE, return a reference to the original buffer instead
6635 (to save space, not time) */
6636 Py_INCREF(self);
6637 Py_DECREF(u);
6638 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 }
6640 return (PyObject*) u;
6641}
6642
Alexander Belopolsky40018472011-02-26 01:02:56 +00006643static int
6644fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006646 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 Py_UNICODE *s = self->str;
6648 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006649
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006652
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 ch = Py_UNICODE_TOUPPER(*s);
6654 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 *s = ch;
6657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 s++;
6659 }
6660
6661 return status;
6662}
6663
Alexander Belopolsky40018472011-02-26 01:02:56 +00006664static int
6665fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006667 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 Py_UNICODE *s = self->str;
6669 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006673
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 ch = Py_UNICODE_TOLOWER(*s);
6675 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 *s = ch;
6678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 s++;
6680 }
6681
6682 return status;
6683}
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685static int
6686fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006688 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 Py_UNICODE *s = self->str;
6690 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006691
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 while (len-- > 0) {
6693 if (Py_UNICODE_ISUPPER(*s)) {
6694 *s = Py_UNICODE_TOLOWER(*s);
6695 status = 1;
6696 } else if (Py_UNICODE_ISLOWER(*s)) {
6697 *s = Py_UNICODE_TOUPPER(*s);
6698 status = 1;
6699 }
6700 s++;
6701 }
6702
6703 return status;
6704}
6705
Alexander Belopolsky40018472011-02-26 01:02:56 +00006706static int
6707fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006709 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006710 Py_UNICODE *s = self->str;
6711 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006712
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006713 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006715 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 *s = Py_UNICODE_TOUPPER(*s);
6717 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006719 s++;
6720 while (--len > 0) {
6721 if (Py_UNICODE_ISUPPER(*s)) {
6722 *s = Py_UNICODE_TOLOWER(*s);
6723 status = 1;
6724 }
6725 s++;
6726 }
6727 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728}
6729
Alexander Belopolsky40018472011-02-26 01:02:56 +00006730static int
6731fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732{
6733 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6734 register Py_UNICODE *e;
6735 int previous_is_cased;
6736
6737 /* Shortcut for single character strings */
6738 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6740 if (*p != ch) {
6741 *p = ch;
6742 return 1;
6743 }
6744 else
6745 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 }
Tim Petersced69f82003-09-16 20:30:58 +00006747
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 e = p + PyUnicode_GET_SIZE(self);
6749 previous_is_cased = 0;
6750 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006752
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 if (previous_is_cased)
6754 *p = Py_UNICODE_TOLOWER(ch);
6755 else
6756 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006757
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 if (Py_UNICODE_ISLOWER(ch) ||
6759 Py_UNICODE_ISUPPER(ch) ||
6760 Py_UNICODE_ISTITLE(ch))
6761 previous_is_cased = 1;
6762 else
6763 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 }
6765 return 1;
6766}
6767
Tim Peters8ce9f162004-08-27 01:49:32 +00006768PyObject *
6769PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
Skip Montanaro6543b452004-09-16 03:28:13 +00006771 const Py_UNICODE blank = ' ';
6772 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006773 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006774 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006775 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6776 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006777 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6778 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006779 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006780 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781
Tim Peters05eba1f2004-08-27 21:32:02 +00006782 fseq = PySequence_Fast(seq, "");
6783 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006784 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006785 }
6786
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006787 /* NOTE: the following code can't call back into Python code,
6788 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006789 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006790
Tim Peters05eba1f2004-08-27 21:32:02 +00006791 seqlen = PySequence_Fast_GET_SIZE(fseq);
6792 /* If empty sequence, return u"". */
6793 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006794 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6795 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006796 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006797 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006798 /* If singleton sequence with an exact Unicode, return that. */
6799 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 item = items[0];
6801 if (PyUnicode_CheckExact(item)) {
6802 Py_INCREF(item);
6803 res = (PyUnicodeObject *)item;
6804 goto Done;
6805 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006806 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006807 else {
6808 /* Set up sep and seplen */
6809 if (separator == NULL) {
6810 sep = &blank;
6811 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006812 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006813 else {
6814 if (!PyUnicode_Check(separator)) {
6815 PyErr_Format(PyExc_TypeError,
6816 "separator: expected str instance,"
6817 " %.80s found",
6818 Py_TYPE(separator)->tp_name);
6819 goto onError;
6820 }
6821 sep = PyUnicode_AS_UNICODE(separator);
6822 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006823 }
6824 }
6825
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006826 /* There are at least two things to join, or else we have a subclass
6827 * of str in the sequence.
6828 * Do a pre-pass to figure out the total amount of space we'll
6829 * need (sz), and see whether all argument are strings.
6830 */
6831 sz = 0;
6832 for (i = 0; i < seqlen; i++) {
6833 const Py_ssize_t old_sz = sz;
6834 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 if (!PyUnicode_Check(item)) {
6836 PyErr_Format(PyExc_TypeError,
6837 "sequence item %zd: expected str instance,"
6838 " %.80s found",
6839 i, Py_TYPE(item)->tp_name);
6840 goto onError;
6841 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006842 sz += PyUnicode_GET_SIZE(item);
6843 if (i != 0)
6844 sz += seplen;
6845 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6846 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006848 goto onError;
6849 }
6850 }
Tim Petersced69f82003-09-16 20:30:58 +00006851
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006852 res = _PyUnicode_New(sz);
6853 if (res == NULL)
6854 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006855
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006856 /* Catenate everything. */
6857 res_p = PyUnicode_AS_UNICODE(res);
6858 for (i = 0; i < seqlen; ++i) {
6859 Py_ssize_t itemlen;
6860 item = items[i];
6861 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 /* Copy item, and maybe the separator. */
6863 if (i) {
6864 Py_UNICODE_COPY(res_p, sep, seplen);
6865 res_p += seplen;
6866 }
6867 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6868 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006869 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006870
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006872 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 return (PyObject *)res;
6874
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006876 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006877 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 return NULL;
6879}
6880
Alexander Belopolsky40018472011-02-26 01:02:56 +00006881static PyUnicodeObject *
6882pad(PyUnicodeObject *self,
6883 Py_ssize_t left,
6884 Py_ssize_t right,
6885 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
6887 PyUnicodeObject *u;
6888
6889 if (left < 0)
6890 left = 0;
6891 if (right < 0)
6892 right = 0;
6893
Tim Peters7a29bd52001-09-12 03:03:31 +00006894 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 Py_INCREF(self);
6896 return self;
6897 }
6898
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006899 if (left > PY_SSIZE_T_MAX - self->length ||
6900 right > PY_SSIZE_T_MAX - (left + self->length)) {
6901 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6902 return NULL;
6903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 u = _PyUnicode_New(left + self->length + right);
6905 if (u) {
6906 if (left)
6907 Py_UNICODE_FILL(u->str, fill, left);
6908 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6909 if (right)
6910 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6911 }
6912
6913 return u;
6914}
6915
Alexander Belopolsky40018472011-02-26 01:02:56 +00006916PyObject *
6917PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920
6921 string = PyUnicode_FromObject(string);
6922 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006925 list = stringlib_splitlines(
6926 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6927 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928
6929 Py_DECREF(string);
6930 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931}
6932
Alexander Belopolsky40018472011-02-26 01:02:56 +00006933static PyObject *
6934split(PyUnicodeObject *self,
6935 PyUnicodeObject *substring,
6936 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006939 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006942 return stringlib_split_whitespace(
6943 (PyObject*) self, self->str, self->length, maxcount
6944 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006946 return stringlib_split(
6947 (PyObject*) self, self->str, self->length,
6948 substring->str, substring->length,
6949 maxcount
6950 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951}
6952
Alexander Belopolsky40018472011-02-26 01:02:56 +00006953static PyObject *
6954rsplit(PyUnicodeObject *self,
6955 PyUnicodeObject *substring,
6956 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006957{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006958 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006959 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006960
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006961 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006962 return stringlib_rsplit_whitespace(
6963 (PyObject*) self, self->str, self->length, maxcount
6964 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006965
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006966 return stringlib_rsplit(
6967 (PyObject*) self, self->str, self->length,
6968 substring->str, substring->length,
6969 maxcount
6970 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006971}
6972
Alexander Belopolsky40018472011-02-26 01:02:56 +00006973static PyObject *
6974replace(PyUnicodeObject *self,
6975 PyUnicodeObject *str1,
6976 PyUnicodeObject *str2,
6977 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978{
6979 PyUnicodeObject *u;
6980
6981 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006983 else if (maxcount == 0 || self->length == 0)
6984 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985
Thomas Wouters477c8d52006-05-27 19:21:47 +00006986 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006987 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006988 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006989 if (str1->length == 0)
6990 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006991 if (str1->length == 1) {
6992 /* replace characters */
6993 Py_UNICODE u1, u2;
6994 if (!findchar(self->str, self->length, str1->str[0]))
6995 goto nothing;
6996 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6997 if (!u)
6998 return NULL;
6999 Py_UNICODE_COPY(u->str, self->str, self->length);
7000 u1 = str1->str[0];
7001 u2 = str2->str[0];
7002 for (i = 0; i < u->length; i++)
7003 if (u->str[i] == u1) {
7004 if (--maxcount < 0)
7005 break;
7006 u->str[i] = u2;
7007 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007009 i = stringlib_find(
7010 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007012 if (i < 0)
7013 goto nothing;
7014 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7015 if (!u)
7016 return NULL;
7017 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007018
7019 /* change everything in-place, starting with this one */
7020 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7021 i += str1->length;
7022
7023 while ( --maxcount > 0) {
7024 i = stringlib_find(self->str+i, self->length-i,
7025 str1->str, str1->length,
7026 i);
7027 if (i == -1)
7028 break;
7029 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7030 i += str1->length;
7031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007034
Brett Cannonb94767f2011-02-22 20:15:44 +00007035 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007036 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 Py_UNICODE *p;
7038
7039 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007040 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7041 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007042 if (n == 0)
7043 goto nothing;
7044 /* new_size = self->length + n * (str2->length - str1->length)); */
7045 delta = (str2->length - str1->length);
7046 if (delta == 0) {
7047 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007049 product = n * (str2->length - str1->length);
7050 if ((product / (str2->length - str1->length)) != n) {
7051 PyErr_SetString(PyExc_OverflowError,
7052 "replace string is too long");
7053 return NULL;
7054 }
7055 new_size = self->length + product;
7056 if (new_size < 0) {
7057 PyErr_SetString(PyExc_OverflowError,
7058 "replace string is too long");
7059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 }
7061 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007062 u = _PyUnicode_New(new_size);
7063 if (!u)
7064 return NULL;
7065 i = 0;
7066 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007067 if (str1->length > 0) {
7068 while (n-- > 0) {
7069 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007070 j = stringlib_find(self->str+i, self->length-i,
7071 str1->str, str1->length,
7072 i);
7073 if (j == -1)
7074 break;
7075 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007076 /* copy unchanged part [i:j] */
7077 Py_UNICODE_COPY(p, self->str+i, j-i);
7078 p += j - i;
7079 }
7080 /* copy substitution string */
7081 if (str2->length > 0) {
7082 Py_UNICODE_COPY(p, str2->str, str2->length);
7083 p += str2->length;
7084 }
7085 i = j + str1->length;
7086 }
7087 if (i < self->length)
7088 /* copy tail [i:] */
7089 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7090 } else {
7091 /* interleave */
7092 while (n > 0) {
7093 Py_UNICODE_COPY(p, str2->str, str2->length);
7094 p += str2->length;
7095 if (--n <= 0)
7096 break;
7097 *p++ = self->str[i++];
7098 }
7099 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7100 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007103
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007105 /* nothing to replace; return original string (when possible) */
7106 if (PyUnicode_CheckExact(self)) {
7107 Py_INCREF(self);
7108 return (PyObject *) self;
7109 }
7110 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111}
7112
7113/* --- Unicode Object Methods --------------------------------------------- */
7114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007115PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117\n\
7118Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007119characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
7121static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007122unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 return fixup(self, fixtitle);
7125}
7126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007127PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129\n\
7130Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007131have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
7133static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007134unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 return fixup(self, fixcapitalize);
7137}
7138
7139#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007140PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142\n\
7143Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007144normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145
7146static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007147unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148{
7149 PyObject *list;
7150 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007151 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 /* Split into words */
7154 list = split(self, NULL, -1);
7155 if (!list)
7156 return NULL;
7157
7158 /* Capitalize each word */
7159 for (i = 0; i < PyList_GET_SIZE(list); i++) {
7160 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 if (item == NULL)
7163 goto onError;
7164 Py_DECREF(PyList_GET_ITEM(list, i));
7165 PyList_SET_ITEM(list, i, item);
7166 }
7167
7168 /* Join the words to form a new string */
7169 item = PyUnicode_Join(NULL, list);
7170
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 Py_DECREF(list);
7173 return (PyObject *)item;
7174}
7175#endif
7176
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007177/* Argument converter. Coerces to a single unicode character */
7178
7179static int
7180convert_uc(PyObject *obj, void *addr)
7181{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007182 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7183 PyObject *uniobj;
7184 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007185
Benjamin Peterson14339b62009-01-31 16:36:08 +00007186 uniobj = PyUnicode_FromObject(obj);
7187 if (uniobj == NULL) {
7188 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007190 return 0;
7191 }
7192 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7193 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007195 Py_DECREF(uniobj);
7196 return 0;
7197 }
7198 unistr = PyUnicode_AS_UNICODE(uniobj);
7199 *fillcharloc = unistr[0];
7200 Py_DECREF(uniobj);
7201 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007202}
7203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007204PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007207Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007208done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209
7210static PyObject *
7211unicode_center(PyUnicodeObject *self, PyObject *args)
7212{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007213 Py_ssize_t marg, left;
7214 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007215 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216
Thomas Woutersde017742006-02-16 19:34:37 +00007217 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 return NULL;
7219
Tim Peters7a29bd52001-09-12 03:03:31 +00007220 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 Py_INCREF(self);
7222 return (PyObject*) self;
7223 }
7224
7225 marg = width - self->length;
7226 left = marg / 2 + (marg & width & 1);
7227
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007228 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229}
7230
Marc-André Lemburge5034372000-08-08 08:04:29 +00007231#if 0
7232
7233/* This code should go into some future Unicode collation support
7234 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007235 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007236
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007237/* speedy UTF-16 code point order comparison */
7238/* gleaned from: */
7239/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7240
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007241static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007242{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007243 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007244 0, 0, 0, 0, 0, 0, 0, 0,
7245 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007246 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007247};
7248
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249static int
7250unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7251{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007252 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007253
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 Py_UNICODE *s1 = str1->str;
7255 Py_UNICODE *s2 = str2->str;
7256
7257 len1 = str1->length;
7258 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007261 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007262
7263 c1 = *s1++;
7264 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007265
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 if (c1 > (1<<11) * 26)
7267 c1 += utf16Fixup[c1>>11];
7268 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007269 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007270 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007271
7272 if (c1 != c2)
7273 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007274
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007275 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 }
7277
7278 return (len1 < len2) ? -1 : (len1 != len2);
7279}
7280
Marc-André Lemburge5034372000-08-08 08:04:29 +00007281#else
7282
7283static int
7284unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7285{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007286 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007287
7288 Py_UNICODE *s1 = str1->str;
7289 Py_UNICODE *s2 = str2->str;
7290
7291 len1 = str1->length;
7292 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007293
Marc-André Lemburge5034372000-08-08 08:04:29 +00007294 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007295 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007296
Fredrik Lundh45714e92001-06-26 16:39:36 +00007297 c1 = *s1++;
7298 c2 = *s2++;
7299
7300 if (c1 != c2)
7301 return (c1 < c2) ? -1 : 1;
7302
Marc-André Lemburge5034372000-08-08 08:04:29 +00007303 len1--; len2--;
7304 }
7305
7306 return (len1 < len2) ? -1 : (len1 != len2);
7307}
7308
7309#endif
7310
Alexander Belopolsky40018472011-02-26 01:02:56 +00007311int
7312PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007314 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7315 return unicode_compare((PyUnicodeObject *)left,
7316 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007317 PyErr_Format(PyExc_TypeError,
7318 "Can't compare %.100s and %.100s",
7319 left->ob_type->tp_name,
7320 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 return -1;
7322}
7323
Martin v. Löwis5b222132007-06-10 09:51:05 +00007324int
7325PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7326{
7327 int i;
7328 Py_UNICODE *id;
7329 assert(PyUnicode_Check(uni));
7330 id = PyUnicode_AS_UNICODE(uni);
7331 /* Compare Unicode string and source character set string */
7332 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 if (id[i] != str[i])
7334 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007335 /* This check keeps Python strings that end in '\0' from comparing equal
7336 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007337 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007339 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007341 return 0;
7342}
7343
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007344
/* Map a C truth value to the matching Python bool singleton.  The result
   is a borrowed reference. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007347
Alexander Belopolsky40018472011-02-26 01:02:56 +00007348PyObject *
7349PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007350{
7351 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007352
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007353 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7354 PyObject *v;
7355 if (((PyUnicodeObject *) left)->length !=
7356 ((PyUnicodeObject *) right)->length) {
7357 if (op == Py_EQ) {
7358 Py_INCREF(Py_False);
7359 return Py_False;
7360 }
7361 if (op == Py_NE) {
7362 Py_INCREF(Py_True);
7363 return Py_True;
7364 }
7365 }
7366 if (left == right)
7367 result = 0;
7368 else
7369 result = unicode_compare((PyUnicodeObject *)left,
7370 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007371
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007372 /* Convert the return value to a Boolean */
7373 switch (op) {
7374 case Py_EQ:
7375 v = TEST_COND(result == 0);
7376 break;
7377 case Py_NE:
7378 v = TEST_COND(result != 0);
7379 break;
7380 case Py_LE:
7381 v = TEST_COND(result <= 0);
7382 break;
7383 case Py_GE:
7384 v = TEST_COND(result >= 0);
7385 break;
7386 case Py_LT:
7387 v = TEST_COND(result == -1);
7388 break;
7389 case Py_GT:
7390 v = TEST_COND(result == 1);
7391 break;
7392 default:
7393 PyErr_BadArgument();
7394 return NULL;
7395 }
7396 Py_INCREF(v);
7397 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007398 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007399
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007400 Py_INCREF(Py_NotImplemented);
7401 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007402}
7403
Alexander Belopolsky40018472011-02-26 01:02:56 +00007404int
7405PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007406{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007407 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007408 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007409
7410 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007411 sub = PyUnicode_FromObject(element);
7412 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 PyErr_Format(PyExc_TypeError,
7414 "'in <string>' requires string as left operand, not %s",
7415 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007416 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007417 }
7418
Thomas Wouters477c8d52006-05-27 19:21:47 +00007419 str = PyUnicode_FromObject(container);
7420 if (!str) {
7421 Py_DECREF(sub);
7422 return -1;
7423 }
7424
7425 result = stringlib_contains_obj(str, sub);
7426
7427 Py_DECREF(str);
7428 Py_DECREF(sub);
7429
Guido van Rossum403d68b2000-03-13 15:55:09 +00007430 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007431}
7432
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433/* Concat to string or Unicode object giving a new Unicode object. */
7434
Alexander Belopolsky40018472011-02-26 01:02:56 +00007435PyObject *
7436PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437{
7438 PyUnicodeObject *u = NULL, *v = NULL, *w;
7439
7440 /* Coerce the two arguments */
7441 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7442 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7445 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447
7448 /* Shortcuts */
7449 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 Py_DECREF(v);
7451 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452 }
7453 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 Py_DECREF(u);
7455 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 }
7457
7458 /* Concat the two Unicode strings */
7459 w = _PyUnicode_New(u->length + v->length);
7460 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 Py_UNICODE_COPY(w->str, u->str, u->length);
7463 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7464
7465 Py_DECREF(u);
7466 Py_DECREF(v);
7467 return (PyObject *)w;
7468
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 Py_XDECREF(u);
7471 Py_XDECREF(v);
7472 return NULL;
7473}
7474
Walter Dörwald1ab83302007-05-18 17:15:44 +00007475void
7476PyUnicode_Append(PyObject **pleft, PyObject *right)
7477{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007478 PyObject *new;
7479 if (*pleft == NULL)
7480 return;
7481 if (right == NULL || !PyUnicode_Check(*pleft)) {
7482 Py_DECREF(*pleft);
7483 *pleft = NULL;
7484 return;
7485 }
7486 new = PyUnicode_Concat(*pleft, right);
7487 Py_DECREF(*pleft);
7488 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007489}
7490
7491void
7492PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7493{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007494 PyUnicode_Append(pleft, right);
7495 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007496}
7497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007498PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007501Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007502string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504
7505static PyObject *
7506unicode_count(PyUnicodeObject *self, PyObject *args)
7507{
7508 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007509 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007510 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 PyObject *result;
7512
Guido van Rossumb8872e62000-05-09 14:14:27 +00007513 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 return NULL;
7516
7517 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007518 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007521
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007522 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007523 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007524 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007525 substring->str, substring->length,
7526 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007527 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
7529 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007530
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 return result;
7532}
7533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007534PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007535 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007537Encode S using the codec registered for encoding. Default encoding\n\
7538is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007539handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007540a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7541'xmlcharrefreplace' as well as any other name registered with\n\
7542codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
7544static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007545unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007547 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 char *encoding = NULL;
7549 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007550
Benjamin Peterson308d6372009-09-18 21:42:35 +00007551 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7552 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007554 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007555}
7556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007557PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559\n\
7560Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007561If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562
7563static PyObject*
7564unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7565{
7566 Py_UNICODE *e;
7567 Py_UNICODE *p;
7568 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007569 Py_UNICODE *qe;
7570 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 PyUnicodeObject *u;
7572 int tabsize = 8;
7573
7574 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
Thomas Wouters7e474022000-07-16 12:04:32 +00007577 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007578 i = 0; /* chars up to and including most recent \n or \r */
7579 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7580 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 for (p = self->str; p < e; p++)
7582 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 if (tabsize > 0) {
7584 incr = tabsize - (j % tabsize); /* cannot overflow */
7585 if (j > PY_SSIZE_T_MAX - incr)
7586 goto overflow1;
7587 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007588 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 if (j > PY_SSIZE_T_MAX - 1)
7592 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 j++;
7594 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 if (i > PY_SSIZE_T_MAX - j)
7596 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007598 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 }
7600 }
7601
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007602 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007604
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 /* Second pass: create output string and fill it */
7606 u = _PyUnicode_New(i + j);
7607 if (!u)
7608 return NULL;
7609
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007610 j = 0; /* same as in first pass */
7611 q = u->str; /* next output char */
7612 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613
7614 for (p = self->str; p < e; p++)
7615 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 if (tabsize > 0) {
7617 i = tabsize - (j % tabsize);
7618 j += i;
7619 while (i--) {
7620 if (q >= qe)
7621 goto overflow2;
7622 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 else {
7627 if (q >= qe)
7628 goto overflow2;
7629 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007630 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631 if (*p == '\n' || *p == '\r')
7632 j = 0;
7633 }
7634
7635 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007636
7637 overflow2:
7638 Py_DECREF(u);
7639 overflow1:
7640 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7641 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642}
7643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007644PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646\n\
7647Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007648such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649arguments start and end are interpreted as in slice notation.\n\
7650\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007651Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652
7653static PyObject *
7654unicode_find(PyUnicodeObject *self, PyObject *args)
7655{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007656 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007657 Py_ssize_t start;
7658 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007659 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
Christian Heimes9cd17752007-11-18 19:35:23 +00007661 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663
Thomas Wouters477c8d52006-05-27 19:21:47 +00007664 result = stringlib_find_slice(
7665 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7666 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7667 start, end
7668 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
7670 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007671
Christian Heimes217cfd12007-12-02 14:31:20 +00007672 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673}
7674
7675static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007676unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677{
7678 if (index < 0 || index >= self->length) {
7679 PyErr_SetString(PyExc_IndexError, "string index out of range");
7680 return NULL;
7681 }
7682
7683 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7684}
7685
Guido van Rossumc2504932007-09-18 19:42:40 +00007686/* Believe it or not, this produces the same value for ASCII strings
7687 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007688static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007689unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690{
Guido van Rossumc2504932007-09-18 19:42:40 +00007691 Py_ssize_t len;
7692 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007693 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007694
7695 if (self->hash != -1)
7696 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007697 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007698 p = self->str;
7699 x = *p << 7;
7700 while (--len >= 0)
7701 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007702 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007703 if (x == -1)
7704 x = -2;
7705 self->hash = x;
7706 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707}
7708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007709PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007712Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713
7714static PyObject *
7715unicode_index(PyUnicodeObject *self, PyObject *args)
7716{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007717 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007718 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007719 Py_ssize_t start;
7720 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721
Christian Heimes9cd17752007-11-18 19:35:23 +00007722 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724
Thomas Wouters477c8d52006-05-27 19:21:47 +00007725 result = stringlib_find_slice(
7726 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7727 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7728 start, end
7729 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
7731 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007732
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 if (result < 0) {
7734 PyErr_SetString(PyExc_ValueError, "substring not found");
7735 return NULL;
7736 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007737
Christian Heimes217cfd12007-12-02 14:31:20 +00007738 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739}
7740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007744Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007745at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746
7747static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007748unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749{
7750 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7751 register const Py_UNICODE *e;
7752 int cased;
7753
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 /* Shortcut for single character strings */
7755 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007758 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007759 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007761
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762 e = p + PyUnicode_GET_SIZE(self);
7763 cased = 0;
7764 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007766
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7768 return PyBool_FromLong(0);
7769 else if (!cased && Py_UNICODE_ISLOWER(ch))
7770 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007772 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773}
7774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007775PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007778Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007779at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780
7781static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007782unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783{
7784 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7785 register const Py_UNICODE *e;
7786 int cased;
7787
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788 /* Shortcut for single character strings */
7789 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007792 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007793 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007795
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 e = p + PyUnicode_GET_SIZE(self);
7797 cased = 0;
7798 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007800
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7802 return PyBool_FromLong(0);
7803 else if (!cased && Py_UNICODE_ISUPPER(ch))
7804 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007806 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807}
7808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007809PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007812Return True if S is a titlecased string and there is at least one\n\
7813character in S, i.e. upper- and titlecase characters may only\n\
7814follow uncased characters and lowercase characters only cased ones.\n\
7815Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816
7817static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007818unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819{
7820 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7821 register const Py_UNICODE *e;
7822 int cased, previous_is_cased;
7823
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824 /* Shortcut for single character strings */
7825 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7827 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007829 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007830 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007832
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 e = p + PyUnicode_GET_SIZE(self);
7834 cased = 0;
7835 previous_is_cased = 0;
7836 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007838
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7840 if (previous_is_cased)
7841 return PyBool_FromLong(0);
7842 previous_is_cased = 1;
7843 cased = 1;
7844 }
7845 else if (Py_UNICODE_ISLOWER(ch)) {
7846 if (!previous_is_cased)
7847 return PyBool_FromLong(0);
7848 previous_is_cased = 1;
7849 cased = 1;
7850 }
7851 else
7852 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007854 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855}
7856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007857PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007860Return True if all characters in S are whitespace\n\
7861and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862
7863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007864unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865{
7866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7867 register const Py_UNICODE *e;
7868
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 /* Shortcut for single character strings */
7870 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 Py_UNICODE_ISSPACE(*p))
7872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007874 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007875 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007877
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 e = p + PyUnicode_GET_SIZE(self);
7879 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 if (!Py_UNICODE_ISSPACE(*p))
7881 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884}
7885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007886PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007888\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007889Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007890and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007891
7892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007893unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007894{
7895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7896 register const Py_UNICODE *e;
7897
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007898 /* Shortcut for single character strings */
7899 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 Py_UNICODE_ISALPHA(*p))
7901 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007902
7903 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007904 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007906
7907 e = p + PyUnicode_GET_SIZE(self);
7908 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 if (!Py_UNICODE_ISALPHA(*p))
7910 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007911 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007912 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007913}
7914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007915PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007917\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007918Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007919and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007920
7921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007922unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007923{
7924 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7925 register const Py_UNICODE *e;
7926
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007927 /* Shortcut for single character strings */
7928 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 Py_UNICODE_ISALNUM(*p))
7930 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007931
7932 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007933 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007935
7936 e = p + PyUnicode_GET_SIZE(self);
7937 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 if (!Py_UNICODE_ISALNUM(*p))
7939 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007940 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007941 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007942}
7943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007944PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007947Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007948False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949
7950static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007951unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952{
7953 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7954 register const Py_UNICODE *e;
7955
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 /* Shortcut for single character strings */
7957 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 Py_UNICODE_ISDECIMAL(*p))
7959 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007961 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007962 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007964
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 e = p + PyUnicode_GET_SIZE(self);
7966 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 if (!Py_UNICODE_ISDECIMAL(*p))
7968 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007970 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971}
7972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007973PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007976Return True if all characters in S are digits\n\
7977and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978
7979static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007980unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981{
7982 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7983 register const Py_UNICODE *e;
7984
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 /* Shortcut for single character strings */
7986 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 Py_UNICODE_ISDIGIT(*p))
7988 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007990 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007991 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007993
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 e = p + PyUnicode_GET_SIZE(self);
7995 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 if (!Py_UNICODE_ISDIGIT(*p))
7997 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007999 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000}
8001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008002PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008005Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008006False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007
8008static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008009unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010{
8011 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8012 register const Py_UNICODE *e;
8013
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 /* Shortcut for single character strings */
8015 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 Py_UNICODE_ISNUMERIC(*p))
8017 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008019 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008020 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008022
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 e = p + PyUnicode_GET_SIZE(self);
8024 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 if (!Py_UNICODE_ISNUMERIC(*p))
8026 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008028 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029}
8030
Martin v. Löwis47383402007-08-15 07:32:56 +00008031int
8032PyUnicode_IsIdentifier(PyObject *self)
8033{
8034 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8035 register const Py_UNICODE *e;
8036
8037 /* Special case for empty strings */
8038 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008040
8041 /* PEP 3131 says that the first character must be in
8042 XID_Start and subsequent characters in XID_Continue,
8043 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008045 letters, digits, underscore). However, given the current
8046 definition of XID_Start and XID_Continue, it is sufficient
8047 to check just for these, except that _ must be allowed
8048 as starting an identifier. */
8049 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8050 return 0;
8051
8052 e = p + PyUnicode_GET_SIZE(self);
8053 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 if (!_PyUnicode_IsXidContinue(*p))
8055 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008056 }
8057 return 1;
8058}
8059
8060PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008062\n\
8063Return True if S is a valid identifier according\n\
8064to the language definition.");
8065
8066static PyObject*
8067unicode_isidentifier(PyObject *self)
8068{
8069 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8070}
8071
Georg Brandl559e5d72008-06-11 18:37:52 +00008072PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008074\n\
8075Return True if all characters in S are considered\n\
8076printable in repr() or S is empty, False otherwise.");
8077
8078static PyObject*
8079unicode_isprintable(PyObject *self)
8080{
8081 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8082 register const Py_UNICODE *e;
8083
8084 /* Shortcut for single character strings */
8085 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8086 Py_RETURN_TRUE;
8087 }
8088
8089 e = p + PyUnicode_GET_SIZE(self);
8090 for (; p < e; p++) {
8091 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8092 Py_RETURN_FALSE;
8093 }
8094 }
8095 Py_RETURN_TRUE;
8096}
8097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008098PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008099 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100\n\
8101Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008102iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103
8104static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008105unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008107 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108}
8109
Martin v. Löwis18e16552006-02-15 17:27:45 +00008110static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111unicode_length(PyUnicodeObject *self)
8112{
8113 return self->length;
8114}
8115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008116PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008119Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008120done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121
8122static PyObject *
8123unicode_ljust(PyUnicodeObject *self, PyObject *args)
8124{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008125 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008126 Py_UNICODE fillchar = ' ';
8127
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008128 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 return NULL;
8130
Tim Peters7a29bd52001-09-12 03:03:31 +00008131 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132 Py_INCREF(self);
8133 return (PyObject*) self;
8134 }
8135
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008136 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137}
8138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008139PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008142Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143
8144static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008145unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 return fixup(self, fixlower);
8148}
8149
/* Strip types: which end(s) of the string a strip operation trims */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip types above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string with its leading
   three characters ("|O:") skipped */
#define STRIPNAME(i) (stripformat[i]+3)
8158
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008159/* externally visible for str.strip(unicode) */
8160PyObject *
8161_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8162{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008163 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8164 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8165 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8166 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8167 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008168
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008170
Benjamin Peterson14339b62009-01-31 16:36:08 +00008171 i = 0;
8172 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8174 i++;
8175 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008176 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008177
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 j = len;
8179 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 do {
8181 j--;
8182 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8183 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008184 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008185
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 Py_INCREF(self);
8188 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008189 }
8190 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008192}
8193
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194
8195static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008196do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008198 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8199 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008200
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 i = 0;
8202 if (striptype != RIGHTSTRIP) {
8203 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8204 i++;
8205 }
8206 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008207
Benjamin Peterson14339b62009-01-31 16:36:08 +00008208 j = len;
8209 if (striptype != LEFTSTRIP) {
8210 do {
8211 j--;
8212 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8213 j++;
8214 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008215
Benjamin Peterson14339b62009-01-31 16:36:08 +00008216 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8217 Py_INCREF(self);
8218 return (PyObject*)self;
8219 }
8220 else
8221 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222}
8223
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008224
8225static PyObject *
8226do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8227{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008228 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008229
Benjamin Peterson14339b62009-01-31 16:36:08 +00008230 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8231 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008232
Benjamin Peterson14339b62009-01-31 16:36:08 +00008233 if (sep != NULL && sep != Py_None) {
8234 if (PyUnicode_Check(sep))
8235 return _PyUnicode_XStrip(self, striptype, sep);
8236 else {
8237 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 "%s arg must be None or str",
8239 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008240 return NULL;
8241 }
8242 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008243
Benjamin Peterson14339b62009-01-31 16:36:08 +00008244 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008245}
8246
8247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008248PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008250\n\
8251Return a copy of the string S with leading and trailing\n\
8252whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008253If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008254
8255static PyObject *
8256unicode_strip(PyUnicodeObject *self, PyObject *args)
8257{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008258 if (PyTuple_GET_SIZE(args) == 0)
8259 return do_strip(self, BOTHSTRIP); /* Common case */
8260 else
8261 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008262}
8263
8264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008265PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008267\n\
8268Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008269If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008270
8271static PyObject *
8272unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8273{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008274 if (PyTuple_GET_SIZE(args) == 0)
8275 return do_strip(self, LEFTSTRIP); /* Common case */
8276 else
8277 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008278}
8279
8280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008281PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008283\n\
8284Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008285If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008286
8287static PyObject *
8288unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8289{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008290 if (PyTuple_GET_SIZE(args) == 0)
8291 return do_strip(self, RIGHTSTRIP); /* Common case */
8292 else
8293 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008294}
8295
8296
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008298unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299{
8300 PyUnicodeObject *u;
8301 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008302 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008303 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
Georg Brandl222de0f2009-04-12 12:01:50 +00008305 if (len < 1) {
8306 Py_INCREF(unicode_empty);
8307 return (PyObject *)unicode_empty;
8308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309
Tim Peters7a29bd52001-09-12 03:03:31 +00008310 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 /* no repeat, return original string */
8312 Py_INCREF(str);
8313 return (PyObject*) str;
8314 }
Tim Peters8f422462000-09-09 06:13:41 +00008315
8316 /* ensure # of chars needed doesn't overflow int and # of bytes
8317 * needed doesn't overflow size_t
8318 */
8319 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008320 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008321 PyErr_SetString(PyExc_OverflowError,
8322 "repeated string is too long");
8323 return NULL;
8324 }
8325 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8326 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8327 PyErr_SetString(PyExc_OverflowError,
8328 "repeated string is too long");
8329 return NULL;
8330 }
8331 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 if (!u)
8333 return NULL;
8334
8335 p = u->str;
8336
Georg Brandl222de0f2009-04-12 12:01:50 +00008337 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008338 Py_UNICODE_FILL(p, str->str[0], len);
8339 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008340 Py_ssize_t done = str->length; /* number of characters copied this far */
8341 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008343 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008344 Py_UNICODE_COPY(p+done, p, n);
8345 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 }
8348
8349 return (PyObject*) u;
8350}
8351
Alexander Belopolsky40018472011-02-26 01:02:56 +00008352PyObject *
8353PyUnicode_Replace(PyObject *obj,
8354 PyObject *subobj,
8355 PyObject *replobj,
8356 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357{
8358 PyObject *self;
8359 PyObject *str1;
8360 PyObject *str2;
8361 PyObject *result;
8362
8363 self = PyUnicode_FromObject(obj);
8364 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 str1 = PyUnicode_FromObject(subobj);
8367 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 Py_DECREF(self);
8369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 }
8371 str2 = PyUnicode_FromObject(replobj);
8372 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 Py_DECREF(self);
8374 Py_DECREF(str1);
8375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 }
Tim Petersced69f82003-09-16 20:30:58 +00008377 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 (PyUnicodeObject *)str1,
8379 (PyUnicodeObject *)str2,
8380 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 Py_DECREF(self);
8382 Py_DECREF(str1);
8383 Py_DECREF(str2);
8384 return result;
8385}
8386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008387PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008388 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389\n\
8390Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008391old replaced by new. If the optional argument count is\n\
8392given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393
8394static PyObject*
8395unicode_replace(PyUnicodeObject *self, PyObject *args)
8396{
8397 PyUnicodeObject *str1;
8398 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008399 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 PyObject *result;
8401
Martin v. Löwis18e16552006-02-15 17:27:45 +00008402 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 return NULL;
8404 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8405 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008408 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 Py_DECREF(str1);
8410 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412
8413 result = replace(self, str1, str2, maxcount);
8414
8415 Py_DECREF(str1);
8416 Py_DECREF(str2);
8417 return result;
8418}
8419
Alexander Belopolsky40018472011-02-26 01:02:56 +00008420static PyObject *
8421unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008423 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008424 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008425 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8426 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8427
8428 /* XXX(nnorwitz): rather than over-allocating, it would be
8429 better to choose a different scheme. Perhaps scan the
8430 first N-chars of the string and allocate based on that size.
8431 */
8432 /* Initial allocation is based on the longest-possible unichr
8433 escape.
8434
8435 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8436 unichr, so in this case it's the longest unichr escape. In
8437 narrow (UTF-16) builds this is five chars per source unichr
8438 since there are two unichrs in the surrogate pair, so in narrow
8439 (UTF-16) builds it's not the longest unichr escape.
8440
8441 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8442 so in the narrow (UTF-16) build case it's the longest unichr
8443 escape.
8444 */
8445
Walter Dörwald1ab83302007-05-18 17:15:44 +00008446 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008448#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008450#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008452#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008454 if (repr == NULL)
8455 return NULL;
8456
Walter Dörwald1ab83302007-05-18 17:15:44 +00008457 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008458
8459 /* Add quote */
8460 *p++ = (findchar(s, size, '\'') &&
8461 !findchar(s, size, '"')) ? '"' : '\'';
8462 while (size-- > 0) {
8463 Py_UNICODE ch = *s++;
8464
8465 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008466 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008467 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008468 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008469 continue;
8470 }
8471
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008473 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008474 *p++ = '\\';
8475 *p++ = 't';
8476 }
8477 else if (ch == '\n') {
8478 *p++ = '\\';
8479 *p++ = 'n';
8480 }
8481 else if (ch == '\r') {
8482 *p++ = '\\';
8483 *p++ = 'r';
8484 }
8485
8486 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008487 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008488 *p++ = '\\';
8489 *p++ = 'x';
8490 *p++ = hexdigits[(ch >> 4) & 0x000F];
8491 *p++ = hexdigits[ch & 0x000F];
8492 }
8493
Georg Brandl559e5d72008-06-11 18:37:52 +00008494 /* Copy ASCII characters as-is */
8495 else if (ch < 0x7F) {
8496 *p++ = ch;
8497 }
8498
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008500 else {
8501 Py_UCS4 ucs = ch;
8502
8503#ifndef Py_UNICODE_WIDE
8504 Py_UNICODE ch2 = 0;
8505 /* Get code point from surrogate pair */
8506 if (size > 0) {
8507 ch2 = *s;
8508 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008510 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008512 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008513 size--;
8514 }
8515 }
8516#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008518 (categories Z* and C* except ASCII space)
8519 */
8520 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8521 /* Map 8-bit characters to '\xhh' */
8522 if (ucs <= 0xff) {
8523 *p++ = '\\';
8524 *p++ = 'x';
8525 *p++ = hexdigits[(ch >> 4) & 0x000F];
8526 *p++ = hexdigits[ch & 0x000F];
8527 }
8528 /* Map 21-bit characters to '\U00xxxxxx' */
8529 else if (ucs >= 0x10000) {
8530 *p++ = '\\';
8531 *p++ = 'U';
8532 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8533 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8534 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8535 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8536 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8537 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8538 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8539 *p++ = hexdigits[ucs & 0x0000000F];
8540 }
8541 /* Map 16-bit characters to '\uxxxx' */
8542 else {
8543 *p++ = '\\';
8544 *p++ = 'u';
8545 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8546 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8547 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8548 *p++ = hexdigits[ucs & 0x000F];
8549 }
8550 }
8551 /* Copy characters as-is */
8552 else {
8553 *p++ = ch;
8554#ifndef Py_UNICODE_WIDE
8555 if (ucs >= 0x10000)
8556 *p++ = ch2;
8557#endif
8558 }
8559 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008560 }
8561 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008562 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008563
8564 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008565 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008566 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567}
8568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008569PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571\n\
8572Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008573such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574arguments start and end are interpreted as in slice notation.\n\
8575\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008576Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577
8578static PyObject *
8579unicode_rfind(PyUnicodeObject *self, PyObject *args)
8580{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008581 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008582 Py_ssize_t start;
8583 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008584 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
Christian Heimes9cd17752007-11-18 19:35:23 +00008586 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Thomas Wouters477c8d52006-05-27 19:21:47 +00008589 result = stringlib_rfind_slice(
8590 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8591 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8592 start, end
8593 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594
8595 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008596
Christian Heimes217cfd12007-12-02 14:31:20 +00008597 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598}
8599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008600PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008603Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604
8605static PyObject *
8606unicode_rindex(PyUnicodeObject *self, PyObject *args)
8607{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008608 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008609 Py_ssize_t start;
8610 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008611 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612
Christian Heimes9cd17752007-11-18 19:35:23 +00008613 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008614 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615
Thomas Wouters477c8d52006-05-27 19:21:47 +00008616 result = stringlib_rfind_slice(
8617 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8618 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8619 start, end
8620 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621
8622 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008623
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 if (result < 0) {
8625 PyErr_SetString(PyExc_ValueError, "substring not found");
8626 return NULL;
8627 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008628 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629}
8630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008631PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008634Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008635done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636
8637static PyObject *
8638unicode_rjust(PyUnicodeObject *self, PyObject *args)
8639{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008640 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008641 Py_UNICODE fillchar = ' ';
8642
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008643 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 return NULL;
8645
Tim Peters7a29bd52001-09-12 03:03:31 +00008646 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647 Py_INCREF(self);
8648 return (PyObject*) self;
8649 }
8650
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008651 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652}
8653
Alexander Belopolsky40018472011-02-26 01:02:56 +00008654PyObject *
8655PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656{
8657 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008658
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 s = PyUnicode_FromObject(s);
8660 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008661 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 if (sep != NULL) {
8663 sep = PyUnicode_FromObject(sep);
8664 if (sep == NULL) {
8665 Py_DECREF(s);
8666 return NULL;
8667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 }
8669
8670 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8671
8672 Py_DECREF(s);
8673 Py_XDECREF(sep);
8674 return result;
8675}
8676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008677PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679\n\
8680Return a list of the words in S, using sep as the\n\
8681delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008682splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008683whitespace string is a separator and empty strings are\n\
8684removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685
8686static PyObject*
8687unicode_split(PyUnicodeObject *self, PyObject *args)
8688{
8689 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008690 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691
Martin v. Löwis18e16552006-02-15 17:27:45 +00008692 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 return NULL;
8694
8695 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701}
8702
Thomas Wouters477c8d52006-05-27 19:21:47 +00008703PyObject *
8704PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8705{
8706 PyObject* str_obj;
8707 PyObject* sep_obj;
8708 PyObject* out;
8709
8710 str_obj = PyUnicode_FromObject(str_in);
8711 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008713 sep_obj = PyUnicode_FromObject(sep_in);
8714 if (!sep_obj) {
8715 Py_DECREF(str_obj);
8716 return NULL;
8717 }
8718
8719 out = stringlib_partition(
8720 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8721 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8722 );
8723
8724 Py_DECREF(sep_obj);
8725 Py_DECREF(str_obj);
8726
8727 return out;
8728}
8729
8730
8731PyObject *
8732PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8733{
8734 PyObject* str_obj;
8735 PyObject* sep_obj;
8736 PyObject* out;
8737
8738 str_obj = PyUnicode_FromObject(str_in);
8739 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008741 sep_obj = PyUnicode_FromObject(sep_in);
8742 if (!sep_obj) {
8743 Py_DECREF(str_obj);
8744 return NULL;
8745 }
8746
8747 out = stringlib_rpartition(
8748 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8749 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8750 );
8751
8752 Py_DECREF(sep_obj);
8753 Py_DECREF(str_obj);
8754
8755 return out;
8756}
8757
8758PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008760\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008761Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008762the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008763found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008764
8765static PyObject*
8766unicode_partition(PyUnicodeObject *self, PyObject *separator)
8767{
8768 return PyUnicode_Partition((PyObject *)self, separator);
8769}
8770
8771PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008772 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008773\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008774Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008775the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008776separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008777
8778static PyObject*
8779unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8780{
8781 return PyUnicode_RPartition((PyObject *)self, separator);
8782}
8783
Alexander Belopolsky40018472011-02-26 01:02:56 +00008784PyObject *
8785PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008786{
8787 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008788
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008789 s = PyUnicode_FromObject(s);
8790 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008791 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 if (sep != NULL) {
8793 sep = PyUnicode_FromObject(sep);
8794 if (sep == NULL) {
8795 Py_DECREF(s);
8796 return NULL;
8797 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008798 }
8799
8800 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8801
8802 Py_DECREF(s);
8803 Py_XDECREF(sep);
8804 return result;
8805}
8806
8807PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008809\n\
8810Return a list of the words in S, using sep as the\n\
8811delimiter string, starting at the end of the string and\n\
8812working to the front. If maxsplit is given, at most maxsplit\n\
8813splits are done. If sep is not specified, any whitespace string\n\
8814is a separator.");
8815
8816static PyObject*
8817unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8818{
8819 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008820 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008821
Martin v. Löwis18e16552006-02-15 17:27:45 +00008822 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008823 return NULL;
8824
8825 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008827 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008829 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008831}
8832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008833PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835\n\
8836Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008837Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008838is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839
8840static PyObject*
8841unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8842{
Guido van Rossum86662912000-04-11 15:38:46 +00008843 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844
Guido van Rossum86662912000-04-11 15:38:46 +00008845 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 return NULL;
8847
Guido van Rossum86662912000-04-11 15:38:46 +00008848 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849}
8850
8851static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008852PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853{
Walter Dörwald346737f2007-05-31 10:44:43 +00008854 if (PyUnicode_CheckExact(self)) {
8855 Py_INCREF(self);
8856 return self;
8857 } else
8858 /* Subtype -- return genuine unicode string with the same value. */
8859 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8860 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861}
8862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008863PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865\n\
8866Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008867and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868
8869static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008870unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 return fixup(self, fixswapcase);
8873}
8874
Georg Brandlceee0772007-11-27 23:48:05 +00008875PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008877\n\
8878Return a translation table usable for str.translate().\n\
8879If there is only one argument, it must be a dictionary mapping Unicode\n\
8880ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008881Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008882If there are two arguments, they must be strings of equal length, and\n\
8883in the resulting dictionary, each character in x will be mapped to the\n\
8884character at the same position in y. If there is a third argument, it\n\
8885must be a string, whose characters will be mapped to None in the result.");
8886
8887static PyObject*
8888unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8889{
8890 PyObject *x, *y = NULL, *z = NULL;
8891 PyObject *new = NULL, *key, *value;
8892 Py_ssize_t i = 0;
8893 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008894
Georg Brandlceee0772007-11-27 23:48:05 +00008895 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8896 return NULL;
8897 new = PyDict_New();
8898 if (!new)
8899 return NULL;
8900 if (y != NULL) {
8901 /* x must be a string too, of equal length */
8902 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8903 if (!PyUnicode_Check(x)) {
8904 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8905 "be a string if there is a second argument");
8906 goto err;
8907 }
8908 if (PyUnicode_GET_SIZE(x) != ylen) {
8909 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8910 "arguments must have equal length");
8911 goto err;
8912 }
8913 /* create entries for translating chars in x to those in y */
8914 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008915 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8916 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008917 if (!key || !value)
8918 goto err;
8919 res = PyDict_SetItem(new, key, value);
8920 Py_DECREF(key);
8921 Py_DECREF(value);
8922 if (res < 0)
8923 goto err;
8924 }
8925 /* create entries for deleting chars in z */
8926 if (z != NULL) {
8927 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008928 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008929 if (!key)
8930 goto err;
8931 res = PyDict_SetItem(new, key, Py_None);
8932 Py_DECREF(key);
8933 if (res < 0)
8934 goto err;
8935 }
8936 }
8937 } else {
8938 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008939 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008940 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8941 "to maketrans it must be a dict");
8942 goto err;
8943 }
8944 /* copy entries into the new dict, converting string keys to int keys */
8945 while (PyDict_Next(x, &i, &key, &value)) {
8946 if (PyUnicode_Check(key)) {
8947 /* convert string keys to integer keys */
8948 PyObject *newkey;
8949 if (PyUnicode_GET_SIZE(key) != 1) {
8950 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8951 "table must be of length 1");
8952 goto err;
8953 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008954 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008955 if (!newkey)
8956 goto err;
8957 res = PyDict_SetItem(new, newkey, value);
8958 Py_DECREF(newkey);
8959 if (res < 0)
8960 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008961 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008962 /* just keep integer keys */
8963 if (PyDict_SetItem(new, key, value) < 0)
8964 goto err;
8965 } else {
8966 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8967 "be strings or integers");
8968 goto err;
8969 }
8970 }
8971 }
8972 return new;
8973 err:
8974 Py_DECREF(new);
8975 return NULL;
8976}
8977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008978PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980\n\
8981Return a copy of the string S, where all characters have been mapped\n\
8982through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008983Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008984Unmapped characters are left untouched. Characters mapped to None\n\
8985are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986
8987static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008988unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989{
Georg Brandlceee0772007-11-27 23:48:05 +00008990 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991}
8992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008993PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008996Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997
8998static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008999unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001 return fixup(self, fixupper);
9002}
9003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009004PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009007Pad a numeric string S with zeros on the left, to fill a field\n\
9008of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009
9010static PyObject *
9011unicode_zfill(PyUnicodeObject *self, PyObject *args)
9012{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009013 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 PyUnicodeObject *u;
9015
Martin v. Löwis18e16552006-02-15 17:27:45 +00009016 Py_ssize_t width;
9017 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 return NULL;
9019
9020 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009021 if (PyUnicode_CheckExact(self)) {
9022 Py_INCREF(self);
9023 return (PyObject*) self;
9024 }
9025 else
9026 return PyUnicode_FromUnicode(
9027 PyUnicode_AS_UNICODE(self),
9028 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 }
9031
9032 fill = width - self->length;
9033
9034 u = pad(self, fill, 0, '0');
9035
Walter Dörwald068325e2002-04-15 13:36:47 +00009036 if (u == NULL)
9037 return NULL;
9038
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 if (u->str[fill] == '+' || u->str[fill] == '-') {
9040 /* move sign to beginning of string */
9041 u->str[0] = u->str[fill];
9042 u->str[fill] = '0';
9043 }
9044
9045 return (PyObject*) u;
9046}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047
#if 0
/* Compiled-out helper: returns the value of numfree as an int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Compiled-out helper: passes self's character buffer and size to
   PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
                                             PyUnicode_GET_SIZE(self));
}
#endif
9062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009063PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009064 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009066Return True if S starts with the specified prefix, False otherwise.\n\
9067With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009068With optional end, stop comparing S at that position.\n\
9069prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070
9071static PyObject *
9072unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009073 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009075 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009077 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009078 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009079 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009081 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9083 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009084 if (PyTuple_Check(subobj)) {
9085 Py_ssize_t i;
9086 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9087 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009089 if (substring == NULL)
9090 return NULL;
9091 result = tailmatch(self, substring, start, end, -1);
9092 Py_DECREF(substring);
9093 if (result) {
9094 Py_RETURN_TRUE;
9095 }
9096 }
9097 /* nothing matched */
9098 Py_RETURN_FALSE;
9099 }
9100 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009103 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009105 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106}
9107
9108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009109PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009112Return True if S ends with the specified suffix, False otherwise.\n\
9113With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009114With optional end, stop comparing S at that position.\n\
9115suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116
9117static PyObject *
9118unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009121 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009123 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009124 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009125 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009127 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9129 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009130 if (PyTuple_Check(subobj)) {
9131 Py_ssize_t i;
9132 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9133 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009135 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009137 result = tailmatch(self, substring, start, end, +1);
9138 Py_DECREF(substring);
9139 if (result) {
9140 Py_RETURN_TRUE;
9141 }
9142 }
9143 Py_RETURN_FALSE;
9144 }
9145 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009149 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009151 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152}
9153
Eric Smith8c663262007-08-25 02:26:07 +00009154#include "stringlib/string_format.h"
9155
9156PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009158\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009159Return a formatted version of S, using substitutions from args and kwargs.\n\
9160The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009161
Eric Smith27bbca62010-11-04 17:06:58 +00009162PyDoc_STRVAR(format_map__doc__,
9163 "S.format_map(mapping) -> str\n\
9164\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009165Return a formatted version of S, using substitutions from mapping.\n\
9166The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009167
Eric Smith4a7d76d2008-05-30 18:10:19 +00009168static PyObject *
9169unicode__format__(PyObject* self, PyObject* args)
9170{
9171 PyObject *format_spec;
9172
9173 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9174 return NULL;
9175
9176 return _PyUnicode_FormatAdvanced(self,
9177 PyUnicode_AS_UNICODE(format_spec),
9178 PyUnicode_GET_SIZE(format_spec));
9179}
9180
Eric Smith8c663262007-08-25 02:26:07 +00009181PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009183\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009184Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009185
9186static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009187unicode__sizeof__(PyUnicodeObject *v)
9188{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009189 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9190 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009191}
9192
9193PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009195
9196static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009197unicode_getnewargs(PyUnicodeObject *v)
9198{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009199 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009200}
9201
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202static PyMethodDef unicode_methods[] = {
9203
9204 /* Order is according to common usage: often used methods should
9205 appear first, since lookup is done sequentially. */
9206
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009207 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009208 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9209 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009210 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009211 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9212 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9213 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9214 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9215 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9216 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9217 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009218 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009219 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9220 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9221 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009222 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009223 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9224 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9225 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009226 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009227 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009228 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009229 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009230 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9231 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9232 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9233 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9234 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9235 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9236 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9237 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9238 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9239 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9240 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9241 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9242 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9243 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009244 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009245 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009246 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009247 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009248 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009249 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009250 {"maketrans", (PyCFunction) unicode_maketrans,
9251 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009252 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009253#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009254 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255#endif
9256
9257#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009258 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009259 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009260 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261#endif
9262
Benjamin Peterson14339b62009-01-31 16:36:08 +00009263 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 {NULL, NULL}
9265};
9266
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009267static PyObject *
9268unicode_mod(PyObject *v, PyObject *w)
9269{
Benjamin Peterson29060642009-01-31 22:14:21 +00009270 if (!PyUnicode_Check(v)) {
9271 Py_INCREF(Py_NotImplemented);
9272 return Py_NotImplemented;
9273 }
9274 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009275}
9276
9277static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009278 0, /*nb_add*/
9279 0, /*nb_subtract*/
9280 0, /*nb_multiply*/
9281 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009282};
9283
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009285 (lenfunc) unicode_length, /* sq_length */
9286 PyUnicode_Concat, /* sq_concat */
9287 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9288 (ssizeargfunc) unicode_getitem, /* sq_item */
9289 0, /* sq_slice */
9290 0, /* sq_ass_item */
9291 0, /* sq_ass_slice */
9292 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293};
9294
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009295static PyObject*
9296unicode_subscript(PyUnicodeObject* self, PyObject* item)
9297{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009298 if (PyIndex_Check(item)) {
9299 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009300 if (i == -1 && PyErr_Occurred())
9301 return NULL;
9302 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009303 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009304 return unicode_getitem(self, i);
9305 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009306 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009307 Py_UNICODE* source_buf;
9308 Py_UNICODE* result_buf;
9309 PyObject* result;
9310
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009311 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009312 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009313 return NULL;
9314 }
9315
9316 if (slicelength <= 0) {
9317 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009318 } else if (start == 0 && step == 1 && slicelength == self->length &&
9319 PyUnicode_CheckExact(self)) {
9320 Py_INCREF(self);
9321 return (PyObject *)self;
9322 } else if (step == 1) {
9323 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009324 } else {
9325 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009326 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9327 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009328
Benjamin Peterson29060642009-01-31 22:14:21 +00009329 if (result_buf == NULL)
9330 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009331
9332 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9333 result_buf[i] = source_buf[cur];
9334 }
Tim Petersced69f82003-09-16 20:30:58 +00009335
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009336 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009337 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009338 return result;
9339 }
9340 } else {
9341 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9342 return NULL;
9343 }
9344}
9345
9346static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009347 (lenfunc)unicode_length, /* mp_length */
9348 (binaryfunc)unicode_subscript, /* mp_subscript */
9349 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009350};
9351
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353/* Helpers for PyUnicode_Format() */
9354
9355static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009356getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009358 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009360 (*p_argidx)++;
9361 if (arglen < 0)
9362 return args;
9363 else
9364 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 }
9366 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 return NULL;
9369}
9370
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009371/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009373static PyObject *
9374formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009376 char *p;
9377 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009379
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380 x = PyFloat_AsDouble(v);
9381 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009382 return NULL;
9383
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009386
Eric Smith0923d1d2009-04-16 20:16:10 +00009387 p = PyOS_double_to_string(x, type, prec,
9388 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009389 if (p == NULL)
9390 return NULL;
9391 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009392 PyMem_Free(p);
9393 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394}
9395
Tim Peters38fd5b62000-09-21 05:43:11 +00009396static PyObject*
9397formatlong(PyObject *val, int flags, int prec, int type)
9398{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009399 char *buf;
9400 int len;
9401 PyObject *str; /* temporary string object. */
9402 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009403
Benjamin Peterson14339b62009-01-31 16:36:08 +00009404 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9405 if (!str)
9406 return NULL;
9407 result = PyUnicode_FromStringAndSize(buf, len);
9408 Py_DECREF(str);
9409 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009410}
9411
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412static int
9413formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009414 size_t buflen,
9415 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009417 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009418 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009419 if (PyUnicode_GET_SIZE(v) == 1) {
9420 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9421 buf[1] = '\0';
9422 return 1;
9423 }
9424#ifndef Py_UNICODE_WIDE
9425 if (PyUnicode_GET_SIZE(v) == 2) {
9426 /* Decode a valid surrogate pair */
9427 int c0 = PyUnicode_AS_UNICODE(v)[0];
9428 int c1 = PyUnicode_AS_UNICODE(v)[1];
9429 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9430 0xDC00 <= c1 && c1 <= 0xDFFF) {
9431 buf[0] = c0;
9432 buf[1] = c1;
9433 buf[2] = '\0';
9434 return 2;
9435 }
9436 }
9437#endif
9438 goto onError;
9439 }
9440 else {
9441 /* Integer input truncated to a character */
9442 long x;
9443 x = PyLong_AsLong(v);
9444 if (x == -1 && PyErr_Occurred())
9445 goto onError;
9446
9447 if (x < 0 || x > 0x10ffff) {
9448 PyErr_SetString(PyExc_OverflowError,
9449 "%c arg not in range(0x110000)");
9450 return -1;
9451 }
9452
9453#ifndef Py_UNICODE_WIDE
9454 if (x > 0xffff) {
9455 x -= 0x10000;
9456 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9457 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9458 return 2;
9459 }
9460#endif
9461 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009462 buf[1] = '\0';
9463 return 1;
9464 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009465
Benjamin Peterson29060642009-01-31 22:14:21 +00009466 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009467 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009469 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470}
9471
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009472/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009473 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009474*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009475#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009476
Alexander Belopolsky40018472011-02-26 01:02:56 +00009477PyObject *
9478PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479{
9480 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009481 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482 int args_owned = 0;
9483 PyUnicodeObject *result = NULL;
9484 PyObject *dict = NULL;
9485 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009486
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009488 PyErr_BadInternalCall();
9489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 }
9491 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009492 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494 fmt = PyUnicode_AS_UNICODE(uformat);
9495 fmtcnt = PyUnicode_GET_SIZE(uformat);
9496
9497 reslen = rescnt = fmtcnt + 100;
9498 result = _PyUnicode_New(reslen);
9499 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 res = PyUnicode_AS_UNICODE(result);
9502
9503 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 arglen = PyTuple_Size(args);
9505 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 }
9507 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 arglen = -1;
9509 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009511 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009512 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514
9515 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 if (*fmt != '%') {
9517 if (--rescnt < 0) {
9518 rescnt = fmtcnt + 100;
9519 reslen += rescnt;
9520 if (_PyUnicode_Resize(&result, reslen) < 0)
9521 goto onError;
9522 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9523 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009524 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009526 }
9527 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 /* Got a format specifier */
9529 int flags = 0;
9530 Py_ssize_t width = -1;
9531 int prec = -1;
9532 Py_UNICODE c = '\0';
9533 Py_UNICODE fill;
9534 int isnumok;
9535 PyObject *v = NULL;
9536 PyObject *temp = NULL;
9537 Py_UNICODE *pbuf;
9538 Py_UNICODE sign;
9539 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009540 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 fmt++;
9543 if (*fmt == '(') {
9544 Py_UNICODE *keystart;
9545 Py_ssize_t keylen;
9546 PyObject *key;
9547 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009548
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 if (dict == NULL) {
9550 PyErr_SetString(PyExc_TypeError,
9551 "format requires a mapping");
9552 goto onError;
9553 }
9554 ++fmt;
9555 --fmtcnt;
9556 keystart = fmt;
9557 /* Skip over balanced parentheses */
9558 while (pcount > 0 && --fmtcnt >= 0) {
9559 if (*fmt == ')')
9560 --pcount;
9561 else if (*fmt == '(')
9562 ++pcount;
9563 fmt++;
9564 }
9565 keylen = fmt - keystart - 1;
9566 if (fmtcnt < 0 || pcount > 0) {
9567 PyErr_SetString(PyExc_ValueError,
9568 "incomplete format key");
9569 goto onError;
9570 }
9571#if 0
9572 /* keys are converted to strings using UTF-8 and
9573 then looked up since Python uses strings to hold
9574 variables names etc. in its namespaces and we
9575 wouldn't want to break common idioms. */
9576 key = PyUnicode_EncodeUTF8(keystart,
9577 keylen,
9578 NULL);
9579#else
9580 key = PyUnicode_FromUnicode(keystart, keylen);
9581#endif
9582 if (key == NULL)
9583 goto onError;
9584 if (args_owned) {
9585 Py_DECREF(args);
9586 args_owned = 0;
9587 }
9588 args = PyObject_GetItem(dict, key);
9589 Py_DECREF(key);
9590 if (args == NULL) {
9591 goto onError;
9592 }
9593 args_owned = 1;
9594 arglen = -1;
9595 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009596 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009597 while (--fmtcnt >= 0) {
9598 switch (c = *fmt++) {
9599 case '-': flags |= F_LJUST; continue;
9600 case '+': flags |= F_SIGN; continue;
9601 case ' ': flags |= F_BLANK; continue;
9602 case '#': flags |= F_ALT; continue;
9603 case '0': flags |= F_ZERO; continue;
9604 }
9605 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009607 if (c == '*') {
9608 v = getnextarg(args, arglen, &argidx);
9609 if (v == NULL)
9610 goto onError;
9611 if (!PyLong_Check(v)) {
9612 PyErr_SetString(PyExc_TypeError,
9613 "* wants int");
9614 goto onError;
9615 }
9616 width = PyLong_AsLong(v);
9617 if (width == -1 && PyErr_Occurred())
9618 goto onError;
9619 if (width < 0) {
9620 flags |= F_LJUST;
9621 width = -width;
9622 }
9623 if (--fmtcnt >= 0)
9624 c = *fmt++;
9625 }
9626 else if (c >= '0' && c <= '9') {
9627 width = c - '0';
9628 while (--fmtcnt >= 0) {
9629 c = *fmt++;
9630 if (c < '0' || c > '9')
9631 break;
9632 if ((width*10) / 10 != width) {
9633 PyErr_SetString(PyExc_ValueError,
9634 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009635 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 }
9637 width = width*10 + (c - '0');
9638 }
9639 }
9640 if (c == '.') {
9641 prec = 0;
9642 if (--fmtcnt >= 0)
9643 c = *fmt++;
9644 if (c == '*') {
9645 v = getnextarg(args, arglen, &argidx);
9646 if (v == NULL)
9647 goto onError;
9648 if (!PyLong_Check(v)) {
9649 PyErr_SetString(PyExc_TypeError,
9650 "* wants int");
9651 goto onError;
9652 }
9653 prec = PyLong_AsLong(v);
9654 if (prec == -1 && PyErr_Occurred())
9655 goto onError;
9656 if (prec < 0)
9657 prec = 0;
9658 if (--fmtcnt >= 0)
9659 c = *fmt++;
9660 }
9661 else if (c >= '0' && c <= '9') {
9662 prec = c - '0';
9663 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009664 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009665 if (c < '0' || c > '9')
9666 break;
9667 if ((prec*10) / 10 != prec) {
9668 PyErr_SetString(PyExc_ValueError,
9669 "prec too big");
9670 goto onError;
9671 }
9672 prec = prec*10 + (c - '0');
9673 }
9674 }
9675 } /* prec */
9676 if (fmtcnt >= 0) {
9677 if (c == 'h' || c == 'l' || c == 'L') {
9678 if (--fmtcnt >= 0)
9679 c = *fmt++;
9680 }
9681 }
9682 if (fmtcnt < 0) {
9683 PyErr_SetString(PyExc_ValueError,
9684 "incomplete format");
9685 goto onError;
9686 }
9687 if (c != '%') {
9688 v = getnextarg(args, arglen, &argidx);
9689 if (v == NULL)
9690 goto onError;
9691 }
9692 sign = 0;
9693 fill = ' ';
9694 switch (c) {
9695
9696 case '%':
9697 pbuf = formatbuf;
9698 /* presume that buffer length is at least 1 */
9699 pbuf[0] = '%';
9700 len = 1;
9701 break;
9702
9703 case 's':
9704 case 'r':
9705 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009706 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009707 temp = v;
9708 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009709 }
9710 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 if (c == 's')
9712 temp = PyObject_Str(v);
9713 else if (c == 'r')
9714 temp = PyObject_Repr(v);
9715 else
9716 temp = PyObject_ASCII(v);
9717 if (temp == NULL)
9718 goto onError;
9719 if (PyUnicode_Check(temp))
9720 /* nothing to do */;
9721 else {
9722 Py_DECREF(temp);
9723 PyErr_SetString(PyExc_TypeError,
9724 "%s argument has non-string str()");
9725 goto onError;
9726 }
9727 }
9728 pbuf = PyUnicode_AS_UNICODE(temp);
9729 len = PyUnicode_GET_SIZE(temp);
9730 if (prec >= 0 && len > prec)
9731 len = prec;
9732 break;
9733
9734 case 'i':
9735 case 'd':
9736 case 'u':
9737 case 'o':
9738 case 'x':
9739 case 'X':
9740 if (c == 'i')
9741 c = 'd';
9742 isnumok = 0;
9743 if (PyNumber_Check(v)) {
9744 PyObject *iobj=NULL;
9745
9746 if (PyLong_Check(v)) {
9747 iobj = v;
9748 Py_INCREF(iobj);
9749 }
9750 else {
9751 iobj = PyNumber_Long(v);
9752 }
9753 if (iobj!=NULL) {
9754 if (PyLong_Check(iobj)) {
9755 isnumok = 1;
9756 temp = formatlong(iobj, flags, prec, c);
9757 Py_DECREF(iobj);
9758 if (!temp)
9759 goto onError;
9760 pbuf = PyUnicode_AS_UNICODE(temp);
9761 len = PyUnicode_GET_SIZE(temp);
9762 sign = 1;
9763 }
9764 else {
9765 Py_DECREF(iobj);
9766 }
9767 }
9768 }
9769 if (!isnumok) {
9770 PyErr_Format(PyExc_TypeError,
9771 "%%%c format: a number is required, "
9772 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9773 goto onError;
9774 }
9775 if (flags & F_ZERO)
9776 fill = '0';
9777 break;
9778
9779 case 'e':
9780 case 'E':
9781 case 'f':
9782 case 'F':
9783 case 'g':
9784 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009785 temp = formatfloat(v, flags, prec, c);
9786 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009787 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009788 pbuf = PyUnicode_AS_UNICODE(temp);
9789 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009790 sign = 1;
9791 if (flags & F_ZERO)
9792 fill = '0';
9793 break;
9794
9795 case 'c':
9796 pbuf = formatbuf;
9797 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9798 if (len < 0)
9799 goto onError;
9800 break;
9801
9802 default:
9803 PyErr_Format(PyExc_ValueError,
9804 "unsupported format character '%c' (0x%x) "
9805 "at index %zd",
9806 (31<=c && c<=126) ? (char)c : '?',
9807 (int)c,
9808 (Py_ssize_t)(fmt - 1 -
9809 PyUnicode_AS_UNICODE(uformat)));
9810 goto onError;
9811 }
9812 if (sign) {
9813 if (*pbuf == '-' || *pbuf == '+') {
9814 sign = *pbuf++;
9815 len--;
9816 }
9817 else if (flags & F_SIGN)
9818 sign = '+';
9819 else if (flags & F_BLANK)
9820 sign = ' ';
9821 else
9822 sign = 0;
9823 }
9824 if (width < len)
9825 width = len;
9826 if (rescnt - (sign != 0) < width) {
9827 reslen -= rescnt;
9828 rescnt = width + fmtcnt + 100;
9829 reslen += rescnt;
9830 if (reslen < 0) {
9831 Py_XDECREF(temp);
9832 PyErr_NoMemory();
9833 goto onError;
9834 }
9835 if (_PyUnicode_Resize(&result, reslen) < 0) {
9836 Py_XDECREF(temp);
9837 goto onError;
9838 }
9839 res = PyUnicode_AS_UNICODE(result)
9840 + reslen - rescnt;
9841 }
9842 if (sign) {
9843 if (fill != ' ')
9844 *res++ = sign;
9845 rescnt--;
9846 if (width > len)
9847 width--;
9848 }
9849 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9850 assert(pbuf[0] == '0');
9851 assert(pbuf[1] == c);
9852 if (fill != ' ') {
9853 *res++ = *pbuf++;
9854 *res++ = *pbuf++;
9855 }
9856 rescnt -= 2;
9857 width -= 2;
9858 if (width < 0)
9859 width = 0;
9860 len -= 2;
9861 }
9862 if (width > len && !(flags & F_LJUST)) {
9863 do {
9864 --rescnt;
9865 *res++ = fill;
9866 } while (--width > len);
9867 }
9868 if (fill == ' ') {
9869 if (sign)
9870 *res++ = sign;
9871 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9872 assert(pbuf[0] == '0');
9873 assert(pbuf[1] == c);
9874 *res++ = *pbuf++;
9875 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009876 }
9877 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009878 Py_UNICODE_COPY(res, pbuf, len);
9879 res += len;
9880 rescnt -= len;
9881 while (--width >= len) {
9882 --rescnt;
9883 *res++ = ' ';
9884 }
9885 if (dict && (argidx < arglen) && c != '%') {
9886 PyErr_SetString(PyExc_TypeError,
9887 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009888 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009889 goto onError;
9890 }
9891 Py_XDECREF(temp);
9892 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893 } /* until end */
9894 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009895 PyErr_SetString(PyExc_TypeError,
9896 "not all arguments converted during string formatting");
9897 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898 }
9899
Thomas Woutersa96affe2006-03-12 00:29:36 +00009900 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009901 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009903 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904 }
9905 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906 return (PyObject *)result;
9907
Benjamin Peterson29060642009-01-31 22:14:21 +00009908 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909 Py_XDECREF(result);
9910 Py_DECREF(uformat);
9911 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009912 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913 }
9914 return NULL;
9915}
9916
Jeremy Hylton938ace62002-07-17 16:30:39 +00009917static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009918unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9919
Tim Peters6d6c1a32001-08-02 04:15:00 +00009920static PyObject *
9921unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9922{
Benjamin Peterson29060642009-01-31 22:14:21 +00009923 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009924 static char *kwlist[] = {"object", "encoding", "errors", 0};
9925 char *encoding = NULL;
9926 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009927
Benjamin Peterson14339b62009-01-31 16:36:08 +00009928 if (type != &PyUnicode_Type)
9929 return unicode_subtype_new(type, args, kwds);
9930 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009932 return NULL;
9933 if (x == NULL)
9934 return (PyObject *)_PyUnicode_New(0);
9935 if (encoding == NULL && errors == NULL)
9936 return PyObject_Str(x);
9937 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009938 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009939}
9940
Guido van Rossume023fe02001-08-30 03:12:59 +00009941static PyObject *
9942unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9943{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009944 PyUnicodeObject *tmp, *pnew;
9945 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009946
Benjamin Peterson14339b62009-01-31 16:36:08 +00009947 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9948 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9949 if (tmp == NULL)
9950 return NULL;
9951 assert(PyUnicode_Check(tmp));
9952 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9953 if (pnew == NULL) {
9954 Py_DECREF(tmp);
9955 return NULL;
9956 }
9957 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9958 if (pnew->str == NULL) {
9959 _Py_ForgetReference((PyObject *)pnew);
9960 PyObject_Del(pnew);
9961 Py_DECREF(tmp);
9962 return PyErr_NoMemory();
9963 }
9964 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9965 pnew->length = n;
9966 pnew->hash = tmp->hash;
9967 Py_DECREF(tmp);
9968 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009969}
9970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009971PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009973\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009974Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009975encoding defaults to the current default string encoding.\n\
9976errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009977
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009978static PyObject *unicode_iter(PyObject *seq);
9979
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009981 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009982 "str", /* tp_name */
9983 sizeof(PyUnicodeObject), /* tp_size */
9984 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009986 (destructor)unicode_dealloc, /* tp_dealloc */
9987 0, /* tp_print */
9988 0, /* tp_getattr */
9989 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009990 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009991 unicode_repr, /* tp_repr */
9992 &unicode_as_number, /* tp_as_number */
9993 &unicode_as_sequence, /* tp_as_sequence */
9994 &unicode_as_mapping, /* tp_as_mapping */
9995 (hashfunc) unicode_hash, /* tp_hash*/
9996 0, /* tp_call*/
9997 (reprfunc) unicode_str, /* tp_str */
9998 PyObject_GenericGetAttr, /* tp_getattro */
9999 0, /* tp_setattro */
10000 0, /* tp_as_buffer */
10001 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010003 unicode_doc, /* tp_doc */
10004 0, /* tp_traverse */
10005 0, /* tp_clear */
10006 PyUnicode_RichCompare, /* tp_richcompare */
10007 0, /* tp_weaklistoffset */
10008 unicode_iter, /* tp_iter */
10009 0, /* tp_iternext */
10010 unicode_methods, /* tp_methods */
10011 0, /* tp_members */
10012 0, /* tp_getset */
10013 &PyBaseObject_Type, /* tp_base */
10014 0, /* tp_dict */
10015 0, /* tp_descr_get */
10016 0, /* tp_descr_set */
10017 0, /* tp_dictoffset */
10018 0, /* tp_init */
10019 0, /* tp_alloc */
10020 unicode_new, /* tp_new */
10021 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022};
10023
10024/* Initialize the Unicode implementation */
10025
Thomas Wouters78890102000-07-22 19:25:51 +000010026void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010028 int i;
10029
Thomas Wouters477c8d52006-05-27 19:21:47 +000010030 /* XXX - move this array to unicodectype.c ? */
10031 Py_UNICODE linebreak[] = {
10032 0x000A, /* LINE FEED */
10033 0x000D, /* CARRIAGE RETURN */
10034 0x001C, /* FILE SEPARATOR */
10035 0x001D, /* GROUP SEPARATOR */
10036 0x001E, /* RECORD SEPARATOR */
10037 0x0085, /* NEXT LINE */
10038 0x2028, /* LINE SEPARATOR */
10039 0x2029, /* PARAGRAPH SEPARATOR */
10040 };
10041
Fred Drakee4315f52000-05-09 19:53:39 +000010042 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010043 free_list = NULL;
10044 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010046 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010047 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010048
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010049 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010050 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010051 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010052 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010053
10054 /* initialize the linebreak bloom filter */
10055 bloom_linebreak = make_bloom_mask(
10056 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10057 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010058
10059 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060}
10061
10062/* Finalize the Unicode implementation */
10063
Christian Heimesa156e092008-02-16 07:38:31 +000010064int
10065PyUnicode_ClearFreeList(void)
10066{
10067 int freelist_size = numfree;
10068 PyUnicodeObject *u;
10069
10070 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 PyUnicodeObject *v = u;
10072 u = *(PyUnicodeObject **)u;
10073 if (v->str)
10074 PyObject_DEL(v->str);
10075 Py_XDECREF(v->defenc);
10076 PyObject_Del(v);
10077 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010078 }
10079 free_list = NULL;
10080 assert(numfree == 0);
10081 return freelist_size;
10082}
10083
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084void
Thomas Wouters78890102000-07-22 19:25:51 +000010085_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010087 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010089 Py_XDECREF(unicode_empty);
10090 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010091
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010092 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010093 if (unicode_latin1[i]) {
10094 Py_DECREF(unicode_latin1[i]);
10095 unicode_latin1[i] = NULL;
10096 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010097 }
Christian Heimesa156e092008-02-16 07:38:31 +000010098 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010100
Walter Dörwald16807132007-05-25 13:52:07 +000010101void
10102PyUnicode_InternInPlace(PyObject **p)
10103{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010104 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10105 PyObject *t;
10106 if (s == NULL || !PyUnicode_Check(s))
10107 Py_FatalError(
10108 "PyUnicode_InternInPlace: unicode strings only please!");
10109 /* If it's a subclass, we don't really know what putting
10110 it in the interned dict might do. */
10111 if (!PyUnicode_CheckExact(s))
10112 return;
10113 if (PyUnicode_CHECK_INTERNED(s))
10114 return;
10115 if (interned == NULL) {
10116 interned = PyDict_New();
10117 if (interned == NULL) {
10118 PyErr_Clear(); /* Don't leave an exception */
10119 return;
10120 }
10121 }
10122 /* It might be that the GetItem call fails even
10123 though the key is present in the dictionary,
10124 namely when this happens during a stack overflow. */
10125 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010126 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010127 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010128
Benjamin Peterson29060642009-01-31 22:14:21 +000010129 if (t) {
10130 Py_INCREF(t);
10131 Py_DECREF(*p);
10132 *p = t;
10133 return;
10134 }
Walter Dörwald16807132007-05-25 13:52:07 +000010135
Benjamin Peterson14339b62009-01-31 16:36:08 +000010136 PyThreadState_GET()->recursion_critical = 1;
10137 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10138 PyErr_Clear();
10139 PyThreadState_GET()->recursion_critical = 0;
10140 return;
10141 }
10142 PyThreadState_GET()->recursion_critical = 0;
10143 /* The two references in interned are not counted by refcnt.
10144 The deallocator will take care of this */
10145 Py_REFCNT(s) -= 2;
10146 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010147}
10148
10149void
10150PyUnicode_InternImmortal(PyObject **p)
10151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010152 PyUnicode_InternInPlace(p);
10153 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10154 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10155 Py_INCREF(*p);
10156 }
Walter Dörwald16807132007-05-25 13:52:07 +000010157}
10158
10159PyObject *
10160PyUnicode_InternFromString(const char *cp)
10161{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010162 PyObject *s = PyUnicode_FromString(cp);
10163 if (s == NULL)
10164 return NULL;
10165 PyUnicode_InternInPlace(&s);
10166 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010167}
10168
Alexander Belopolsky40018472011-02-26 01:02:56 +000010169void
10170_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010171{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010172 PyObject *keys;
10173 PyUnicodeObject *s;
10174 Py_ssize_t i, n;
10175 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010176
Benjamin Peterson14339b62009-01-31 16:36:08 +000010177 if (interned == NULL || !PyDict_Check(interned))
10178 return;
10179 keys = PyDict_Keys(interned);
10180 if (keys == NULL || !PyList_Check(keys)) {
10181 PyErr_Clear();
10182 return;
10183 }
Walter Dörwald16807132007-05-25 13:52:07 +000010184
Benjamin Peterson14339b62009-01-31 16:36:08 +000010185 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10186 detector, interned unicode strings are not forcibly deallocated;
10187 rather, we give them their stolen references back, and then clear
10188 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010189
Benjamin Peterson14339b62009-01-31 16:36:08 +000010190 n = PyList_GET_SIZE(keys);
10191 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010192 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010193 for (i = 0; i < n; i++) {
10194 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10195 switch (s->state) {
10196 case SSTATE_NOT_INTERNED:
10197 /* XXX Shouldn't happen */
10198 break;
10199 case SSTATE_INTERNED_IMMORTAL:
10200 Py_REFCNT(s) += 1;
10201 immortal_size += s->length;
10202 break;
10203 case SSTATE_INTERNED_MORTAL:
10204 Py_REFCNT(s) += 2;
10205 mortal_size += s->length;
10206 break;
10207 default:
10208 Py_FatalError("Inconsistent interned string state.");
10209 }
10210 s->state = SSTATE_NOT_INTERNED;
10211 }
10212 fprintf(stderr, "total size of all interned strings: "
10213 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10214 "mortal/immortal\n", mortal_size, immortal_size);
10215 Py_DECREF(keys);
10216 PyDict_Clear(interned);
10217 Py_DECREF(interned);
10218 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010219}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010220
10221
10222/********************* Unicode Iterator **************************/
10223
10224typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010225 PyObject_HEAD
10226 Py_ssize_t it_index;
10227 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010228} unicodeiterobject;
10229
10230static void
10231unicodeiter_dealloc(unicodeiterobject *it)
10232{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010233 _PyObject_GC_UNTRACK(it);
10234 Py_XDECREF(it->it_seq);
10235 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010236}
10237
10238static int
10239unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10240{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010241 Py_VISIT(it->it_seq);
10242 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010243}
10244
10245static PyObject *
10246unicodeiter_next(unicodeiterobject *it)
10247{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010248 PyUnicodeObject *seq;
10249 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010250
Benjamin Peterson14339b62009-01-31 16:36:08 +000010251 assert(it != NULL);
10252 seq = it->it_seq;
10253 if (seq == NULL)
10254 return NULL;
10255 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010256
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10258 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010259 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010260 if (item != NULL)
10261 ++it->it_index;
10262 return item;
10263 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010264
Benjamin Peterson14339b62009-01-31 16:36:08 +000010265 Py_DECREF(seq);
10266 it->it_seq = NULL;
10267 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010268}
10269
10270static PyObject *
10271unicodeiter_len(unicodeiterobject *it)
10272{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010273 Py_ssize_t len = 0;
10274 if (it->it_seq)
10275 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10276 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010277}
10278
10279PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10280
10281static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010282 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010283 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010284 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010285};
10286
10287PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010288 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10289 "str_iterator", /* tp_name */
10290 sizeof(unicodeiterobject), /* tp_basicsize */
10291 0, /* tp_itemsize */
10292 /* methods */
10293 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10294 0, /* tp_print */
10295 0, /* tp_getattr */
10296 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010297 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010298 0, /* tp_repr */
10299 0, /* tp_as_number */
10300 0, /* tp_as_sequence */
10301 0, /* tp_as_mapping */
10302 0, /* tp_hash */
10303 0, /* tp_call */
10304 0, /* tp_str */
10305 PyObject_GenericGetAttr, /* tp_getattro */
10306 0, /* tp_setattro */
10307 0, /* tp_as_buffer */
10308 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10309 0, /* tp_doc */
10310 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10311 0, /* tp_clear */
10312 0, /* tp_richcompare */
10313 0, /* tp_weaklistoffset */
10314 PyObject_SelfIter, /* tp_iter */
10315 (iternextfunc)unicodeiter_next, /* tp_iternext */
10316 unicodeiter_methods, /* tp_methods */
10317 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010318};
10319
10320static PyObject *
10321unicode_iter(PyObject *seq)
10322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010323 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010324
Benjamin Peterson14339b62009-01-31 16:36:08 +000010325 if (!PyUnicode_Check(seq)) {
10326 PyErr_BadInternalCall();
10327 return NULL;
10328 }
10329 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10330 if (it == NULL)
10331 return NULL;
10332 it->it_index = 0;
10333 Py_INCREF(seq);
10334 it->it_seq = (PyUnicodeObject *)seq;
10335 _PyObject_GC_TRACK(it);
10336 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010337}
10338
Martin v. Löwis5b222132007-06-10 09:51:05 +000010339size_t
10340Py_UNICODE_strlen(const Py_UNICODE *u)
10341{
10342 int res = 0;
10343 while(*u++)
10344 res++;
10345 return res;
10346}
10347
10348Py_UNICODE*
10349Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10350{
10351 Py_UNICODE *u = s1;
10352 while ((*u++ = *s2++));
10353 return s1;
10354}
10355
10356Py_UNICODE*
10357Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10358{
10359 Py_UNICODE *u = s1;
10360 while ((*u++ = *s2++))
10361 if (n-- == 0)
10362 break;
10363 return s1;
10364}
10365
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010366Py_UNICODE*
10367Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10368{
10369 Py_UNICODE *u1 = s1;
10370 u1 += Py_UNICODE_strlen(u1);
10371 Py_UNICODE_strcpy(u1, s2);
10372 return s1;
10373}
10374
Martin v. Löwis5b222132007-06-10 09:51:05 +000010375int
10376Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10377{
10378 while (*s1 && *s2 && *s1 == *s2)
10379 s1++, s2++;
10380 if (*s1 && *s2)
10381 return (*s1 < *s2) ? -1 : +1;
10382 if (*s1)
10383 return 1;
10384 if (*s2)
10385 return -1;
10386 return 0;
10387}
10388
Victor Stinneref8d95c2010-08-16 22:03:11 +000010389int
10390Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10391{
10392 register Py_UNICODE u1, u2;
10393 for (; n != 0; n--) {
10394 u1 = *s1;
10395 u2 = *s2;
10396 if (u1 != u2)
10397 return (u1 < u2) ? -1 : +1;
10398 if (u1 == '\0')
10399 return 0;
10400 s1++;
10401 s2++;
10402 }
10403 return 0;
10404}
10405
Martin v. Löwis5b222132007-06-10 09:51:05 +000010406Py_UNICODE*
10407Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10408{
10409 const Py_UNICODE *p;
10410 for (p = s; *p; p++)
10411 if (*p == c)
10412 return (Py_UNICODE*)p;
10413 return NULL;
10414}
10415
Victor Stinner331ea922010-08-10 16:37:20 +000010416Py_UNICODE*
10417Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10418{
10419 const Py_UNICODE *p;
10420 p = s + Py_UNICODE_strlen(s);
10421 while (p != s) {
10422 p--;
10423 if (*p == c)
10424 return (Py_UNICODE*)p;
10425 }
10426 return NULL;
10427}
10428
Victor Stinner71133ff2010-09-01 23:43:53 +000010429Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010430PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010431{
10432 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10433 Py_UNICODE *copy;
10434 Py_ssize_t size;
10435
10436 /* Ensure we won't overflow the size. */
10437 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10438 PyErr_NoMemory();
10439 return NULL;
10440 }
10441 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10442 size *= sizeof(Py_UNICODE);
10443 copy = PyMem_Malloc(size);
10444 if (copy == NULL) {
10445 PyErr_NoMemory();
10446 return NULL;
10447 }
10448 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10449 return copy;
10450}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010451
Georg Brandl66c221e2010-10-14 07:04:07 +000010452/* A _string module, to export formatter_parser and formatter_field_name_split
10453 to the string.Formatter class implemented in Python. */
10454
10455static PyMethodDef _string_methods[] = {
10456 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10457 METH_O, PyDoc_STR("split the argument as a field name")},
10458 {"formatter_parser", (PyCFunction) formatter_parser,
10459 METH_O, PyDoc_STR("parse the argument as a format string")},
10460 {NULL, NULL}
10461};
10462
10463static struct PyModuleDef _string_module = {
10464 PyModuleDef_HEAD_INIT,
10465 "_string",
10466 PyDoc_STR("string helper module"),
10467 0,
10468 _string_methods,
10469 NULL,
10470 NULL,
10471 NULL,
10472 NULL
10473};
10474
10475PyMODINIT_FUNC
10476PyInit__string(void)
10477{
10478 return PyModule_Create(&_string_module);
10479}
10480
10481
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010482#ifdef __cplusplus
10483}
10484#endif