blob: e7bbd808c2c059c5f828be825256fd9646616a9f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Maximum number of Unicode objects kept on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "keep alive" optimization.

   Objects put on the free list keep their character buffer allocated
   as long as their length is below this limit.  This saves malloc()
   calls for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory allocated.

   A limit of 0 effectively turns the feature off.

   Note: this feature is experimental.  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters: entry c is
   1 if the ASCII code c is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
144
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000145static PyObject *unicode_encode_call_errorhandler(const char *errors,
146 PyObject **errorHandler,const char *encoding, const char *reason,
147 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
148 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
149
Victor Stinner31be90b2010-04-22 19:38:16 +0000150static void raise_encode_exception(PyObject **exceptionObject,
151 const char *encoding,
152 const Py_UNICODE *unicode, Py_ssize_t size,
153 Py_ssize_t startpos, Py_ssize_t endpos,
154 const char *reason);
155
/* Fast detection of ASCII line break characters: entry c is 1 if the
   ASCII code c is a line break and 0 otherwise.  The table is only ever
   read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
199/* stuff to implement simple "bloom filters" for Unicode characters.
200 to keep things simple, we use a single bitmask, using the least 5
201 bits from each unicode characters as the bit index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
Antoine Pitrouf068f942010-01-13 14:19:12 +0000205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215#define BLOOM_MASK unsigned long
216
217static BLOOM_MASK bloom_linebreak;
218
Antoine Pitrouf068f942010-01-13 14:19:12 +0000219#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
220#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221
Benjamin Peterson29060642009-01-31 22:14:21 +0000222#define BLOOM_LINEBREAK(ch) \
223 ((ch) < 128U ? ascii_linebreak[(ch)] : \
224 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* Nonzero if chr occurs in set[0..setlen-1].  The bloom mask is tested
   first, so the linear scan only runs when the character's bit is set.
   The expansion is parenthesized as a whole so that it acts as a single
   operand when combined with !, || or ?: at the point of use. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
305/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000306 Ux0000 terminated; some code (e.g. new_identifier)
307 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308
309 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000310 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld. 21 characters
715 allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000816 {
817#ifndef Py_UNICODE_WIDE
818 int ordinal = va_arg(count, int);
819 if (ordinal > 0xffff)
820 n += 2;
821 else
822 n++;
823#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000824 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000825 n++;
826#endif
827 break;
828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 case '%':
830 n++;
831 break;
832 case 'd': case 'u': case 'i': case 'x':
833 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834#ifdef HAVE_LONG_LONG
835 if (longlongflag) {
836 if (width < MAX_LONG_LONG_CHARS)
837 width = MAX_LONG_LONG_CHARS;
838 }
839 else
840#endif
841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842 including sign. Decimal takes the most space. This
843 isn't enough for octal. If a width is specified we
844 need more (which we allocate later). */
845 if (width < MAX_LONG_CHARS)
846 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000848 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000849 if (abuffersize < width)
850 abuffersize = width;
851 break;
852 case 's':
853 {
854 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000855 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857 if (!str)
858 goto fail;
859 n += PyUnicode_GET_SIZE(str);
860 /* Remember the str and switch to the next slot */
861 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000862 break;
863 }
864 case 'U':
865 {
866 PyObject *obj = va_arg(count, PyObject *);
867 assert(obj && PyUnicode_Check(obj));
868 n += PyUnicode_GET_SIZE(obj);
869 break;
870 }
871 case 'V':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 const char *str = va_arg(count, const char *);
875 assert(obj || str);
876 assert(!obj || PyUnicode_Check(obj));
877 if (obj)
878 n += PyUnicode_GET_SIZE(obj);
879 else
880 n += strlen(str);
881 break;
882 }
883 case 'S':
884 {
885 PyObject *obj = va_arg(count, PyObject *);
886 PyObject *str;
887 assert(obj);
888 str = PyObject_Str(obj);
889 if (!str)
890 goto fail;
891 n += PyUnicode_GET_SIZE(str);
892 /* Remember the str and switch to the next slot */
893 *callresult++ = str;
894 break;
895 }
896 case 'R':
897 {
898 PyObject *obj = va_arg(count, PyObject *);
899 PyObject *repr;
900 assert(obj);
901 repr = PyObject_Repr(obj);
902 if (!repr)
903 goto fail;
904 n += PyUnicode_GET_SIZE(repr);
905 /* Remember the repr and switch to the next slot */
906 *callresult++ = repr;
907 break;
908 }
909 case 'A':
910 {
911 PyObject *obj = va_arg(count, PyObject *);
912 PyObject *ascii;
913 assert(obj);
914 ascii = PyObject_ASCII(obj);
915 if (!ascii)
916 goto fail;
917 n += PyUnicode_GET_SIZE(ascii);
918 /* Remember the repr and switch to the next slot */
919 *callresult++ = ascii;
920 break;
921 }
922 case 'p':
923 (void) va_arg(count, int);
924 /* maximum 64-bit pointer representation:
925 * 0xffffffffffffffff
926 * so 19 characters is enough.
927 * XXX I count 18 -- what's the extra for?
928 */
929 n += 19;
930 break;
931 default:
932 /* if we stumble upon an unknown
933 formatting code, copy the rest of
934 the format string to the output
935 string. (we cannot just skip the
936 code, since there's no way to know
937 what's in the argument list) */
938 n += strlen(p);
939 goto expand;
940 }
941 } else
942 n++;
943 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000945 if (abuffersize > ITEM_BUFFER_LEN) {
946 /* add 1 for sprintf's trailing null byte */
947 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000948 if (!abuffer) {
949 PyErr_NoMemory();
950 goto fail;
951 }
952 realbuffer = abuffer;
953 }
954 else
955 realbuffer = buffer;
956 /* step 4: fill the buffer */
957 /* Since we've analyzed how much space we need for the worst case,
958 we don't have to resize the string.
959 There can be no errors beyond this point. */
960 string = PyUnicode_FromUnicode(NULL, n);
961 if (!string)
962 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000963
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 s = PyUnicode_AS_UNICODE(string);
965 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000966
Benjamin Peterson14339b62009-01-31 16:36:08 +0000967 for (f = format; *f; f++) {
968 if (*f == '%') {
969 const char* p = f++;
970 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000971 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000972 int size_tflag = 0;
973 zeropad = (*f == '0');
974 /* parse the width.precision part */
975 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000976 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000977 width = (width*10) + *f++ - '0';
978 precision = 0;
979 if (*f == '.') {
980 f++;
David Malcolm96960882010-11-05 17:23:41 +0000981 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 precision = (precision*10) + *f++ - '0';
983 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000984 /* Handle %ld, %lu, %lld and %llu. */
985 if (*f == 'l') {
986 if (f[1] == 'd' || f[1] == 'u') {
987 longflag = 1;
988 ++f;
989 }
990#ifdef HAVE_LONG_LONG
991 else if (f[1] == 'l' &&
992 (f[2] == 'd' || f[2] == 'u')) {
993 longlongflag = 1;
994 f += 2;
995 }
996#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000997 }
998 /* handle the size_t flag. */
999 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1000 size_tflag = 1;
1001 ++f;
1002 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001003
Benjamin Peterson14339b62009-01-31 16:36:08 +00001004 switch (*f) {
1005 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001006 {
1007 int ordinal = va_arg(vargs, int);
1008#ifndef Py_UNICODE_WIDE
1009 if (ordinal > 0xffff) {
1010 ordinal -= 0x10000;
1011 *s++ = 0xD800 | (ordinal >> 10);
1012 *s++ = 0xDC00 | (ordinal & 0x3FF);
1013 } else
1014#endif
1015 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001016 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001019 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1020 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 if (longflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001023#ifdef HAVE_LONG_LONG
1024 else if (longlongflag)
1025 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1026#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 else if (size_tflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1029 else
1030 sprintf(realbuffer, fmt, va_arg(vargs, int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1035 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001036 if (longflag)
1037 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001038#ifdef HAVE_LONG_LONG
1039 else if (longlongflag)
1040 sprintf(realbuffer, fmt, va_arg(vargs,
1041 unsigned PY_LONG_LONG));
1042#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 else if (size_tflag)
1044 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1045 else
1046 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1047 appendstring(realbuffer);
1048 break;
1049 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001050 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 sprintf(realbuffer, fmt, va_arg(vargs, int));
1052 appendstring(realbuffer);
1053 break;
1054 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001055 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001056 sprintf(realbuffer, fmt, va_arg(vargs, int));
1057 appendstring(realbuffer);
1058 break;
1059 case 's':
1060 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001061 /* unused, since we already have the result */
1062 (void) va_arg(vargs, char *);
1063 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1064 PyUnicode_GET_SIZE(*callresult));
1065 s += PyUnicode_GET_SIZE(*callresult);
1066 /* We're done with the unicode()/repr() => forget it */
1067 Py_DECREF(*callresult);
1068 /* switch to next unicode()/repr() result */
1069 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001070 break;
1071 }
1072 case 'U':
1073 {
1074 PyObject *obj = va_arg(vargs, PyObject *);
1075 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1076 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1077 s += size;
1078 break;
1079 }
1080 case 'V':
1081 {
1082 PyObject *obj = va_arg(vargs, PyObject *);
1083 const char *str = va_arg(vargs, const char *);
1084 if (obj) {
1085 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1086 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1087 s += size;
1088 } else {
1089 appendstring(str);
1090 }
1091 break;
1092 }
1093 case 'S':
1094 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001095 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001096 {
1097 Py_UNICODE *ucopy;
1098 Py_ssize_t usize;
1099 Py_ssize_t upos;
1100 /* unused, since we already have the result */
1101 (void) va_arg(vargs, PyObject *);
1102 ucopy = PyUnicode_AS_UNICODE(*callresult);
1103 usize = PyUnicode_GET_SIZE(*callresult);
1104 for (upos = 0; upos<usize;)
1105 *s++ = ucopy[upos++];
1106 /* We're done with the unicode()/repr() => forget it */
1107 Py_DECREF(*callresult);
1108 /* switch to next unicode()/repr() result */
1109 ++callresult;
1110 break;
1111 }
1112 case 'p':
1113 sprintf(buffer, "%p", va_arg(vargs, void*));
1114 /* %p is ill-defined: ensure leading 0x. */
1115 if (buffer[1] == 'X')
1116 buffer[1] = 'x';
1117 else if (buffer[1] != 'x') {
1118 memmove(buffer+2, buffer, strlen(buffer)+1);
1119 buffer[0] = '0';
1120 buffer[1] = 'x';
1121 }
1122 appendstring(buffer);
1123 break;
1124 case '%':
1125 *s++ = '%';
1126 break;
1127 default:
1128 appendstring(p);
1129 goto end;
1130 }
Victor Stinner1205f272010-09-11 00:54:47 +00001131 }
Victor Stinner1205f272010-09-11 00:54:47 +00001132 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001133 *s++ = *f;
1134 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001135
Benjamin Peterson29060642009-01-31 22:14:21 +00001136 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001137 if (callresults)
1138 PyObject_Free(callresults);
1139 if (abuffer)
1140 PyObject_Free(abuffer);
1141 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1142 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001143 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 if (callresults) {
1145 PyObject **callresult2 = callresults;
1146 while (callresult2 < callresult) {
1147 Py_DECREF(*callresult2);
1148 ++callresult2;
1149 }
1150 PyObject_Free(callresults);
1151 }
1152 if (abuffer)
1153 PyObject_Free(abuffer);
1154 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
1157#undef appendstring
1158
1159PyObject *
1160PyUnicode_FromFormat(const char *format, ...)
1161{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 PyObject* ret;
1163 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
1165#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001168 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001169#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001170 ret = PyUnicode_FromFormatV(format, vargs);
1171 va_end(vargs);
1172 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001173}
1174
Victor Stinner5593d8a2010-10-02 11:11:27 +00001175/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1176 convert a Unicode object to a wide character string.
1177
1178 - If w is NULL: return the number of wide characters (including the nul
1179 character) required to convert the unicode object. Ignore size argument.
1180
1181 - Otherwise: return the number of wide characters (excluding the nul
1182 character) written into w. Write at most size wide characters (including
1183 the nul character). */
1184static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001185unicode_aswidechar(PyUnicodeObject *unicode,
1186 wchar_t *w,
1187 Py_ssize_t size)
1188{
1189#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001190 Py_ssize_t res;
1191 if (w != NULL) {
1192 res = PyUnicode_GET_SIZE(unicode);
1193 if (size > res)
1194 size = res + 1;
1195 else
1196 res = size;
1197 memcpy(w, unicode->str, size * sizeof(wchar_t));
1198 return res;
1199 }
1200 else
1201 return PyUnicode_GET_SIZE(unicode) + 1;
1202#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1203 register const Py_UNICODE *u;
1204 const Py_UNICODE *uend;
1205 const wchar_t *worig, *wend;
1206 Py_ssize_t nchar;
1207
Victor Stinner137c34c2010-09-29 10:25:54 +00001208 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001209 uend = u + PyUnicode_GET_SIZE(unicode);
1210 if (w != NULL) {
1211 worig = w;
1212 wend = w + size;
1213 while (u != uend && w != wend) {
1214 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216 {
1217 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1218 u += 2;
1219 }
1220 else {
1221 *w = *u;
1222 u++;
1223 }
1224 w++;
1225 }
1226 if (w != wend)
1227 *w = L'\0';
1228 return w - worig;
1229 }
1230 else {
1231 nchar = 1; /* nul character at the end */
1232 while (u != uend) {
1233 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1234 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1235 u += 2;
1236 else
1237 u++;
1238 nchar++;
1239 }
1240 }
1241 return nchar;
1242#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1243 register Py_UNICODE *u, *uend, ordinal;
1244 register Py_ssize_t i;
1245 wchar_t *worig, *wend;
1246 Py_ssize_t nchar;
1247
1248 u = PyUnicode_AS_UNICODE(unicode);
1249 uend = u + PyUnicode_GET_SIZE(u);
1250 if (w != NULL) {
1251 worig = w;
1252 wend = w + size;
1253 while (u != uend && w != wend) {
1254 ordinal = *u;
1255 if (ordinal > 0xffff) {
1256 ordinal -= 0x10000;
1257 *w++ = 0xD800 | (ordinal >> 10);
1258 *w++ = 0xDC00 | (ordinal & 0x3FF);
1259 }
1260 else
1261 *w++ = ordinal;
1262 u++;
1263 }
1264 if (w != wend)
1265 *w = 0;
1266 return w - worig;
1267 }
1268 else {
1269 nchar = 1; /* nul character */
1270 while (u != uend) {
1271 if (*u > 0xffff)
1272 nchar += 2;
1273 else
1274 nchar++;
1275 u++;
1276 }
1277 return nchar;
1278 }
1279#else
1280# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001281#endif
1282}
1283
1284Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001285PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001286 wchar_t *w,
1287 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288{
1289 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001290 PyErr_BadInternalCall();
1291 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001293 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
Victor Stinner137c34c2010-09-29 10:25:54 +00001296wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001297PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001298 Py_ssize_t *size)
1299{
1300 wchar_t* buffer;
1301 Py_ssize_t buflen;
1302
1303 if (unicode == NULL) {
1304 PyErr_BadInternalCall();
1305 return NULL;
1306 }
1307
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001308 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001309 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001310 PyErr_NoMemory();
1311 return NULL;
1312 }
1313
Victor Stinner137c34c2010-09-29 10:25:54 +00001314 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1315 if (buffer == NULL) {
1316 PyErr_NoMemory();
1317 return NULL;
1318 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001319 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001320 if (size != NULL)
1321 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001322 return buffer;
1323}
1324
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325#endif
1326
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001327PyObject *PyUnicode_FromOrdinal(int ordinal)
1328{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001329 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001330
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001331 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001332 PyErr_SetString(PyExc_ValueError,
1333 "chr() arg not in range(0x110000)");
1334 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001335 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001336
1337#ifndef Py_UNICODE_WIDE
1338 if (ordinal > 0xffff) {
1339 ordinal -= 0x10000;
1340 s[0] = 0xD800 | (ordinal >> 10);
1341 s[1] = 0xDC00 | (ordinal & 0x3FF);
1342 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 }
1344#endif
1345
Hye-Shik Chang40574832004-04-06 07:24:51 +00001346 s[0] = (Py_UNICODE)ordinal;
1347 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001348}
1349
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350PyObject *PyUnicode_FromObject(register PyObject *obj)
1351{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001352 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001354 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001355 Py_INCREF(obj);
1356 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001357 }
1358 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 /* For a Unicode subtype that's not a Unicode object,
1360 return a true Unicode object with the same data. */
1361 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1362 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001363 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001364 PyErr_Format(PyExc_TypeError,
1365 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001366 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001367 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001368}
1369
1370PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 const char *encoding,
1372 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001373{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001374 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001375 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001376
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001378 PyErr_BadInternalCall();
1379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001381
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001382 /* Decoding bytes objects is the most common case and should be fast */
1383 if (PyBytes_Check(obj)) {
1384 if (PyBytes_GET_SIZE(obj) == 0) {
1385 Py_INCREF(unicode_empty);
1386 v = (PyObject *) unicode_empty;
1387 }
1388 else {
1389 v = PyUnicode_Decode(
1390 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1391 encoding, errors);
1392 }
1393 return v;
1394 }
1395
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001396 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 PyErr_SetString(PyExc_TypeError,
1398 "decoding str is not supported");
1399 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001400 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001401
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001402 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1403 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1404 PyErr_Format(PyExc_TypeError,
1405 "coercing to str: need bytes, bytearray "
1406 "or buffer-like object, %.80s found",
1407 Py_TYPE(obj)->tp_name);
1408 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001409 }
Tim Petersced69f82003-09-16 20:30:58 +00001410
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001411 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001412 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001413 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 }
Tim Petersced69f82003-09-16 20:30:58 +00001415 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001416 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001417
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001418 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001419 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420}
1421
/* Copy `encoding` into `lower`, converting upper-case letters to lower case
   and replacing '_' with '-', in order to catch e.g. UTF_8.

   `lower` is a buffer of `lower_len` chars and is nul-terminated on success.
   Return 0 on error (encoding is longer than lower_len-1), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating nul */
    char *const dst_limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == dst_limit)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1454
1455PyObject *PyUnicode_Decode(const char *s,
1456 Py_ssize_t size,
1457 const char *encoding,
1458 const char *errors)
1459{
1460 PyObject *buffer = NULL, *unicode;
1461 Py_buffer info;
1462 char lower[11]; /* Enough for any encoding shortcut */
1463
1464 if (encoding == NULL)
1465 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001466
1467 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001468 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1469 if (strcmp(lower, "utf-8") == 0)
1470 return PyUnicode_DecodeUTF8(s, size, errors);
1471 else if ((strcmp(lower, "latin-1") == 0) ||
1472 (strcmp(lower, "iso-8859-1") == 0))
1473 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001474#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001475 else if (strcmp(lower, "mbcs") == 0)
1476 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001477#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001478 else if (strcmp(lower, "ascii") == 0)
1479 return PyUnicode_DecodeASCII(s, size, errors);
1480 else if (strcmp(lower, "utf-16") == 0)
1481 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1482 else if (strcmp(lower, "utf-32") == 0)
1483 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485
1486 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001487 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001488 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001489 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001490 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 if (buffer == NULL)
1492 goto onError;
1493 unicode = PyCodec_Decode(buffer, encoding, errors);
1494 if (unicode == NULL)
1495 goto onError;
1496 if (!PyUnicode_Check(unicode)) {
1497 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001498 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001499 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 Py_DECREF(unicode);
1501 goto onError;
1502 }
1503 Py_DECREF(buffer);
1504 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001505
Benjamin Peterson29060642009-01-31 22:14:21 +00001506 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 Py_XDECREF(buffer);
1508 return NULL;
1509}
1510
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001511PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1512 const char *encoding,
1513 const char *errors)
1514{
1515 PyObject *v;
1516
1517 if (!PyUnicode_Check(unicode)) {
1518 PyErr_BadArgument();
1519 goto onError;
1520 }
1521
1522 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001524
1525 /* Decode via the codec registry */
1526 v = PyCodec_Decode(unicode, encoding, errors);
1527 if (v == NULL)
1528 goto onError;
1529 return v;
1530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001532 return NULL;
1533}
1534
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001535PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1536 const char *encoding,
1537 const char *errors)
1538{
1539 PyObject *v;
1540
1541 if (!PyUnicode_Check(unicode)) {
1542 PyErr_BadArgument();
1543 goto onError;
1544 }
1545
1546 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001547 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001548
1549 /* Decode via the codec registry */
1550 v = PyCodec_Decode(unicode, encoding, errors);
1551 if (v == NULL)
1552 goto onError;
1553 if (!PyUnicode_Check(v)) {
1554 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001555 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001556 Py_TYPE(v)->tp_name);
1557 Py_DECREF(v);
1558 goto onError;
1559 }
1560 return v;
1561
Benjamin Peterson29060642009-01-31 22:14:21 +00001562 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001563 return NULL;
1564}
1565
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 Py_ssize_t size,
1568 const char *encoding,
1569 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570{
1571 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001572
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573 unicode = PyUnicode_FromUnicode(s, size);
1574 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001576 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1577 Py_DECREF(unicode);
1578 return v;
1579}
1580
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001581PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1582 const char *encoding,
1583 const char *errors)
1584{
1585 PyObject *v;
1586
1587 if (!PyUnicode_Check(unicode)) {
1588 PyErr_BadArgument();
1589 goto onError;
1590 }
1591
1592 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001594
1595 /* Encode via the codec registry */
1596 v = PyCodec_Encode(unicode, encoding, errors);
1597 if (v == NULL)
1598 goto onError;
1599 return v;
1600
Benjamin Peterson29060642009-01-31 22:14:21 +00001601 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001602 return NULL;
1603}
1604
Victor Stinnerad158722010-10-27 00:25:46 +00001605PyObject *
1606PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001607{
Victor Stinner313a1202010-06-11 23:56:51 +00001608#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001609 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1610 PyUnicode_GET_SIZE(unicode),
1611 NULL);
1612#elif defined(__APPLE__)
1613 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1614 PyUnicode_GET_SIZE(unicode),
1615 "surrogateescape");
1616#else
1617 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001618 return PyUnicode_AsEncodedString(unicode,
1619 Py_FileSystemDefaultEncoding,
1620 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001621 }
1622 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001623 /* locale encoding with surrogateescape */
1624 wchar_t *wchar;
1625 char *bytes;
1626 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001627 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001628
1629 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1630 if (wchar == NULL)
1631 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001632 bytes = _Py_wchar2char(wchar, &error_pos);
1633 if (bytes == NULL) {
1634 if (error_pos != (size_t)-1) {
1635 char *errmsg = strerror(errno);
1636 PyObject *exc = NULL;
1637 if (errmsg == NULL)
1638 errmsg = "Py_wchar2char() failed";
1639 raise_encode_exception(&exc,
1640 "filesystemencoding",
1641 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1642 error_pos, error_pos+1,
1643 errmsg);
1644 Py_XDECREF(exc);
1645 }
1646 else
1647 PyErr_NoMemory();
1648 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001649 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001650 }
1651 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001652
1653 bytes_obj = PyBytes_FromString(bytes);
1654 PyMem_Free(bytes);
1655 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001656 }
Victor Stinnerad158722010-10-27 00:25:46 +00001657#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001658}
1659
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1661 const char *encoding,
1662 const char *errors)
1663{
1664 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001665 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001666
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667 if (!PyUnicode_Check(unicode)) {
1668 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 }
Fred Drakee4315f52000-05-09 19:53:39 +00001671
Tim Petersced69f82003-09-16 20:30:58 +00001672 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001673 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001674
1675 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001676 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1677 if (strcmp(lower, "utf-8") == 0)
1678 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1679 PyUnicode_GET_SIZE(unicode),
1680 errors);
1681 else if ((strcmp(lower, "latin-1") == 0) ||
1682 (strcmp(lower, "iso-8859-1") == 0))
1683 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1684 PyUnicode_GET_SIZE(unicode),
1685 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001686#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001687 else if (strcmp(lower, "mbcs") == 0)
1688 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1689 PyUnicode_GET_SIZE(unicode),
1690 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001691#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001692 else if (strcmp(lower, "ascii") == 0)
1693 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1694 PyUnicode_GET_SIZE(unicode),
1695 errors);
1696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697
1698 /* Encode via the codec registry */
1699 v = PyCodec_Encode(unicode, encoding, errors);
1700 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001701 return NULL;
1702
1703 /* The normal path */
1704 if (PyBytes_Check(v))
1705 return v;
1706
1707 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001708 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001709 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001710 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001711
1712 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1713 "encoder %s returned bytearray instead of bytes",
1714 encoding);
1715 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001716 Py_DECREF(v);
1717 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001718 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001719
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001720 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1721 Py_DECREF(v);
1722 return b;
1723 }
1724
1725 PyErr_Format(PyExc_TypeError,
1726 "encoder did not return a bytes object (type=%.400s)",
1727 Py_TYPE(v)->tp_name);
1728 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001729 return NULL;
1730}
1731
1732PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1733 const char *encoding,
1734 const char *errors)
1735{
1736 PyObject *v;
1737
1738 if (!PyUnicode_Check(unicode)) {
1739 PyErr_BadArgument();
1740 goto onError;
1741 }
1742
1743 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001744 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001745
1746 /* Encode via the codec registry */
1747 v = PyCodec_Encode(unicode, encoding, errors);
1748 if (v == NULL)
1749 goto onError;
1750 if (!PyUnicode_Check(v)) {
1751 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001752 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001753 Py_TYPE(v)->tp_name);
1754 Py_DECREF(v);
1755 goto onError;
1756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001758
Benjamin Peterson29060642009-01-31 22:14:21 +00001759 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760 return NULL;
1761}
1762
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001763PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001765{
1766 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001767 if (v)
1768 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001769 if (errors != NULL)
1770 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001771 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001772 PyUnicode_GET_SIZE(unicode),
1773 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001774 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001775 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001776 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001777 return v;
1778}
1779
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001780PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001781PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001782 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001783 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1784}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001785
Christian Heimes5894ba72007-11-04 11:43:14 +00001786PyObject*
1787PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1788{
Victor Stinnerad158722010-10-27 00:25:46 +00001789#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1790 return PyUnicode_DecodeMBCS(s, size, NULL);
1791#elif defined(__APPLE__)
1792 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1793#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001794 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1795 can be undefined. If it is case, decode using UTF-8. The following assumes
1796 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1797 bootstrapping process where the codecs aren't ready yet.
1798 */
1799 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001800 return PyUnicode_Decode(s, size,
1801 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001802 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001803 }
1804 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001805 /* locale encoding with surrogateescape */
1806 wchar_t *wchar;
1807 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001808 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001809
1810 if (s[size] != '\0' || size != strlen(s)) {
1811 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1812 return NULL;
1813 }
1814
Victor Stinner168e1172010-10-16 23:16:16 +00001815 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001816 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001817 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001818
Victor Stinner168e1172010-10-16 23:16:16 +00001819 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001820 PyMem_Free(wchar);
1821 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001822 }
Victor Stinnerad158722010-10-27 00:25:46 +00001823#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001824}
1825
Martin v. Löwis011e8422009-05-05 04:43:17 +00001826
1827int
1828PyUnicode_FSConverter(PyObject* arg, void* addr)
1829{
1830 PyObject *output = NULL;
1831 Py_ssize_t size;
1832 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001833 if (arg == NULL) {
1834 Py_DECREF(*(PyObject**)addr);
1835 return 1;
1836 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001837 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001838 output = arg;
1839 Py_INCREF(output);
1840 }
1841 else {
1842 arg = PyUnicode_FromObject(arg);
1843 if (!arg)
1844 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001845 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001846 Py_DECREF(arg);
1847 if (!output)
1848 return 0;
1849 if (!PyBytes_Check(output)) {
1850 Py_DECREF(output);
1851 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1852 return 0;
1853 }
1854 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001855 size = PyBytes_GET_SIZE(output);
1856 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001857 if (size != strlen(data)) {
1858 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1859 Py_DECREF(output);
1860 return 0;
1861 }
1862 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001863 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001864}
1865
1866
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001867int
1868PyUnicode_FSDecoder(PyObject* arg, void* addr)
1869{
1870 PyObject *output = NULL;
1871 Py_ssize_t size;
1872 void *data;
1873 if (arg == NULL) {
1874 Py_DECREF(*(PyObject**)addr);
1875 return 1;
1876 }
1877 if (PyUnicode_Check(arg)) {
1878 output = arg;
1879 Py_INCREF(output);
1880 }
1881 else {
1882 arg = PyBytes_FromObject(arg);
1883 if (!arg)
1884 return 0;
1885 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1886 PyBytes_GET_SIZE(arg));
1887 Py_DECREF(arg);
1888 if (!output)
1889 return 0;
1890 if (!PyUnicode_Check(output)) {
1891 Py_DECREF(output);
1892 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1893 return 0;
1894 }
1895 }
1896 size = PyUnicode_GET_SIZE(output);
1897 data = PyUnicode_AS_UNICODE(output);
1898 if (size != Py_UNICODE_strlen(data)) {
1899 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1900 Py_DECREF(output);
1901 return 0;
1902 }
1903 *(PyObject**)addr = output;
1904 return Py_CLEANUP_SUPPORTED;
1905}
1906
1907
Martin v. Löwis5b222132007-06-10 09:51:05 +00001908char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001909_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001910{
Christian Heimesf3863112007-11-22 07:46:41 +00001911 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001912 if (!PyUnicode_Check(unicode)) {
1913 PyErr_BadArgument();
1914 return NULL;
1915 }
Christian Heimesf3863112007-11-22 07:46:41 +00001916 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1917 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001918 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001919 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001920 *psize = PyBytes_GET_SIZE(bytes);
1921 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001922}
1923
1924char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001925_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001926{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001927 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001928}
1929
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1931{
1932 if (!PyUnicode_Check(unicode)) {
1933 PyErr_BadArgument();
1934 goto onError;
1935 }
1936 return PyUnicode_AS_UNICODE(unicode);
1937
Benjamin Peterson29060642009-01-31 22:14:21 +00001938 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 return NULL;
1940}
1941
Martin v. Löwis18e16552006-02-15 17:27:45 +00001942Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943{
1944 if (!PyUnicode_Check(unicode)) {
1945 PyErr_BadArgument();
1946 goto onError;
1947 }
1948 return PyUnicode_GET_SIZE(unicode);
1949
Benjamin Peterson29060642009-01-31 22:14:21 +00001950 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 return -1;
1952}
1953
/* Return the name of the default encoding, which is always "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
1958
Victor Stinner554f3f02010-06-16 23:33:54 +00001959/* create or adjust a UnicodeDecodeError */
1960static void
1961make_decode_exception(PyObject **exceptionObject,
1962 const char *encoding,
1963 const char *input, Py_ssize_t length,
1964 Py_ssize_t startpos, Py_ssize_t endpos,
1965 const char *reason)
1966{
1967 if (*exceptionObject == NULL) {
1968 *exceptionObject = PyUnicodeDecodeError_Create(
1969 encoding, input, length, startpos, endpos, reason);
1970 }
1971 else {
1972 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1973 goto onError;
1974 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1975 goto onError;
1976 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1977 goto onError;
1978 }
1979 return;
1980
1981onError:
1982 Py_DECREF(*exceptionObject);
1983 *exceptionObject = NULL;
1984}
1985
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001986/* error handling callback helper:
1987 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001988 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 and adjust various state variables.
1990 return 0 on success, -1 on error
1991*/
1992
1993static
1994int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001995 const char *encoding, const char *reason,
1996 const char **input, const char **inend, Py_ssize_t *startinpos,
1997 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1998 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001999{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002000 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002001
2002 PyObject *restuple = NULL;
2003 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002004 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002005 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002006 Py_ssize_t requiredsize;
2007 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002009 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002010 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 int res = -1;
2012
2013 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002014 *errorHandler = PyCodec_LookupError(errors);
2015 if (*errorHandler == NULL)
2016 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 }
2018
Victor Stinner554f3f02010-06-16 23:33:54 +00002019 make_decode_exception(exceptionObject,
2020 encoding,
2021 *input, *inend - *input,
2022 *startinpos, *endinpos,
2023 reason);
2024 if (*exceptionObject == NULL)
2025 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026
2027 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2028 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002029 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002031 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002032 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002033 }
2034 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002035 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002036
2037 /* Copy back the bytes variables, which might have been modified by the
2038 callback */
2039 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2040 if (!inputobj)
2041 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002042 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002043 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002044 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002045 *input = PyBytes_AS_STRING(inputobj);
2046 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002047 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002048 /* we can DECREF safely, as the exception has another reference,
2049 so the object won't go away. */
2050 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002051
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002052 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002053 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002054 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002055 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2056 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058
2059 /* need more space? (at least enough for what we
2060 have+the replacement+the rest of the string (starting
2061 at the new input position), so we won't have to check space
2062 when there are no errors in the rest of the string) */
2063 repptr = PyUnicode_AS_UNICODE(repunicode);
2064 repsize = PyUnicode_GET_SIZE(repunicode);
2065 requiredsize = *outpos + repsize + insize-newpos;
2066 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002067 if (requiredsize<2*outsize)
2068 requiredsize = 2*outsize;
2069 if (_PyUnicode_Resize(output, requiredsize) < 0)
2070 goto onError;
2071 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 }
2073 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002074 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 Py_UNICODE_COPY(*outptr, repptr, repsize);
2076 *outptr += repsize;
2077 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 /* we made it! */
2080 res = 0;
2081
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 Py_XDECREF(restuple);
2084 return res;
2085}
2086
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002087/* --- UTF-7 Codec -------------------------------------------------------- */
2088
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089/* See RFC2152 for details. We encode conservatively and decode liberally. */
2090
/* Three small helper macros implementing the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/'). */

/* IS_BASE64: true if c belongs to the base-64 alphabet. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of c; only meaningful when
   IS_BASE64(c) holds, any other character maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte stands
 * for itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
2121
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The flag arguments are parenthesized so that expressions such as
 * `a || b` or `x ? 1 : 0` can be passed without changing the meaning of
 * the test.  c is evaluated more than once and must lie in 1..127 to be
 * looked up in utf7_category. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002167
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002168PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002169 Py_ssize_t size,
2170 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002171{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002172 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2173}
2174
Antoine Pitrou244651a2009-05-04 18:56:13 +00002175/* The decoder. The only state we preserve is our read position,
2176 * i.e. how many characters we have consumed. So if we end in the
2177 * middle of a shift sequence we have to back off the read position
2178 * and the output to the beginning of the sequence, otherwise we lose
2179 * all the shift state (seen bits, number of bits seen, high
2180 * surrogate). */
2181
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002182PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002183 Py_ssize_t size,
2184 const char *errors,
2185 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002188 Py_ssize_t startinpos;
2189 Py_ssize_t endinpos;
2190 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002191 const char *e;
2192 PyUnicodeObject *unicode;
2193 Py_UNICODE *p;
2194 const char *errmsg = "";
2195 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002196 Py_UNICODE *shiftOutStart;
2197 unsigned int base64bits = 0;
2198 unsigned long base64buffer = 0;
2199 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002200 PyObject *errorHandler = NULL;
2201 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002202
2203 unicode = _PyUnicode_New(size);
2204 if (!unicode)
2205 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002206 if (size == 0) {
2207 if (consumed)
2208 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002209 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002210 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002211
2212 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002213 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002214 e = s + size;
2215
2216 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002217 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002218 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002219 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002220
Antoine Pitrou244651a2009-05-04 18:56:13 +00002221 if (inShift) { /* in a base-64 section */
2222 if (IS_BASE64(ch)) { /* consume a base-64 character */
2223 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2224 base64bits += 6;
2225 s++;
2226 if (base64bits >= 16) {
2227 /* we have enough bits for a UTF-16 value */
2228 Py_UNICODE outCh = (Py_UNICODE)
2229 (base64buffer >> (base64bits-16));
2230 base64bits -= 16;
2231 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2232 if (surrogate) {
2233 /* expecting a second surrogate */
2234 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2235#ifdef Py_UNICODE_WIDE
2236 *p++ = (((surrogate & 0x3FF)<<10)
2237 | (outCh & 0x3FF)) + 0x10000;
2238#else
2239 *p++ = surrogate;
2240 *p++ = outCh;
2241#endif
2242 surrogate = 0;
2243 }
2244 else {
2245 surrogate = 0;
2246 errmsg = "second surrogate missing";
2247 goto utf7Error;
2248 }
2249 }
2250 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2251 /* first surrogate */
2252 surrogate = outCh;
2253 }
2254 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2255 errmsg = "unexpected second surrogate";
2256 goto utf7Error;
2257 }
2258 else {
2259 *p++ = outCh;
2260 }
2261 }
2262 }
2263 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002264 inShift = 0;
2265 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002266 if (surrogate) {
2267 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002268 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002269 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002270 if (base64bits > 0) { /* left-over bits */
2271 if (base64bits >= 6) {
2272 /* We've seen at least one base-64 character */
2273 errmsg = "partial character in shift sequence";
2274 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002275 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002276 else {
2277 /* Some bits remain; they should be zero */
2278 if (base64buffer != 0) {
2279 errmsg = "non-zero padding bits in shift sequence";
2280 goto utf7Error;
2281 }
2282 }
2283 }
2284 if (ch != '-') {
2285 /* '-' is absorbed; other terminating
2286 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002287 *p++ = ch;
2288 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002289 }
2290 }
2291 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002292 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 s++; /* consume '+' */
2294 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002295 s++;
2296 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002297 }
2298 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002299 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002300 shiftOutStart = p;
2301 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002302 }
2303 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002304 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002305 *p++ = ch;
2306 s++;
2307 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002308 else {
2309 startinpos = s-starts;
2310 s++;
2311 errmsg = "unexpected special character";
2312 goto utf7Error;
2313 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002314 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002315utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002316 outpos = p-PyUnicode_AS_UNICODE(unicode);
2317 endinpos = s-starts;
2318 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002319 errors, &errorHandler,
2320 "utf7", errmsg,
2321 &starts, &e, &startinpos, &endinpos, &exc, &s,
2322 &unicode, &outpos, &p))
2323 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002324 }
2325
Antoine Pitrou244651a2009-05-04 18:56:13 +00002326 /* end of string */
2327
2328 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2329 /* if we're in an inconsistent state, that's an error */
2330 if (surrogate ||
2331 (base64bits >= 6) ||
2332 (base64bits > 0 && base64buffer != 0)) {
2333 outpos = p-PyUnicode_AS_UNICODE(unicode);
2334 endinpos = size;
2335 if (unicode_decode_call_errorhandler(
2336 errors, &errorHandler,
2337 "utf7", "unterminated shift sequence",
2338 &starts, &e, &startinpos, &endinpos, &exc, &s,
2339 &unicode, &outpos, &p))
2340 goto onError;
2341 if (s < e)
2342 goto restart;
2343 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002345
2346 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002347 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 if (inShift) {
2349 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002350 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002351 }
2352 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002353 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002354 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002355 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002357 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002358 goto onError;
2359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002360 Py_XDECREF(errorHandler);
2361 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362 return (PyObject *)unicode;
2363
Benjamin Peterson29060642009-01-31 22:14:21 +00002364 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002365 Py_XDECREF(errorHandler);
2366 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002367 Py_DECREF(unicode);
2368 return NULL;
2369}
2370
2371
2372PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002373 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002374 int base64SetO,
2375 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002377{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002378 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002379 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002380 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002381 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002382 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002383 unsigned int base64bits = 0;
2384 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002385 char * out;
2386 char * start;
2387
2388 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002389 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002390
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002391 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002392 return PyErr_NoMemory();
2393
Antoine Pitrou244651a2009-05-04 18:56:13 +00002394 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002395 if (v == NULL)
2396 return NULL;
2397
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002398 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002399 for (;i < size; ++i) {
2400 Py_UNICODE ch = s[i];
2401
Antoine Pitrou244651a2009-05-04 18:56:13 +00002402 if (inShift) {
2403 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2404 /* shifting out */
2405 if (base64bits) { /* output remaining bits */
2406 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2407 base64buffer = 0;
2408 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002409 }
2410 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002411 /* Characters not in the BASE64 set implicitly unshift the sequence
2412 so no '-' is required, except if the character is itself a '-' */
2413 if (IS_BASE64(ch) || ch == '-') {
2414 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002416 *out++ = (char) ch;
2417 }
2418 else {
2419 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002420 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002421 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002422 else { /* not in a shift sequence */
2423 if (ch == '+') {
2424 *out++ = '+';
2425 *out++ = '-';
2426 }
2427 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2428 *out++ = (char) ch;
2429 }
2430 else {
2431 *out++ = '+';
2432 inShift = 1;
2433 goto encode_char;
2434 }
2435 }
2436 continue;
2437encode_char:
2438#ifdef Py_UNICODE_WIDE
2439 if (ch >= 0x10000) {
2440 /* code first surrogate */
2441 base64bits += 16;
2442 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2443 while (base64bits >= 6) {
2444 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2445 base64bits -= 6;
2446 }
2447 /* prepare second surrogate */
2448 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2449 }
2450#endif
2451 base64bits += 16;
2452 base64buffer = (base64buffer << 16) | ch;
2453 while (base64bits >= 6) {
2454 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2455 base64bits -= 6;
2456 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002458 if (base64bits)
2459 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2460 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002461 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002462 if (_PyBytes_Resize(&v, out - start) < 0)
2463 return NULL;
2464 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002465}
2466
Antoine Pitrou244651a2009-05-04 18:56:13 +00002467#undef IS_BASE64
2468#undef FROM_BASE64
2469#undef TO_BASE64
2470#undef DECODE_DIRECT
2471#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002472
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473/* --- UTF-8 Codec -------------------------------------------------------- */
2474
/* Map a UTF-8 encoded prefix (lead) byte to the total length in bytes of
   the sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong lead bytes C0-C1, and F5-FF, which would encode
   values above U+10FFFF.  See RFC 3629 for details.

   The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2496
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002498 Py_ssize_t size,
2499 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500{
Walter Dörwald69652032004-09-07 20:24:22 +00002501 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2502}
2503
Antoine Pitrouab868312009-01-10 15:40:25 +00002504/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2505#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2506
2507/* Mask to quickly check whether a C 'long' contains a
2508 non-ASCII, UTF8-encoded char. */
2509#if (SIZEOF_LONG == 8)
2510# define ASCII_CHAR_MASK 0x8080808080808080L
2511#elif (SIZEOF_LONG == 4)
2512# define ASCII_CHAR_MASK 0x80808080L
2513#else
2514# error C 'long' size should be either 4 or 8!
2515#endif
2516
Walter Dörwald69652032004-09-07 20:24:22 +00002517PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002518 Py_ssize_t size,
2519 const char *errors,
2520 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002521{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002522 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002524 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002525 Py_ssize_t startinpos;
2526 Py_ssize_t endinpos;
2527 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002528 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 PyUnicodeObject *unicode;
2530 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002531 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002532 PyObject *errorHandler = NULL;
2533 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534
2535 /* Note: size will always be longer than the resulting Unicode
2536 character count */
2537 unicode = _PyUnicode_New(size);
2538 if (!unicode)
2539 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002540 if (size == 0) {
2541 if (consumed)
2542 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545
2546 /* Unpack UTF-8 encoded data */
2547 p = unicode->str;
2548 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002549 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550
2551 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002552 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553
2554 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002555 /* Fast path for runs of ASCII characters. Given that common UTF-8
2556 input will consist of an overwhelming majority of ASCII
2557 characters, we try to optimize for this case by checking
2558 as many characters as a C 'long' can contain.
2559 First, check if we can do an aligned read, as most CPUs have
2560 a penalty for unaligned reads.
2561 */
2562 if (!((size_t) s & LONG_PTR_MASK)) {
2563 /* Help register allocation */
2564 register const char *_s = s;
2565 register Py_UNICODE *_p = p;
2566 while (_s < aligned_end) {
2567 /* Read a whole long at a time (either 4 or 8 bytes),
2568 and do a fast unrolled copy if it only contains ASCII
2569 characters. */
2570 unsigned long data = *(unsigned long *) _s;
2571 if (data & ASCII_CHAR_MASK)
2572 break;
2573 _p[0] = (unsigned char) _s[0];
2574 _p[1] = (unsigned char) _s[1];
2575 _p[2] = (unsigned char) _s[2];
2576 _p[3] = (unsigned char) _s[3];
2577#if (SIZEOF_LONG == 8)
2578 _p[4] = (unsigned char) _s[4];
2579 _p[5] = (unsigned char) _s[5];
2580 _p[6] = (unsigned char) _s[6];
2581 _p[7] = (unsigned char) _s[7];
2582#endif
2583 _s += SIZEOF_LONG;
2584 _p += SIZEOF_LONG;
2585 }
2586 s = _s;
2587 p = _p;
2588 if (s == e)
2589 break;
2590 ch = (unsigned char)*s;
2591 }
2592 }
2593
2594 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002595 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 s++;
2597 continue;
2598 }
2599
2600 n = utf8_code_length[ch];
2601
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002602 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002603 if (consumed)
2604 break;
2605 else {
2606 errmsg = "unexpected end of data";
2607 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002608 endinpos = startinpos+1;
2609 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2610 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002611 goto utf8Error;
2612 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614
2615 switch (n) {
2616
2617 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002618 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002619 startinpos = s-starts;
2620 endinpos = startinpos+1;
2621 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622
2623 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002624 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002625 startinpos = s-starts;
2626 endinpos = startinpos+1;
2627 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628
2629 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002630 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002631 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002632 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002633 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002634 goto utf8Error;
2635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002637 assert ((ch > 0x007F) && (ch <= 0x07FF));
2638 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 break;
2640
2641 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002642 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2643 will result in surrogates in range d800-dfff. Surrogates are
2644 not valid UTF-8 so they are rejected.
2645 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2646 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002647 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002648 (s[2] & 0xc0) != 0x80 ||
2649 ((unsigned char)s[0] == 0xE0 &&
2650 (unsigned char)s[1] < 0xA0) ||
2651 ((unsigned char)s[0] == 0xED &&
2652 (unsigned char)s[1] > 0x9F)) {
2653 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002654 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002655 endinpos = startinpos + 1;
2656
2657 /* if s[1] first two bits are 1 and 0, then the invalid
2658 continuation byte is s[2], so increment endinpos by 1,
2659 if not, s[1] is invalid and endinpos doesn't need to
2660 be incremented. */
2661 if ((s[1] & 0xC0) == 0x80)
2662 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002663 goto utf8Error;
2664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002666 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2667 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002668 break;
2669
2670 case 4:
2671 if ((s[1] & 0xc0) != 0x80 ||
2672 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002673 (s[3] & 0xc0) != 0x80 ||
2674 ((unsigned char)s[0] == 0xF0 &&
2675 (unsigned char)s[1] < 0x90) ||
2676 ((unsigned char)s[0] == 0xF4 &&
2677 (unsigned char)s[1] > 0x8F)) {
2678 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002680 endinpos = startinpos + 1;
2681 if ((s[1] & 0xC0) == 0x80) {
2682 endinpos++;
2683 if ((s[2] & 0xC0) == 0x80)
2684 endinpos++;
2685 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 goto utf8Error;
2687 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002688 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002689 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2690 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2691
Fredrik Lundh8f455852001-06-27 18:59:43 +00002692#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002694#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002695 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002696
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002697 /* translate from 10000..10FFFF to 0..FFFF */
2698 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002699
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002700 /* high surrogate = top 10 bits added to D800 */
2701 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002702
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002703 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002704 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002705#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 }
2708 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002710
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 utf8Error:
2712 outpos = p-PyUnicode_AS_UNICODE(unicode);
2713 if (unicode_decode_call_errorhandler(
2714 errors, &errorHandler,
2715 "utf8", errmsg,
2716 &starts, &e, &startinpos, &endinpos, &exc, &s,
2717 &unicode, &outpos, &p))
2718 goto onError;
2719 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 }
Walter Dörwald69652032004-09-07 20:24:22 +00002721 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002722 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723
2724 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002725 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 goto onError;
2727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 return (PyObject *)unicode;
2731
Benjamin Peterson29060642009-01-31 22:14:21 +00002732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 Py_XDECREF(errorHandler);
2734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 Py_DECREF(unicode);
2736 return NULL;
2737}
2738
Antoine Pitrouab868312009-01-10 15:40:25 +00002739#undef ASCII_CHAR_MASK
2740
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002741#ifdef __APPLE__
2742
2743/* Simplified UTF-8 decoder using surrogateescape error handler,
2744 used to decode the command line arguments on Mac OS X. */
2745
2746wchar_t*
2747_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2748{
2749 int n;
2750 const char *e;
2751 wchar_t *unicode, *p;
2752
2753 /* Note: size will always be longer than the resulting Unicode
2754 character count */
2755 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2756 PyErr_NoMemory();
2757 return NULL;
2758 }
2759 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2760 if (!unicode)
2761 return NULL;
2762
2763 /* Unpack UTF-8 encoded data */
2764 p = unicode;
2765 e = s + size;
2766 while (s < e) {
2767 Py_UCS4 ch = (unsigned char)*s;
2768
2769 if (ch < 0x80) {
2770 *p++ = (wchar_t)ch;
2771 s++;
2772 continue;
2773 }
2774
2775 n = utf8_code_length[ch];
2776 if (s + n > e) {
2777 goto surrogateescape;
2778 }
2779
2780 switch (n) {
2781 case 0:
2782 case 1:
2783 goto surrogateescape;
2784
2785 case 2:
2786 if ((s[1] & 0xc0) != 0x80)
2787 goto surrogateescape;
2788 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2789 assert ((ch > 0x007F) && (ch <= 0x07FF));
2790 *p++ = (wchar_t)ch;
2791 break;
2792
2793 case 3:
2794 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2795 will result in surrogates in range d800-dfff. Surrogates are
2796 not valid UTF-8 so they are rejected.
2797 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2798 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2799 if ((s[1] & 0xc0) != 0x80 ||
2800 (s[2] & 0xc0) != 0x80 ||
2801 ((unsigned char)s[0] == 0xE0 &&
2802 (unsigned char)s[1] < 0xA0) ||
2803 ((unsigned char)s[0] == 0xED &&
2804 (unsigned char)s[1] > 0x9F)) {
2805
2806 goto surrogateescape;
2807 }
2808 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2809 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2810 *p++ = (Py_UNICODE)ch;
2811 break;
2812
2813 case 4:
2814 if ((s[1] & 0xc0) != 0x80 ||
2815 (s[2] & 0xc0) != 0x80 ||
2816 (s[3] & 0xc0) != 0x80 ||
2817 ((unsigned char)s[0] == 0xF0 &&
2818 (unsigned char)s[1] < 0x90) ||
2819 ((unsigned char)s[0] == 0xF4 &&
2820 (unsigned char)s[1] > 0x8F)) {
2821 goto surrogateescape;
2822 }
2823 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2824 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2825 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2826
2827#if SIZEOF_WCHAR_T == 4
2828 *p++ = (wchar_t)ch;
2829#else
2830 /* compute and append the two surrogates: */
2831
2832 /* translate from 10000..10FFFF to 0..FFFF */
2833 ch -= 0x10000;
2834
2835 /* high surrogate = top 10 bits added to D800 */
2836 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2837
2838 /* low surrogate = bottom 10 bits added to DC00 */
2839 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2840#endif
2841 break;
2842 }
2843 s += n;
2844 continue;
2845
2846 surrogateescape:
2847 *p++ = 0xDC00 + ch;
2848 s++;
2849 }
2850 *p = L'\0';
2851 return unicode;
2852}
2853
2854#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002855
Tim Peters602f7402002-04-27 18:03:26 +00002856/* Allocation strategy: if the string is short, convert into a stack buffer
2857 and allocate exactly as much space needed at the end. Else allocate the
2858 maximum possible needed (4 result bytes per Unicode character), and return
2859 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002860*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002861PyObject *
2862PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002863 Py_ssize_t size,
2864 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865{
Tim Peters602f7402002-04-27 18:03:26 +00002866#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002867
Guido van Rossum98297ee2007-11-06 21:34:58 +00002868 Py_ssize_t i; /* index into s of next input byte */
2869 PyObject *result; /* result string object */
2870 char *p; /* next free byte in output buffer */
2871 Py_ssize_t nallocated; /* number of result bytes allocated */
2872 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002873 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002874 PyObject *errorHandler = NULL;
2875 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002876
Tim Peters602f7402002-04-27 18:03:26 +00002877 assert(s != NULL);
2878 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879
Tim Peters602f7402002-04-27 18:03:26 +00002880 if (size <= MAX_SHORT_UNICHARS) {
2881 /* Write into the stack buffer; nallocated can't overflow.
2882 * At the end, we'll allocate exactly as much heap space as it
2883 * turns out we need.
2884 */
2885 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002886 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002887 p = stackbuf;
2888 }
2889 else {
2890 /* Overallocate on the heap, and give the excess back at the end. */
2891 nallocated = size * 4;
2892 if (nallocated / 4 != size) /* overflow! */
2893 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002894 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002895 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002896 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002897 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002898 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002899
Tim Peters602f7402002-04-27 18:03:26 +00002900 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002901 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002902
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002903 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002904 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002906
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002908 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002909 *p++ = (char)(0xc0 | (ch >> 6));
2910 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002911 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002912#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002913 /* Special case: check for high and low surrogate */
2914 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2915 Py_UCS4 ch2 = s[i];
2916 /* Combine the two surrogates to form a UCS4 value */
2917 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2918 i++;
2919
2920 /* Encode UCS4 Unicode ordinals */
2921 *p++ = (char)(0xf0 | (ch >> 18));
2922 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002923 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2924 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002925 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002926#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002927 Py_ssize_t newpos;
2928 PyObject *rep;
2929 Py_ssize_t repsize, k;
2930 rep = unicode_encode_call_errorhandler
2931 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2932 s, size, &exc, i-1, i, &newpos);
2933 if (!rep)
2934 goto error;
2935
2936 if (PyBytes_Check(rep))
2937 repsize = PyBytes_GET_SIZE(rep);
2938 else
2939 repsize = PyUnicode_GET_SIZE(rep);
2940
2941 if (repsize > 4) {
2942 Py_ssize_t offset;
2943
2944 if (result == NULL)
2945 offset = p - stackbuf;
2946 else
2947 offset = p - PyBytes_AS_STRING(result);
2948
2949 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2950 /* integer overflow */
2951 PyErr_NoMemory();
2952 goto error;
2953 }
2954 nallocated += repsize - 4;
2955 if (result != NULL) {
2956 if (_PyBytes_Resize(&result, nallocated) < 0)
2957 goto error;
2958 } else {
2959 result = PyBytes_FromStringAndSize(NULL, nallocated);
2960 if (result == NULL)
2961 goto error;
2962 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2963 }
2964 p = PyBytes_AS_STRING(result) + offset;
2965 }
2966
2967 if (PyBytes_Check(rep)) {
2968 char *prep = PyBytes_AS_STRING(rep);
2969 for(k = repsize; k > 0; k--)
2970 *p++ = *prep++;
2971 } else /* rep is unicode */ {
2972 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2973 Py_UNICODE c;
2974
2975 for(k=0; k<repsize; k++) {
2976 c = prep[k];
2977 if (0x80 <= c) {
2978 raise_encode_exception(&exc, "utf-8", s, size,
2979 i-1, i, "surrogates not allowed");
2980 goto error;
2981 }
2982 *p++ = (char)prep[k];
2983 }
2984 }
2985 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002986#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002987 }
Victor Stinner445a6232010-04-22 20:01:57 +00002988#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002989 } else if (ch < 0x10000) {
2990 *p++ = (char)(0xe0 | (ch >> 12));
2991 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2992 *p++ = (char)(0x80 | (ch & 0x3f));
2993 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002994 /* Encode UCS4 Unicode ordinals */
2995 *p++ = (char)(0xf0 | (ch >> 18));
2996 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2997 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2998 *p++ = (char)(0x80 | (ch & 0x3f));
2999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003001
Guido van Rossum98297ee2007-11-06 21:34:58 +00003002 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003003 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003004 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003005 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003006 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003007 }
3008 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003009 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003010 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003011 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003012 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003013 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003014 Py_XDECREF(errorHandler);
3015 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003016 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003017 error:
3018 Py_XDECREF(errorHandler);
3019 Py_XDECREF(exc);
3020 Py_XDECREF(result);
3021 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003022
Tim Peters602f7402002-04-27 18:03:26 +00003023#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024}
3025
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3027{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 if (!PyUnicode_Check(unicode)) {
3029 PyErr_BadArgument();
3030 return NULL;
3031 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003032 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 PyUnicode_GET_SIZE(unicode),
3034 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035}
3036
Walter Dörwald41980ca2007-08-16 21:55:45 +00003037/* --- UTF-32 Codec ------------------------------------------------------- */
3038
3039PyObject *
3040PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 Py_ssize_t size,
3042 const char *errors,
3043 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003044{
3045 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3046}
3047
3048PyObject *
3049PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 Py_ssize_t size,
3051 const char *errors,
3052 int *byteorder,
3053 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003054{
3055 const char *starts = s;
3056 Py_ssize_t startinpos;
3057 Py_ssize_t endinpos;
3058 Py_ssize_t outpos;
3059 PyUnicodeObject *unicode;
3060 Py_UNICODE *p;
3061#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003062 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003063 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003064#else
3065 const int pairs = 0;
3066#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003067 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003068 int bo = 0; /* assume native ordering by default */
3069 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003070 /* Offsets from q for retrieving bytes in the right order. */
3071#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3072 int iorder[] = {0, 1, 2, 3};
3073#else
3074 int iorder[] = {3, 2, 1, 0};
3075#endif
3076 PyObject *errorHandler = NULL;
3077 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003078
Walter Dörwald41980ca2007-08-16 21:55:45 +00003079 q = (unsigned char *)s;
3080 e = q + size;
3081
3082 if (byteorder)
3083 bo = *byteorder;
3084
3085 /* Check for BOM marks (U+FEFF) in the input and adjust current
3086 byte order setting accordingly. In native mode, the leading BOM
3087 mark is skipped, in all other modes, it is copied to the output
3088 stream as-is (giving a ZWNBSP character). */
3089 if (bo == 0) {
3090 if (size >= 4) {
3091 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003092 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003093#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 if (bom == 0x0000FEFF) {
3095 q += 4;
3096 bo = -1;
3097 }
3098 else if (bom == 0xFFFE0000) {
3099 q += 4;
3100 bo = 1;
3101 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003102#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 if (bom == 0x0000FEFF) {
3104 q += 4;
3105 bo = 1;
3106 }
3107 else if (bom == 0xFFFE0000) {
3108 q += 4;
3109 bo = -1;
3110 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003111#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003113 }
3114
3115 if (bo == -1) {
3116 /* force LE */
3117 iorder[0] = 0;
3118 iorder[1] = 1;
3119 iorder[2] = 2;
3120 iorder[3] = 3;
3121 }
3122 else if (bo == 1) {
3123 /* force BE */
3124 iorder[0] = 3;
3125 iorder[1] = 2;
3126 iorder[2] = 1;
3127 iorder[3] = 0;
3128 }
3129
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003130 /* On narrow builds we split characters outside the BMP into two
3131 codepoints => count how much extra space we need. */
3132#ifndef Py_UNICODE_WIDE
3133 for (qq = q; qq < e; qq += 4)
3134 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3135 pairs++;
3136#endif
3137
3138 /* This might be one to much, because of a BOM */
3139 unicode = _PyUnicode_New((size+3)/4+pairs);
3140 if (!unicode)
3141 return NULL;
3142 if (size == 0)
3143 return (PyObject *)unicode;
3144
3145 /* Unpack UTF-32 encoded data */
3146 p = unicode->str;
3147
Walter Dörwald41980ca2007-08-16 21:55:45 +00003148 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 Py_UCS4 ch;
3150 /* remaining bytes at the end? (size should be divisible by 4) */
3151 if (e-q<4) {
3152 if (consumed)
3153 break;
3154 errmsg = "truncated data";
3155 startinpos = ((const char *)q)-starts;
3156 endinpos = ((const char *)e)-starts;
3157 goto utf32Error;
3158 /* The remaining input chars are ignored if the callback
3159 chooses to skip the input */
3160 }
3161 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3162 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003163
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 if (ch >= 0x110000)
3165 {
3166 errmsg = "codepoint not in range(0x110000)";
3167 startinpos = ((const char *)q)-starts;
3168 endinpos = startinpos+4;
3169 goto utf32Error;
3170 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003171#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 if (ch >= 0x10000)
3173 {
3174 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3175 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3176 }
3177 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003178#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 *p++ = ch;
3180 q += 4;
3181 continue;
3182 utf32Error:
3183 outpos = p-PyUnicode_AS_UNICODE(unicode);
3184 if (unicode_decode_call_errorhandler(
3185 errors, &errorHandler,
3186 "utf32", errmsg,
3187 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3188 &unicode, &outpos, &p))
3189 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003190 }
3191
3192 if (byteorder)
3193 *byteorder = bo;
3194
3195 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003196 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003197
3198 /* Adjust length */
3199 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3200 goto onError;
3201
3202 Py_XDECREF(errorHandler);
3203 Py_XDECREF(exc);
3204 return (PyObject *)unicode;
3205
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003207 Py_DECREF(unicode);
3208 Py_XDECREF(errorHandler);
3209 Py_XDECREF(exc);
3210 return NULL;
3211}
3212
3213PyObject *
3214PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003215 Py_ssize_t size,
3216 const char *errors,
3217 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003218{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003219 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003220 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003221 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003222#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003223 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003224#else
3225 const int pairs = 0;
3226#endif
3227 /* Offsets from p for storing byte pairs in the right order. */
3228#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3229 int iorder[] = {0, 1, 2, 3};
3230#else
3231 int iorder[] = {3, 2, 1, 0};
3232#endif
3233
Benjamin Peterson29060642009-01-31 22:14:21 +00003234#define STORECHAR(CH) \
3235 do { \
3236 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3237 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3238 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3239 p[iorder[0]] = (CH) & 0xff; \
3240 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003241 } while(0)
3242
3243 /* In narrow builds we can output surrogate pairs as one codepoint,
3244 so we need less space. */
3245#ifndef Py_UNICODE_WIDE
3246 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3248 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3249 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003250#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003251 nsize = (size - pairs + (byteorder == 0));
3252 bytesize = nsize * 4;
3253 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003254 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003255 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003256 if (v == NULL)
3257 return NULL;
3258
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003259 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003260 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003262 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003263 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003264
3265 if (byteorder == -1) {
3266 /* force LE */
3267 iorder[0] = 0;
3268 iorder[1] = 1;
3269 iorder[2] = 2;
3270 iorder[3] = 3;
3271 }
3272 else if (byteorder == 1) {
3273 /* force BE */
3274 iorder[0] = 3;
3275 iorder[1] = 2;
3276 iorder[2] = 1;
3277 iorder[3] = 0;
3278 }
3279
3280 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003282#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3284 Py_UCS4 ch2 = *s;
3285 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3286 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3287 s++;
3288 size--;
3289 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003290 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003291#endif
3292 STORECHAR(ch);
3293 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003294
3295 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003296 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297#undef STORECHAR
3298}
3299
3300PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3301{
3302 if (!PyUnicode_Check(unicode)) {
3303 PyErr_BadArgument();
3304 return NULL;
3305 }
3306 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003307 PyUnicode_GET_SIZE(unicode),
3308 NULL,
3309 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003310}
3311
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312/* --- UTF-16 Codec ------------------------------------------------------- */
3313
Tim Peters772747b2001-08-09 22:21:55 +00003314PyObject *
3315PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 Py_ssize_t size,
3317 const char *errors,
3318 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319{
Walter Dörwald69652032004-09-07 20:24:22 +00003320 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3321}
3322
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask selects one bit per 16-bit unit of the 'long': 0x8000 for
   native order, 0x0080 for byteswapped order.  A zero result of
   (data & mask) therefore means that no unit in the word has that bit
   set, which rules out the surrogate range 0xD800-0xDFFF.
   Both macros are #undef'ed again after PyUnicode_DecodeUTF16Stateful().
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
3339
/* Decode `size` bytes of UTF-16 data starting at `s` into a new unicode
   object.

   errors     - error handling scheme name, passed through to
                unicode_decode_call_errorhandler().
   byteorder  - optional in/out parameter.  On entry *byteorder selects the
                byte order: -1 little endian, 1 big endian, 0 native order
                with BOM detection.  After decoding, the byte order that was
                actually used is written back.
   consumed   - optional out parameter.  When non-NULL, a single trailing
                odd byte is not reported as "truncated data", and *consumed
                receives the number of input bytes that were processed.

   Returns a new reference, or NULL with an exception set on failure.
   For size == 0 the function returns early without writing to *byteorder
   or *consumed.

   NOTE(review): a high surrogate that is the last complete unit of the
   input is reported as "unexpected end of data" even when `consumed` is
   non-NULL (there is no check of `consumed` on that path) -- confirm this
   is the intended behaviour for incremental decoding. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const unsigned char *q, *e, *aligned_end;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering = 0;
    const char *errmsg = "";
    /* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
#else
    int ihi = 0, ilo = 1;
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    /* Unpack UTF-16 encoded data */
    p = unicode->str;
    q = (unsigned char *)s;
    /* e points at the LAST input byte (not one past it), so "q < e"
       means that at least two bytes remain to be read. */
    e = q + size - 1;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
        if (size >= 2) {
            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
            if (bom == 0xFEFF) {
                q += 2;
                bo = -1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = 1;
            }
#else
            if (bom == 0xFEFF) {
                q += 2;
                bo = 1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = -1;
            }
#endif
        }
    }

    if (bo == -1) {
        /* force LE */
        ihi = 1;
        ilo = 0;
    }
    else if (bo == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }
    /* The input is in native order when the selected byte offsets match
       the ones this platform would have picked by default. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    native_ordering = ilo < ihi;
#else
    native_ordering = ilo > ihi;
#endif

    /* Upper bound for the word-at-a-time fast path: e with the
       LONG_PTR_MASK bits cleared (LONG_PTR_MASK is defined elsewhere;
       presumably sizeof(long) - 1 -- confirm).
       NOTE(review): e is later passed by address to the error handler;
       if the handler changes it, aligned_end is not recomputed here. */
    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
    while (q < e) {
        Py_UNICODE ch;
        /* First check for possible aligned read of a C 'long'. Unaligned
           reads are more expensive, better to defer to another iteration. */
        if (!((size_t) q & LONG_PTR_MASK)) {
            /* Fast path for runs of non-surrogate chars. */
            register const unsigned char *_q = q;
            Py_UNICODE *_p = p;
            if (native_ordering) {
                /* Native ordering is simple: as long as the input cannot
                   possibly contain a surrogate char, do an unrolled copy
                   of several 16-bit code points to the target object.
                   The non-surrogate check is done on several input bytes
                   at a time (as many as a C 'long' can contain). */
                while (_q < aligned_end) {
                    unsigned long data = * (unsigned long *) _q;
                    if (data & FAST_CHAR_MASK)
                        break;
                    _p[0] = ((unsigned short *) _q)[0];
                    _p[1] = ((unsigned short *) _q)[1];
#if (SIZEOF_LONG == 8)
                    _p[2] = ((unsigned short *) _q)[2];
                    _p[3] = ((unsigned short *) _q)[3];
#endif
                    _q += SIZEOF_LONG;
                    _p += SIZEOF_LONG / 2;
                }
            }
            else {
                /* Byteswapped ordering is similar, but we must decompose
                   the copy bytewise, and take care of zero'ing out the
                   upper bytes if the target object is in 32-bit units
                   (that is, in UCS-4 builds). */
                while (_q < aligned_end) {
                    unsigned long data = * (unsigned long *) _q;
                    if (data & SWAPPED_FAST_CHAR_MASK)
                        break;
                    /* Zero upper bytes in UCS-4 builds */
#if (Py_UNICODE_SIZE > 2)
                    _p[0] = 0;
                    _p[1] = 0;
#if (SIZEOF_LONG == 8)
                    _p[2] = 0;
                    _p[3] = 0;
#endif
#endif
                    /* Issue #4916; UCS-4 builds on big endian machines must
                       fill the two last bytes of each 4-byte unit. */
#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
# define OFF 2
#else
# define OFF 0
#endif
                    /* Swap the two bytes of each input unit while storing
                       them into the corresponding output unit. */
                    ((unsigned char *) _p)[OFF + 1] = _q[0];
                    ((unsigned char *) _p)[OFF + 0] = _q[1];
                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
#if (SIZEOF_LONG == 8)
                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
#endif
#undef OFF
                    _q += SIZEOF_LONG;
                    _p += SIZEOF_LONG / 2;
                }
            }
            /* Commit the progress made by the fast path. */
            p = _p;
            q = _q;
            if (q >= e)
                break;
        }
        /* Slow path: decode one 16-bit unit byte by byte. */
        ch = (q[ihi] << 8) | q[ilo];

        q += 2;

        if (ch < 0xD800 || ch > 0xDFFF) {
            *p++ = ch;
            continue;
        }

        /* UTF-16 code pair: */
        if (q > e) {
            /* Fewer than two bytes are left for the second unit. */
            errmsg = "unexpected end of data";
            startinpos = (((const char *)q) - 2) - starts;
            endinpos = ((const char *)e) + 1 - starts;
            goto utf16Error;
        }
        if (0xD800 <= ch && ch <= 0xDBFF) {
            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
            q += 2;
            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
                /* Narrow build: keep the surrogate pair as two units. */
                *p++ = ch;
                *p++ = ch2;
#else
                /* Wide build: combine the pair into one code point. */
                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
                continue;
            }
            else {
                errmsg = "illegal UTF-16 surrogate";
                startinpos = (((const char *)q)-4)-starts;
                endinpos = startinpos+2;
                goto utf16Error;
            }

        }
        /* A low surrogate (0xDC00-0xDFFF) without a preceding high one. */
        errmsg = "illegal encoding";
        startinpos = (((const char *)q)-2)-starts;
        endinpos = startinpos+2;
        /* Fall through to report the error */

      utf16Error:
        outpos = p - PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors,
                &errorHandler,
                "utf16", errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &unicode,
                &outpos,
                &p))
            goto onError;
    }
    /* remaining byte at the end? (size should be even) */
    if (e == q) {
        if (!consumed) {
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) + 1 - starts;
            outpos = p - PyUnicode_AS_UNICODE(unicode);
            if (unicode_decode_call_errorhandler(
                    errors,
                    &errorHandler,
                    "utf16", errmsg,
                    &starts,
                    (const char **)&e,
                    &startinpos,
                    &endinpos,
                    &exc,
                    (const char **)&q,
                    &unicode,
                    &outpos,
                    &p))
                goto onError;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        }
    }

    if (byteorder)
        *byteorder = bo;

    if (consumed)
        *consumed = (const char *)q-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_DECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
3605
Antoine Pitrouab868312009-01-10 15:40:25 +00003606#undef FAST_CHAR_MASK
3607#undef SWAPPED_FAST_CHAR_MASK
3608
Tim Peters772747b2001-08-09 22:21:55 +00003609PyObject *
3610PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003611 Py_ssize_t size,
3612 const char *errors,
3613 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003615 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003616 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003617 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003618#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003619 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003620#else
3621 const int pairs = 0;
3622#endif
Tim Peters772747b2001-08-09 22:21:55 +00003623 /* Offsets from p for storing byte pairs in the right order. */
3624#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3625 int ihi = 1, ilo = 0;
3626#else
3627 int ihi = 0, ilo = 1;
3628#endif
3629
Benjamin Peterson29060642009-01-31 22:14:21 +00003630#define STORECHAR(CH) \
3631 do { \
3632 p[ihi] = ((CH) >> 8) & 0xff; \
3633 p[ilo] = (CH) & 0xff; \
3634 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003635 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003637#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003638 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 if (s[i] >= 0x10000)
3640 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003641#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003642 /* 2 * (size + pairs + (byteorder == 0)) */
3643 if (size > PY_SSIZE_T_MAX ||
3644 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003646 nsize = size + pairs + (byteorder == 0);
3647 bytesize = nsize * 2;
3648 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003650 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 if (v == NULL)
3652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003654 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003657 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003658 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003659
3660 if (byteorder == -1) {
3661 /* force LE */
3662 ihi = 1;
3663 ilo = 0;
3664 }
3665 else if (byteorder == 1) {
3666 /* force BE */
3667 ihi = 0;
3668 ilo = 1;
3669 }
3670
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003671 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 Py_UNICODE ch = *s++;
3673 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003674#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003675 if (ch >= 0x10000) {
3676 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3677 ch = 0xD800 | ((ch-0x10000) >> 10);
3678 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003679#endif
Tim Peters772747b2001-08-09 22:21:55 +00003680 STORECHAR(ch);
3681 if (ch2)
3682 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003683 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003684
3685 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003686 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003687#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688}
3689
3690PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3691{
3692 if (!PyUnicode_Check(unicode)) {
3693 PyErr_BadArgument();
3694 return NULL;
3695 }
3696 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 PyUnicode_GET_SIZE(unicode),
3698 NULL,
3699 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700}
3701
3702/* --- Unicode Escape Codec ----------------------------------------------- */
3703
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in lazily by the unicode-escape decoder
   when it first needs character-name lookup -- the code that sets and
   uses it is outside this part of the file, confirm there. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003705
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 Py_ssize_t size,
3708 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003711 Py_ssize_t startinpos;
3712 Py_ssize_t endinpos;
3713 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003718 char* message;
3719 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 PyObject *errorHandler = NULL;
3721 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003722
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 /* Escaped strings will always be longer than the resulting
3724 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 length after conversion to the true value.
3726 (but if the error callback returns a long replacement string
3727 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 v = _PyUnicode_New(size);
3729 if (v == NULL)
3730 goto onError;
3731 if (size == 0)
3732 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003736
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 while (s < end) {
3738 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003739 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741
3742 /* Non-escape characters are interpreted as Unicode ordinals */
3743 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003744 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 continue;
3746 }
3747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749 /* \ - Escapes */
3750 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003751 c = *s++;
3752 if (s > end)
3753 c = '\0'; /* Invalid after \ */
3754 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 case '\n': break;
3758 case '\\': *p++ = '\\'; break;
3759 case '\'': *p++ = '\''; break;
3760 case '\"': *p++ = '\"'; break;
3761 case 'b': *p++ = '\b'; break;
3762 case 'f': *p++ = '\014'; break; /* FF */
3763 case 't': *p++ = '\t'; break;
3764 case 'n': *p++ = '\n'; break;
3765 case 'r': *p++ = '\r'; break;
3766 case 'v': *p++ = '\013'; break; /* VT */
3767 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3768
Benjamin Peterson29060642009-01-31 22:14:21 +00003769 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 case '0': case '1': case '2': case '3':
3771 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003772 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003773 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003774 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003775 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003776 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003778 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 break;
3780
Benjamin Peterson29060642009-01-31 22:14:21 +00003781 /* hex escapes */
3782 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003784 digits = 2;
3785 message = "truncated \\xXX escape";
3786 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787
Benjamin Peterson29060642009-01-31 22:14:21 +00003788 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003790 digits = 4;
3791 message = "truncated \\uXXXX escape";
3792 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793
Benjamin Peterson29060642009-01-31 22:14:21 +00003794 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003795 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003796 digits = 8;
3797 message = "truncated \\UXXXXXXXX escape";
3798 hexescape:
3799 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003800 outpos = p-PyUnicode_AS_UNICODE(v);
3801 if (s+digits>end) {
3802 endinpos = size;
3803 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 errors, &errorHandler,
3805 "unicodeescape", "end of string in escape sequence",
3806 &starts, &end, &startinpos, &endinpos, &exc, &s,
3807 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 goto onError;
3809 goto nextByte;
3810 }
3811 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003812 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003813 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814 endinpos = (s+i+1)-starts;
3815 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003816 errors, &errorHandler,
3817 "unicodeescape", message,
3818 &starts, &end, &startinpos, &endinpos, &exc, &s,
3819 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003820 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003822 }
3823 chr = (chr<<4) & ~0xF;
3824 if (c >= '0' && c <= '9')
3825 chr += c - '0';
3826 else if (c >= 'a' && c <= 'f')
3827 chr += 10 + c - 'a';
3828 else
3829 chr += 10 + c - 'A';
3830 }
3831 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003832 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 /* _decoding_error will have already written into the
3834 target buffer. */
3835 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003836 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003837 /* when we get here, chr is a 32-bit unicode character */
3838 if (chr <= 0xffff)
3839 /* UCS-2 character */
3840 *p++ = (Py_UNICODE) chr;
3841 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003842 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003843 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003844#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003845 *p++ = chr;
3846#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003847 chr -= 0x10000L;
3848 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003849 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003850#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003851 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852 endinpos = s-starts;
3853 outpos = p-PyUnicode_AS_UNICODE(v);
3854 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 errors, &errorHandler,
3856 "unicodeescape", "illegal Unicode character",
3857 &starts, &end, &startinpos, &endinpos, &exc, &s,
3858 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003859 goto onError;
3860 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003861 break;
3862
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003864 case 'N':
3865 message = "malformed \\N character escape";
3866 if (ucnhash_CAPI == NULL) {
3867 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003868 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003869 if (ucnhash_CAPI == NULL)
3870 goto ucnhashError;
3871 }
3872 if (*s == '{') {
3873 const char *start = s+1;
3874 /* look for the closing brace */
3875 while (*s != '}' && s < end)
3876 s++;
3877 if (s > start && s < end && *s == '}') {
3878 /* found a name. look it up in the unicode database */
3879 message = "unknown Unicode character name";
3880 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003881 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003882 goto store;
3883 }
3884 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 endinpos = s-starts;
3886 outpos = p-PyUnicode_AS_UNICODE(v);
3887 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003888 errors, &errorHandler,
3889 "unicodeescape", message,
3890 &starts, &end, &startinpos, &endinpos, &exc, &s,
3891 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003892 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003893 break;
3894
3895 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003896 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 message = "\\ at end of string";
3898 s--;
3899 endinpos = s-starts;
3900 outpos = p-PyUnicode_AS_UNICODE(v);
3901 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003902 errors, &errorHandler,
3903 "unicodeescape", message,
3904 &starts, &end, &startinpos, &endinpos, &exc, &s,
3905 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003906 goto onError;
3907 }
3908 else {
3909 *p++ = '\\';
3910 *p++ = (unsigned char)s[-1];
3911 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003912 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003914 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003917 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003919 Py_XDECREF(errorHandler);
3920 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003922
Benjamin Peterson29060642009-01-31 22:14:21 +00003923 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003924 PyErr_SetString(
3925 PyExc_UnicodeError,
3926 "\\N escapes not supported (can't load unicodedata module)"
3927 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003928 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 Py_XDECREF(errorHandler);
3930 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003931 return NULL;
3932
Benjamin Peterson29060642009-01-31 22:14:21 +00003933 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 Py_XDECREF(errorHandler);
3936 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 return NULL;
3938}
3939
/* Return a Unicode-Escape string version of the Unicode object.

   This comment refers to PyUnicode_EncodeUnicodeEscape() further below,
   not to findchar(), which immediately follows.  The encoder takes no
   "quotes" argument and does not enclose its result in quotes.

*/
3946
Thomas Wouters477c8d52006-05-27 19:21:47 +00003947Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003948 Py_ssize_t size,
3949 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003950{
3951 /* like wcschr, but doesn't stop at NULL characters */
3952
3953 while (size-- > 0) {
3954 if (*s == ch)
3955 return s;
3956 s++;
3957 }
3958
3959 return NULL;
3960}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003961
/* Lowercase hexadecimal digits indexed by nibble value (0-15); the escape
   encoders below use it to format \xXX, \uXXXX and \UXXXXXXXX escapes. */
static const char *hexdigits = "0123456789abcdef";
3963
3964PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003967 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003970#ifdef Py_UNICODE_WIDE
3971 const Py_ssize_t expandsize = 10;
3972#else
3973 const Py_ssize_t expandsize = 6;
3974#endif
3975
Thomas Wouters89f507f2006-12-13 04:49:30 +00003976 /* XXX(nnorwitz): rather than over-allocating, it would be
3977 better to choose a different scheme. Perhaps scan the
3978 first N-chars of the string and allocate based on that size.
3979 */
3980 /* Initial allocation is based on the longest-possible unichr
3981 escape.
3982
3983 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3984 unichr, so in this case it's the longest unichr escape. In
3985 narrow (UTF-16) builds this is five chars per source unichr
3986 since there are two unichrs in the surrogate pair, so in narrow
3987 (UTF-16) builds it's not the longest unichr escape.
3988
3989 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3990 so in the narrow (UTF-16) build case it's the longest unichr
3991 escape.
3992 */
3993
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003994 if (size == 0)
3995 return PyBytes_FromStringAndSize(NULL, 0);
3996
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003997 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003999
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004000 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004001 2
4002 + expandsize*size
4003 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 if (repr == NULL)
4005 return NULL;
4006
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004007 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 while (size-- > 0) {
4010 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004011
Walter Dörwald79e913e2007-05-12 11:08:06 +00004012 /* Escape backslashes */
4013 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 *p++ = '\\';
4015 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004016 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004017 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004018
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004019#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004020 /* Map 21-bit characters to '\U00xxxxxx' */
4021 else if (ch >= 0x10000) {
4022 *p++ = '\\';
4023 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004024 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4025 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4026 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4027 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4028 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4029 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4030 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4031 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004032 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004033 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004034#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4036 else if (ch >= 0xD800 && ch < 0xDC00) {
4037 Py_UNICODE ch2;
4038 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004039
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 ch2 = *s++;
4041 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004042 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4044 *p++ = '\\';
4045 *p++ = 'U';
4046 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4047 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4048 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4049 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4050 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4051 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4052 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4053 *p++ = hexdigits[ucs & 0x0000000F];
4054 continue;
4055 }
4056 /* Fall through: isolated surrogates are copied as-is */
4057 s--;
4058 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004059 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004060#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004061
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004063 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 *p++ = '\\';
4065 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004066 *p++ = hexdigits[(ch >> 12) & 0x000F];
4067 *p++ = hexdigits[(ch >> 8) & 0x000F];
4068 *p++ = hexdigits[(ch >> 4) & 0x000F];
4069 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004071
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004072 /* Map special whitespace to '\t', \n', '\r' */
4073 else if (ch == '\t') {
4074 *p++ = '\\';
4075 *p++ = 't';
4076 }
4077 else if (ch == '\n') {
4078 *p++ = '\\';
4079 *p++ = 'n';
4080 }
4081 else if (ch == '\r') {
4082 *p++ = '\\';
4083 *p++ = 'r';
4084 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004085
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004086 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004087 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004089 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004090 *p++ = hexdigits[(ch >> 4) & 0x000F];
4091 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004092 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004093
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 /* Copy everything else as-is */
4095 else
4096 *p++ = (char) ch;
4097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004099 assert(p - PyBytes_AS_STRING(repr) > 0);
4100 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4101 return NULL;
4102 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103}
4104
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004105PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004107 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 if (!PyUnicode_Check(unicode)) {
4109 PyErr_BadArgument();
4110 return NULL;
4111 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004112 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4113 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004114 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115}
4116
4117/* --- Raw Unicode Escape Codec ------------------------------------------- */
4118
4119PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 Py_ssize_t size,
4121 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004124 Py_ssize_t startinpos;
4125 Py_ssize_t endinpos;
4126 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 const char *end;
4130 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 PyObject *errorHandler = NULL;
4132 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004133
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 /* Escaped strings will always be longer than the resulting
4135 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 length after conversion to the true value. (But decoding error
4137 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 v = _PyUnicode_New(size);
4139 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 end = s + size;
4145 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 unsigned char c;
4147 Py_UCS4 x;
4148 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004149 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 /* Non-escape characters are interpreted as Unicode ordinals */
4152 if (*s != '\\') {
4153 *p++ = (unsigned char)*s++;
4154 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004155 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 startinpos = s-starts;
4157
4158 /* \u-escapes are only interpreted iff the number of leading
4159 backslashes if odd */
4160 bs = s;
4161 for (;s < end;) {
4162 if (*s != '\\')
4163 break;
4164 *p++ = (unsigned char)*s++;
4165 }
4166 if (((s - bs) & 1) == 0 ||
4167 s >= end ||
4168 (*s != 'u' && *s != 'U')) {
4169 continue;
4170 }
4171 p--;
4172 count = *s=='u' ? 4 : 8;
4173 s++;
4174
4175 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4176 outpos = p-PyUnicode_AS_UNICODE(v);
4177 for (x = 0, i = 0; i < count; ++i, ++s) {
4178 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004179 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 endinpos = s-starts;
4181 if (unicode_decode_call_errorhandler(
4182 errors, &errorHandler,
4183 "rawunicodeescape", "truncated \\uXXXX",
4184 &starts, &end, &startinpos, &endinpos, &exc, &s,
4185 &v, &outpos, &p))
4186 goto onError;
4187 goto nextByte;
4188 }
4189 x = (x<<4) & ~0xF;
4190 if (c >= '0' && c <= '9')
4191 x += c - '0';
4192 else if (c >= 'a' && c <= 'f')
4193 x += 10 + c - 'a';
4194 else
4195 x += 10 + c - 'A';
4196 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004197 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 /* UCS-2 character */
4199 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004200 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 /* UCS-4 character. Either store directly, or as
4202 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004203#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004205#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 x -= 0x10000L;
4207 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4208 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004209#endif
4210 } else {
4211 endinpos = s-starts;
4212 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004213 if (unicode_decode_call_errorhandler(
4214 errors, &errorHandler,
4215 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 &starts, &end, &startinpos, &endinpos, &exc, &s,
4217 &v, &outpos, &p))
4218 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004219 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 nextByte:
4221 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004223 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 Py_XDECREF(errorHandler);
4226 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004228
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 Py_XDECREF(errorHandler);
4232 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 return NULL;
4234}
4235
4236PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004239 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240 char *p;
4241 char *q;
4242
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004243#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004244 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004245#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004246 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004247#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004248
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004249 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004251
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004252 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 if (repr == NULL)
4254 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004255 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004256 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004258 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 while (size-- > 0) {
4260 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004261#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 /* Map 32-bit characters to '\Uxxxxxxxx' */
4263 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004264 *p++ = '\\';
4265 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004266 *p++ = hexdigits[(ch >> 28) & 0xf];
4267 *p++ = hexdigits[(ch >> 24) & 0xf];
4268 *p++ = hexdigits[(ch >> 20) & 0xf];
4269 *p++ = hexdigits[(ch >> 16) & 0xf];
4270 *p++ = hexdigits[(ch >> 12) & 0xf];
4271 *p++ = hexdigits[(ch >> 8) & 0xf];
4272 *p++ = hexdigits[(ch >> 4) & 0xf];
4273 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004274 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004275 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004276#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4278 if (ch >= 0xD800 && ch < 0xDC00) {
4279 Py_UNICODE ch2;
4280 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004281
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 ch2 = *s++;
4283 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004284 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4286 *p++ = '\\';
4287 *p++ = 'U';
4288 *p++ = hexdigits[(ucs >> 28) & 0xf];
4289 *p++ = hexdigits[(ucs >> 24) & 0xf];
4290 *p++ = hexdigits[(ucs >> 20) & 0xf];
4291 *p++ = hexdigits[(ucs >> 16) & 0xf];
4292 *p++ = hexdigits[(ucs >> 12) & 0xf];
4293 *p++ = hexdigits[(ucs >> 8) & 0xf];
4294 *p++ = hexdigits[(ucs >> 4) & 0xf];
4295 *p++ = hexdigits[ucs & 0xf];
4296 continue;
4297 }
4298 /* Fall through: isolated surrogates are copied as-is */
4299 s--;
4300 size++;
4301 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004302#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 /* Map 16-bit characters to '\uxxxx' */
4304 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 *p++ = '\\';
4306 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004307 *p++ = hexdigits[(ch >> 12) & 0xf];
4308 *p++ = hexdigits[(ch >> 8) & 0xf];
4309 *p++ = hexdigits[(ch >> 4) & 0xf];
4310 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 /* Copy everything else as-is */
4313 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 *p++ = (char) ch;
4315 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004316 size = p - q;
4317
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004318 assert(size > 0);
4319 if (_PyBytes_Resize(&repr, size) < 0)
4320 return NULL;
4321 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322}
4323
4324PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4325{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004326 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004328 PyErr_BadArgument();
4329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004331 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4332 PyUnicode_GET_SIZE(unicode));
4333
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004334 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335}
4336
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004337/* --- Unicode Internal Codec ------------------------------------------- */
4338
4339PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 Py_ssize_t size,
4341 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004342{
4343 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004344 Py_ssize_t startinpos;
4345 Py_ssize_t endinpos;
4346 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004347 PyUnicodeObject *v;
4348 Py_UNICODE *p;
4349 const char *end;
4350 const char *reason;
4351 PyObject *errorHandler = NULL;
4352 PyObject *exc = NULL;
4353
Neal Norwitzd43069c2006-01-08 01:12:10 +00004354#ifdef Py_UNICODE_WIDE
4355 Py_UNICODE unimax = PyUnicode_GetMax();
4356#endif
4357
Thomas Wouters89f507f2006-12-13 04:49:30 +00004358 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004359 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4360 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004362 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004364 p = PyUnicode_AS_UNICODE(v);
4365 end = s + size;
4366
4367 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004368 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004369 /* We have to sanity check the raw data, otherwise doom looms for
4370 some malformed UCS-4 data. */
4371 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004372#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004373 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004374#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004375 end-s < Py_UNICODE_SIZE
4376 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004378 startinpos = s - starts;
4379 if (end-s < Py_UNICODE_SIZE) {
4380 endinpos = end-starts;
4381 reason = "truncated input";
4382 }
4383 else {
4384 endinpos = s - starts + Py_UNICODE_SIZE;
4385 reason = "illegal code point (> 0x10FFFF)";
4386 }
4387 outpos = p - PyUnicode_AS_UNICODE(v);
4388 if (unicode_decode_call_errorhandler(
4389 errors, &errorHandler,
4390 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004391 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004392 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004393 goto onError;
4394 }
4395 }
4396 else {
4397 p++;
4398 s += Py_UNICODE_SIZE;
4399 }
4400 }
4401
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004402 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004403 goto onError;
4404 Py_XDECREF(errorHandler);
4405 Py_XDECREF(exc);
4406 return (PyObject *)v;
4407
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004409 Py_XDECREF(v);
4410 Py_XDECREF(errorHandler);
4411 Py_XDECREF(exc);
4412 return NULL;
4413}
4414
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415/* --- Latin-1 Codec ------------------------------------------------------ */
4416
4417PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004418 Py_ssize_t size,
4419 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420{
4421 PyUnicodeObject *v;
4422 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004423 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004424
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004426 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 Py_UNICODE r = *(unsigned char*)s;
4428 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004429 }
4430
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 v = _PyUnicode_New(size);
4432 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004437 e = s + size;
4438 /* Unrolling the copy makes it much faster by reducing the looping
4439 overhead. This is similar to what many memcpy() implementations do. */
4440 unrolled_end = e - 4;
4441 while (s < unrolled_end) {
4442 p[0] = (unsigned char) s[0];
4443 p[1] = (unsigned char) s[1];
4444 p[2] = (unsigned char) s[2];
4445 p[3] = (unsigned char) s[3];
4446 s += 4;
4447 p += 4;
4448 }
4449 while (s < e)
4450 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004452
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 Py_XDECREF(v);
4455 return NULL;
4456}
4457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458/* create or adjust a UnicodeEncodeError */
4459static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 const char *encoding,
4461 const Py_UNICODE *unicode, Py_ssize_t size,
4462 Py_ssize_t startpos, Py_ssize_t endpos,
4463 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 *exceptionObject = PyUnicodeEncodeError_Create(
4467 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 }
4469 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004470 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4471 goto onError;
4472 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4473 goto onError;
4474 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4475 goto onError;
4476 return;
4477 onError:
4478 Py_DECREF(*exceptionObject);
4479 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 }
4481}
4482
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483/* raises a UnicodeEncodeError */
4484static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 const char *encoding,
4486 const Py_UNICODE *unicode, Py_ssize_t size,
4487 Py_ssize_t startpos, Py_ssize_t endpos,
4488 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489{
4490 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494}
4495
4496/* error handling callback helper:
4497 build arguments, call the callback and check the arguments,
4498 put the result into newpos and return the replacement string, which
4499 has to be freed by the caller */
4500static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 PyObject **errorHandler,
4502 const char *encoding, const char *reason,
4503 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4504 Py_ssize_t startpos, Py_ssize_t endpos,
4505 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004507 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508
4509 PyObject *restuple;
4510 PyObject *resunicode;
4511
4512 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 }
4517
4518 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522
4523 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004528 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 Py_DECREF(restuple);
4530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004532 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 &resunicode, newpos)) {
4534 Py_DECREF(restuple);
4535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004537 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4538 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4539 Py_DECREF(restuple);
4540 return NULL;
4541 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004544 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4546 Py_DECREF(restuple);
4547 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004548 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 Py_INCREF(resunicode);
4550 Py_DECREF(restuple);
4551 return resunicode;
4552}
4553
4554static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 Py_ssize_t size,
4556 const char *errors,
4557 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558{
4559 /* output object */
4560 PyObject *res;
4561 /* pointers to the beginning and end+1 of input */
4562 const Py_UNICODE *startp = p;
4563 const Py_UNICODE *endp = p + size;
4564 /* pointer to the beginning of the unencodable characters */
4565 /* const Py_UNICODE *badp = NULL; */
4566 /* pointer into the output */
4567 char *str;
4568 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004569 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004570 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4571 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 PyObject *errorHandler = NULL;
4573 PyObject *exc = NULL;
4574 /* the following variable is used for caching string comparisons
4575 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4576 int known_errorHandler = -1;
4577
4578 /* allocate enough for a simple encoding without
4579 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004580 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004581 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004582 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004584 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004585 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586 ressize = size;
4587
4588 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 /* can we encode this? */
4592 if (c<limit) {
4593 /* no overflow check, because we know that the space is enough */
4594 *str++ = (char)c;
4595 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004596 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 else {
4598 Py_ssize_t unicodepos = p-startp;
4599 Py_ssize_t requiredsize;
4600 PyObject *repunicode;
4601 Py_ssize_t repsize;
4602 Py_ssize_t newpos;
4603 Py_ssize_t respos;
4604 Py_UNICODE *uni2;
4605 /* startpos for collecting unencodable chars */
4606 const Py_UNICODE *collstart = p;
4607 const Py_UNICODE *collend = p;
4608 /* find all unecodable characters */
4609 while ((collend < endp) && ((*collend)>=limit))
4610 ++collend;
4611 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4612 if (known_errorHandler==-1) {
4613 if ((errors==NULL) || (!strcmp(errors, "strict")))
4614 known_errorHandler = 1;
4615 else if (!strcmp(errors, "replace"))
4616 known_errorHandler = 2;
4617 else if (!strcmp(errors, "ignore"))
4618 known_errorHandler = 3;
4619 else if (!strcmp(errors, "xmlcharrefreplace"))
4620 known_errorHandler = 4;
4621 else
4622 known_errorHandler = 0;
4623 }
4624 switch (known_errorHandler) {
4625 case 1: /* strict */
4626 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4627 goto onError;
4628 case 2: /* replace */
4629 while (collstart++<collend)
4630 *str++ = '?'; /* fall through */
4631 case 3: /* ignore */
4632 p = collend;
4633 break;
4634 case 4: /* xmlcharrefreplace */
4635 respos = str - PyBytes_AS_STRING(res);
4636 /* determine replacement size (temporarily (mis)uses p) */
4637 for (p = collstart, repsize = 0; p < collend; ++p) {
4638 if (*p<10)
4639 repsize += 2+1+1;
4640 else if (*p<100)
4641 repsize += 2+2+1;
4642 else if (*p<1000)
4643 repsize += 2+3+1;
4644 else if (*p<10000)
4645 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004646#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 else
4648 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004649#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 else if (*p<100000)
4651 repsize += 2+5+1;
4652 else if (*p<1000000)
4653 repsize += 2+6+1;
4654 else
4655 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004656#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 }
4658 requiredsize = respos+repsize+(endp-collend);
4659 if (requiredsize > ressize) {
4660 if (requiredsize<2*ressize)
4661 requiredsize = 2*ressize;
4662 if (_PyBytes_Resize(&res, requiredsize))
4663 goto onError;
4664 str = PyBytes_AS_STRING(res) + respos;
4665 ressize = requiredsize;
4666 }
4667 /* generate replacement (temporarily (mis)uses p) */
4668 for (p = collstart; p < collend; ++p) {
4669 str += sprintf(str, "&#%d;", (int)*p);
4670 }
4671 p = collend;
4672 break;
4673 default:
4674 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4675 encoding, reason, startp, size, &exc,
4676 collstart-startp, collend-startp, &newpos);
4677 if (repunicode == NULL)
4678 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004679 if (PyBytes_Check(repunicode)) {
4680 /* Directly copy bytes result to output. */
4681 repsize = PyBytes_Size(repunicode);
4682 if (repsize > 1) {
4683 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004684 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004685 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4686 Py_DECREF(repunicode);
4687 goto onError;
4688 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004689 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004690 ressize += repsize-1;
4691 }
4692 memcpy(str, PyBytes_AsString(repunicode), repsize);
4693 str += repsize;
4694 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004695 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004696 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004697 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 /* need more space? (at least enough for what we
4699 have+the replacement+the rest of the string, so
4700 we won't have to check space for encodable characters) */
4701 respos = str - PyBytes_AS_STRING(res);
4702 repsize = PyUnicode_GET_SIZE(repunicode);
4703 requiredsize = respos+repsize+(endp-collend);
4704 if (requiredsize > ressize) {
4705 if (requiredsize<2*ressize)
4706 requiredsize = 2*ressize;
4707 if (_PyBytes_Resize(&res, requiredsize)) {
4708 Py_DECREF(repunicode);
4709 goto onError;
4710 }
4711 str = PyBytes_AS_STRING(res) + respos;
4712 ressize = requiredsize;
4713 }
4714 /* check if there is anything unencodable in the replacement
4715 and copy it to the output */
4716 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4717 c = *uni2;
4718 if (c >= limit) {
4719 raise_encode_exception(&exc, encoding, startp, size,
4720 unicodepos, unicodepos+1, reason);
4721 Py_DECREF(repunicode);
4722 goto onError;
4723 }
4724 *str = (char)c;
4725 }
4726 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004727 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004728 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004729 }
4730 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004731 /* Resize if we allocated to much */
4732 size = str - PyBytes_AS_STRING(res);
4733 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004734 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004735 if (_PyBytes_Resize(&res, size) < 0)
4736 goto onError;
4737 }
4738
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 Py_XDECREF(errorHandler);
4740 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004741 return res;
4742
4743 onError:
4744 Py_XDECREF(res);
4745 Py_XDECREF(errorHandler);
4746 Py_XDECREF(exc);
4747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748}
4749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004751 Py_ssize_t size,
4752 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755}
4756
4757PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4758{
4759 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004760 PyErr_BadArgument();
4761 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 }
4763 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 PyUnicode_GET_SIZE(unicode),
4765 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766}
4767
4768/* --- 7-bit ASCII Codec -------------------------------------------------- */
4769
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 Py_ssize_t size,
4772 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 PyUnicodeObject *v;
4776 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004777 Py_ssize_t startinpos;
4778 Py_ssize_t endinpos;
4779 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 const char *e;
4781 PyObject *errorHandler = NULL;
4782 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004783
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004785 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 Py_UNICODE r = *(unsigned char*)s;
4787 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004788 }
Tim Petersced69f82003-09-16 20:30:58 +00004789
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 v = _PyUnicode_New(size);
4791 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 e = s + size;
4797 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 register unsigned char c = (unsigned char)*s;
4799 if (c < 128) {
4800 *p++ = c;
4801 ++s;
4802 }
4803 else {
4804 startinpos = s-starts;
4805 endinpos = startinpos + 1;
4806 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4807 if (unicode_decode_call_errorhandler(
4808 errors, &errorHandler,
4809 "ascii", "ordinal not in range(128)",
4810 &starts, &e, &startinpos, &endinpos, &exc, &s,
4811 &v, &outpos, &p))
4812 goto onError;
4813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004815 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4817 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 Py_XDECREF(errorHandler);
4819 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004821
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 Py_XDECREF(errorHandler);
4825 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 return NULL;
4827}
4828
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004830 Py_ssize_t size,
4831 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
4835
4836PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4837{
4838 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 PyErr_BadArgument();
4840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
4842 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 PyUnicode_GET_SIZE(unicode),
4844 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845}
4846
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004847#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004848
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004849/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004850
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004851#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004852#define NEED_RETRY
4853#endif
4854
4855/* XXX This code is limited to "true" double-byte encodings, as
4856 a) it assumes an incomplete character consists of a single byte, and
4857 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004859
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() reports no earlier character (it returns `curr` itself), when
   the byte CharPrev() points at is not a lead byte, or when the previous
   character is exactly two bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
4870
4871/*
4872 * Decode MBCS string into unicode object. If 'final' is set, converts
4873 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4874 */
4875static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 const char *s, /* MBCS string */
4877 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004878 int final,
4879 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004880{
4881 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004882 Py_ssize_t n;
4883 DWORD usize;
4884 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004885
4886 assert(size >= 0);
4887
Victor Stinner554f3f02010-06-16 23:33:54 +00004888 /* check and handle 'errors' arg */
4889 if (errors==NULL || strcmp(errors, "strict")==0)
4890 flags = MB_ERR_INVALID_CHARS;
4891 else if (strcmp(errors, "ignore")==0)
4892 flags = 0;
4893 else {
4894 PyErr_Format(PyExc_ValueError,
4895 "mbcs encoding does not support errors='%s'",
4896 errors);
4897 return -1;
4898 }
4899
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004900 /* Skip trailing lead-byte unless 'final' is set */
4901 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004903
4904 /* First get the size of the result */
4905 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004906 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4907 if (usize==0)
4908 goto mbcs_decode_error;
4909 } else
4910 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004911
4912 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 /* Create unicode object */
4914 *v = _PyUnicode_New(usize);
4915 if (*v == NULL)
4916 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004917 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004918 }
4919 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 /* Extend unicode object */
4921 n = PyUnicode_GET_SIZE(*v);
4922 if (_PyUnicode_Resize(v, n + usize) < 0)
4923 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004924 }
4925
4926 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004927 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004929 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4930 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004932 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004933 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004934
4935mbcs_decode_error:
4936 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4937 we raise a UnicodeDecodeError - else it is a 'generic'
4938 windows error
4939 */
4940 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4941 /* Ideally, we should get reason from FormatMessage - this
4942 is the Windows 2000 English version of the message
4943 */
4944 PyObject *exc = NULL;
4945 const char *reason = "No mapping for the Unicode character exists "
4946 "in the target multi-byte code page.";
4947 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4948 if (exc != NULL) {
4949 PyCodec_StrictErrors(exc);
4950 Py_DECREF(exc);
4951 }
4952 } else {
4953 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4954 }
4955 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004956}
4957
4958PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004959 Py_ssize_t size,
4960 const char *errors,
4961 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004962{
4963 PyUnicodeObject *v = NULL;
4964 int done;
4965
4966 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004968
4969#ifdef NEED_RETRY
4970 retry:
4971 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004972 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004973 else
4974#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004975 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004976
4977 if (done < 0) {
4978 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004980 }
4981
4982 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004984
4985#ifdef NEED_RETRY
4986 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 s += done;
4988 size -= done;
4989 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004990 }
4991#endif
4992
4993 return (PyObject *)v;
4994}
4995
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004996PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 Py_ssize_t size,
4998 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004999{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5001}
5002
5003/*
5004 * Convert unicode into string object (MBCS).
5005 * Returns 0 if succeed, -1 otherwise.
5006 */
5007static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005009 int size, /* size of unicode */
5010 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011{
Victor Stinner554f3f02010-06-16 23:33:54 +00005012 BOOL usedDefaultChar = FALSE;
5013 BOOL *pusedDefaultChar;
5014 int mbcssize;
5015 Py_ssize_t n;
5016 PyObject *exc = NULL;
5017 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018
5019 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005020
Victor Stinner554f3f02010-06-16 23:33:54 +00005021 /* check and handle 'errors' arg */
5022 if (errors==NULL || strcmp(errors, "strict")==0) {
5023 flags = WC_NO_BEST_FIT_CHARS;
5024 pusedDefaultChar = &usedDefaultChar;
5025 } else if (strcmp(errors, "replace")==0) {
5026 flags = 0;
5027 pusedDefaultChar = NULL;
5028 } else {
5029 PyErr_Format(PyExc_ValueError,
5030 "mbcs encoding does not support errors='%s'",
5031 errors);
5032 return -1;
5033 }
5034
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005035 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005036 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005037 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5038 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 if (mbcssize == 0) {
5040 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5041 return -1;
5042 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005043 /* If we used a default char, then we failed! */
5044 if (pusedDefaultChar && *pusedDefaultChar)
5045 goto mbcs_encode_error;
5046 } else {
5047 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005048 }
5049
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005050 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 /* Create string object */
5052 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5053 if (*repr == NULL)
5054 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005055 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005056 }
5057 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 /* Extend string object */
5059 n = PyBytes_Size(*repr);
5060 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5061 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005062 }
5063
5064 /* Do the conversion */
5065 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005067 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5068 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5070 return -1;
5071 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005072 if (pusedDefaultChar && *pusedDefaultChar)
5073 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005074 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005075 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005076
5077mbcs_encode_error:
5078 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5079 Py_XDECREF(exc);
5080 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005081}
5082
5083PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 Py_ssize_t size,
5085 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005086{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005087 PyObject *repr = NULL;
5088 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005089
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005090#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005092 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005093 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005094 else
5095#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005096 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005097
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005098 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 Py_XDECREF(repr);
5100 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005101 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005102
5103#ifdef NEED_RETRY
5104 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 p += INT_MAX;
5106 size -= INT_MAX;
5107 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005108 }
5109#endif
5110
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005111 return repr;
5112}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005113
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005114PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5115{
5116 if (!PyUnicode_Check(unicode)) {
5117 PyErr_BadArgument();
5118 return NULL;
5119 }
5120 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 PyUnicode_GET_SIZE(unicode),
5122 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005123}
5124
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005125#undef NEED_RETRY
5126
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005127#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005128
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129/* --- Character Mapping Codec -------------------------------------------- */
5130
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 Py_ssize_t size,
5133 PyObject *mapping,
5134 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005136 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005137 Py_ssize_t startinpos;
5138 Py_ssize_t endinpos;
5139 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 PyUnicodeObject *v;
5142 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005144 PyObject *errorHandler = NULL;
5145 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005146 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005147 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005148
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 /* Default to Latin-1 */
5150 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152
5153 v = _PyUnicode_New(size);
5154 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005159 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005160 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 mapstring = PyUnicode_AS_UNICODE(mapping);
5162 maplen = PyUnicode_GET_SIZE(mapping);
5163 while (s < e) {
5164 unsigned char ch = *s;
5165 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 if (ch < maplen)
5168 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 if (x == 0xfffe) {
5171 /* undefined mapping */
5172 outpos = p-PyUnicode_AS_UNICODE(v);
5173 startinpos = s-starts;
5174 endinpos = startinpos+1;
5175 if (unicode_decode_call_errorhandler(
5176 errors, &errorHandler,
5177 "charmap", "character maps to <undefined>",
5178 &starts, &e, &startinpos, &endinpos, &exc, &s,
5179 &v, &outpos, &p)) {
5180 goto onError;
5181 }
5182 continue;
5183 }
5184 *p++ = x;
5185 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005186 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005187 }
5188 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 while (s < e) {
5190 unsigned char ch = *s;
5191 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005192
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5194 w = PyLong_FromLong((long)ch);
5195 if (w == NULL)
5196 goto onError;
5197 x = PyObject_GetItem(mapping, w);
5198 Py_DECREF(w);
5199 if (x == NULL) {
5200 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5201 /* No mapping found means: mapping is undefined. */
5202 PyErr_Clear();
5203 x = Py_None;
5204 Py_INCREF(x);
5205 } else
5206 goto onError;
5207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005208
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 /* Apply mapping */
5210 if (PyLong_Check(x)) {
5211 long value = PyLong_AS_LONG(x);
5212 if (value < 0 || value > 65535) {
5213 PyErr_SetString(PyExc_TypeError,
5214 "character mapping must be in range(65536)");
5215 Py_DECREF(x);
5216 goto onError;
5217 }
5218 *p++ = (Py_UNICODE)value;
5219 }
5220 else if (x == Py_None) {
5221 /* undefined mapping */
5222 outpos = p-PyUnicode_AS_UNICODE(v);
5223 startinpos = s-starts;
5224 endinpos = startinpos+1;
5225 if (unicode_decode_call_errorhandler(
5226 errors, &errorHandler,
5227 "charmap", "character maps to <undefined>",
5228 &starts, &e, &startinpos, &endinpos, &exc, &s,
5229 &v, &outpos, &p)) {
5230 Py_DECREF(x);
5231 goto onError;
5232 }
5233 Py_DECREF(x);
5234 continue;
5235 }
5236 else if (PyUnicode_Check(x)) {
5237 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 if (targetsize == 1)
5240 /* 1-1 mapping */
5241 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005242
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 else if (targetsize > 1) {
5244 /* 1-n mapping */
5245 if (targetsize > extrachars) {
5246 /* resize first */
5247 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5248 Py_ssize_t needed = (targetsize - extrachars) + \
5249 (targetsize << 2);
5250 extrachars += needed;
5251 /* XXX overflow detection missing */
5252 if (_PyUnicode_Resize(&v,
5253 PyUnicode_GET_SIZE(v) + needed) < 0) {
5254 Py_DECREF(x);
5255 goto onError;
5256 }
5257 p = PyUnicode_AS_UNICODE(v) + oldpos;
5258 }
5259 Py_UNICODE_COPY(p,
5260 PyUnicode_AS_UNICODE(x),
5261 targetsize);
5262 p += targetsize;
5263 extrachars -= targetsize;
5264 }
5265 /* 1-0 mapping: skip the character */
5266 }
5267 else {
5268 /* wrong return value */
5269 PyErr_SetString(PyExc_TypeError,
5270 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005271 Py_DECREF(x);
5272 goto onError;
5273 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 Py_DECREF(x);
5275 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 }
5278 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5280 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005281 Py_XDECREF(errorHandler);
5282 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005284
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005286 Py_XDECREF(errorHandler);
5287 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 Py_XDECREF(v);
5289 return NULL;
5290}
5291
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005292/* Charmap encoding: the lookup table */
5293
5294struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 PyObject_HEAD
5296 unsigned char level1[32];
5297 int count2, count3;
5298 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005299};
5300
5301static PyObject*
5302encoding_map_size(PyObject *obj, PyObject* args)
5303{
5304 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005305 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005307}
5308
5309static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005310 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 PyDoc_STR("Return the size (in bytes) of this object") },
5312 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005313};
5314
5315static void
5316encoding_map_dealloc(PyObject* o)
5317{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005318 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005319}
5320
5321static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005322 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 "EncodingMap", /*tp_name*/
5324 sizeof(struct encoding_map), /*tp_basicsize*/
5325 0, /*tp_itemsize*/
5326 /* methods */
5327 encoding_map_dealloc, /*tp_dealloc*/
5328 0, /*tp_print*/
5329 0, /*tp_getattr*/
5330 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005331 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 0, /*tp_repr*/
5333 0, /*tp_as_number*/
5334 0, /*tp_as_sequence*/
5335 0, /*tp_as_mapping*/
5336 0, /*tp_hash*/
5337 0, /*tp_call*/
5338 0, /*tp_str*/
5339 0, /*tp_getattro*/
5340 0, /*tp_setattro*/
5341 0, /*tp_as_buffer*/
5342 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5343 0, /*tp_doc*/
5344 0, /*tp_traverse*/
5345 0, /*tp_clear*/
5346 0, /*tp_richcompare*/
5347 0, /*tp_weaklistoffset*/
5348 0, /*tp_iter*/
5349 0, /*tp_iternext*/
5350 encoding_map_methods, /*tp_methods*/
5351 0, /*tp_members*/
5352 0, /*tp_getset*/
5353 0, /*tp_base*/
5354 0, /*tp_dict*/
5355 0, /*tp_descr_get*/
5356 0, /*tp_descr_set*/
5357 0, /*tp_dictoffset*/
5358 0, /*tp_init*/
5359 0, /*tp_alloc*/
5360 0, /*tp_new*/
5361 0, /*tp_free*/
5362 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005363};
5364
5365PyObject*
5366PyUnicode_BuildEncodingMap(PyObject* string)
5367{
5368 Py_UNICODE *decode;
5369 PyObject *result;
5370 struct encoding_map *mresult;
5371 int i;
5372 int need_dict = 0;
5373 unsigned char level1[32];
5374 unsigned char level2[512];
5375 unsigned char *mlevel1, *mlevel2, *mlevel3;
5376 int count2 = 0, count3 = 0;
5377
5378 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5379 PyErr_BadArgument();
5380 return NULL;
5381 }
5382 decode = PyUnicode_AS_UNICODE(string);
5383 memset(level1, 0xFF, sizeof level1);
5384 memset(level2, 0xFF, sizeof level2);
5385
5386 /* If there isn't a one-to-one mapping of NULL to \0,
5387 or if there are non-BMP characters, we need to use
5388 a mapping dictionary. */
5389 if (decode[0] != 0)
5390 need_dict = 1;
5391 for (i = 1; i < 256; i++) {
5392 int l1, l2;
5393 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005394#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005395 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005396#endif
5397 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005398 need_dict = 1;
5399 break;
5400 }
5401 if (decode[i] == 0xFFFE)
5402 /* unmapped character */
5403 continue;
5404 l1 = decode[i] >> 11;
5405 l2 = decode[i] >> 7;
5406 if (level1[l1] == 0xFF)
5407 level1[l1] = count2++;
5408 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005409 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005410 }
5411
5412 if (count2 >= 0xFF || count3 >= 0xFF)
5413 need_dict = 1;
5414
5415 if (need_dict) {
5416 PyObject *result = PyDict_New();
5417 PyObject *key, *value;
5418 if (!result)
5419 return NULL;
5420 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005421 key = PyLong_FromLong(decode[i]);
5422 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005423 if (!key || !value)
5424 goto failed1;
5425 if (PyDict_SetItem(result, key, value) == -1)
5426 goto failed1;
5427 Py_DECREF(key);
5428 Py_DECREF(value);
5429 }
5430 return result;
5431 failed1:
5432 Py_XDECREF(key);
5433 Py_XDECREF(value);
5434 Py_DECREF(result);
5435 return NULL;
5436 }
5437
5438 /* Create a three-level trie */
5439 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5440 16*count2 + 128*count3 - 1);
5441 if (!result)
5442 return PyErr_NoMemory();
5443 PyObject_Init(result, &EncodingMapType);
5444 mresult = (struct encoding_map*)result;
5445 mresult->count2 = count2;
5446 mresult->count3 = count3;
5447 mlevel1 = mresult->level1;
5448 mlevel2 = mresult->level23;
5449 mlevel3 = mresult->level23 + 16*count2;
5450 memcpy(mlevel1, level1, 32);
5451 memset(mlevel2, 0xFF, 16*count2);
5452 memset(mlevel3, 0, 128*count3);
5453 count3 = 0;
5454 for (i = 1; i < 256; i++) {
5455 int o1, o2, o3, i2, i3;
5456 if (decode[i] == 0xFFFE)
5457 /* unmapped character */
5458 continue;
5459 o1 = decode[i]>>11;
5460 o2 = (decode[i]>>7) & 0xF;
5461 i2 = 16*mlevel1[o1] + o2;
5462 if (mlevel2[i2] == 0xFF)
5463 mlevel2[i2] = count3++;
5464 o3 = decode[i] & 0x7F;
5465 i3 = 128*mlevel2[i2] + o3;
5466 mlevel3[i3] = i;
5467 }
5468 return result;
5469}
5470
5471static int
5472encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5473{
5474 struct encoding_map *map = (struct encoding_map*)mapping;
5475 int l1 = c>>11;
5476 int l2 = (c>>7) & 0xF;
5477 int l3 = c & 0x7F;
5478 int i;
5479
5480#ifdef Py_UNICODE_WIDE
5481 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005483 }
5484#endif
5485 if (c == 0)
5486 return 0;
5487 /* level 1*/
5488 i = map->level1[l1];
5489 if (i == 0xFF) {
5490 return -1;
5491 }
5492 /* level 2*/
5493 i = map->level23[16*i+l2];
5494 if (i == 0xFF) {
5495 return -1;
5496 }
5497 /* level 3 */
5498 i = map->level23[16*map->count2 + 128*i + l3];
5499 if (i == 0) {
5500 return -1;
5501 }
5502 return i;
5503}
5504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505/* Lookup the character ch in the mapping. If the character
5506 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005507 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509{
Christian Heimes217cfd12007-12-02 14:31:20 +00005510 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 PyObject *x;
5512
5513 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 x = PyObject_GetItem(mapping, w);
5516 Py_DECREF(w);
5517 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5519 /* No mapping found means: mapping is undefined. */
5520 PyErr_Clear();
5521 x = Py_None;
5522 Py_INCREF(x);
5523 return x;
5524 } else
5525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005527 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005529 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 long value = PyLong_AS_LONG(x);
5531 if (value < 0 || value > 255) {
5532 PyErr_SetString(PyExc_TypeError,
5533 "character mapping must be in range(256)");
5534 Py_DECREF(x);
5535 return NULL;
5536 }
5537 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005539 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 /* wrong return value */
5543 PyErr_Format(PyExc_TypeError,
5544 "character mapping must return integer, bytes or None, not %.400s",
5545 x->ob_type->tp_name);
5546 Py_DECREF(x);
5547 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 }
5549}
5550
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005551static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005552charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005553{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005554 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5555 /* exponentially overallocate to minimize reallocations */
5556 if (requiredsize < 2*outsize)
5557 requiredsize = 2*outsize;
5558 if (_PyBytes_Resize(outobj, requiredsize))
5559 return -1;
5560 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005561}
5562
/* Outcome of trying to encode a single character via the charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* character was written to the output */
    enc_FAILED,     /* character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005567 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005568 space is available. Return a new reference to the object that
5569 was put in the output buffer, or Py_None, if the mapping was undefined
5570 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005571 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005573charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005575{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005576 PyObject *rep;
5577 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005578 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005579
Christian Heimes90aa7642007-12-19 02:45:37 +00005580 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005581 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005583 if (res == -1)
5584 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 if (outsize<requiredsize)
5586 if (charmapencode_resize(outobj, outpos, requiredsize))
5587 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005588 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 outstart[(*outpos)++] = (char)res;
5590 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005591 }
5592
5593 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005596 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 Py_DECREF(rep);
5598 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005599 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 if (PyLong_Check(rep)) {
5601 Py_ssize_t requiredsize = *outpos+1;
5602 if (outsize<requiredsize)
5603 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5604 Py_DECREF(rep);
5605 return enc_EXCEPTION;
5606 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005607 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005609 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 else {
5611 const char *repchars = PyBytes_AS_STRING(rep);
5612 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5613 Py_ssize_t requiredsize = *outpos+repsize;
5614 if (outsize<requiredsize)
5615 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5616 Py_DECREF(rep);
5617 return enc_EXCEPTION;
5618 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005619 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 memcpy(outstart + *outpos, repchars, repsize);
5621 *outpos += repsize;
5622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005624 Py_DECREF(rep);
5625 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005626}
5627
5628/* handle an error in PyUnicode_EncodeCharmap
5629 Return 0 on success, -1 on error */
5630static
5631int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005632 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005634 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005635 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636{
5637 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005638 Py_ssize_t repsize;
5639 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640 Py_UNICODE *uni2;
5641 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005642 Py_ssize_t collstartpos = *inpos;
5643 Py_ssize_t collendpos = *inpos+1;
5644 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 char *encoding = "charmap";
5646 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005647 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005649 /* find all unencodable characters */
5650 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005651 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005652 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 int res = encoding_map_lookup(p[collendpos], mapping);
5654 if (res != -1)
5655 break;
5656 ++collendpos;
5657 continue;
5658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005659
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 rep = charmapencode_lookup(p[collendpos], mapping);
5661 if (rep==NULL)
5662 return -1;
5663 else if (rep!=Py_None) {
5664 Py_DECREF(rep);
5665 break;
5666 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005667 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005669 }
5670 /* cache callback name lookup
5671 * (if not done yet, i.e. it's the first error) */
5672 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 if ((errors==NULL) || (!strcmp(errors, "strict")))
5674 *known_errorHandler = 1;
5675 else if (!strcmp(errors, "replace"))
5676 *known_errorHandler = 2;
5677 else if (!strcmp(errors, "ignore"))
5678 *known_errorHandler = 3;
5679 else if (!strcmp(errors, "xmlcharrefreplace"))
5680 *known_errorHandler = 4;
5681 else
5682 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 }
5684 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005685 case 1: /* strict */
5686 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5687 return -1;
5688 case 2: /* replace */
5689 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 x = charmapencode_output('?', mapping, res, respos);
5691 if (x==enc_EXCEPTION) {
5692 return -1;
5693 }
5694 else if (x==enc_FAILED) {
5695 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5696 return -1;
5697 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005698 }
5699 /* fall through */
5700 case 3: /* ignore */
5701 *inpos = collendpos;
5702 break;
5703 case 4: /* xmlcharrefreplace */
5704 /* generate replacement (temporarily (mis)uses p) */
5705 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 char buffer[2+29+1+1];
5707 char *cp;
5708 sprintf(buffer, "&#%d;", (int)p[collpos]);
5709 for (cp = buffer; *cp; ++cp) {
5710 x = charmapencode_output(*cp, mapping, res, respos);
5711 if (x==enc_EXCEPTION)
5712 return -1;
5713 else if (x==enc_FAILED) {
5714 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5715 return -1;
5716 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005717 }
5718 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005719 *inpos = collendpos;
5720 break;
5721 default:
5722 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 encoding, reason, p, size, exceptionObject,
5724 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005725 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005727 if (PyBytes_Check(repunicode)) {
5728 /* Directly copy bytes result to output. */
5729 Py_ssize_t outsize = PyBytes_Size(*res);
5730 Py_ssize_t requiredsize;
5731 repsize = PyBytes_Size(repunicode);
5732 requiredsize = *respos + repsize;
5733 if (requiredsize > outsize)
5734 /* Make room for all additional bytes. */
5735 if (charmapencode_resize(res, respos, requiredsize)) {
5736 Py_DECREF(repunicode);
5737 return -1;
5738 }
5739 memcpy(PyBytes_AsString(*res) + *respos,
5740 PyBytes_AsString(repunicode), repsize);
5741 *respos += repsize;
5742 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005743 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005744 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005745 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005746 /* generate replacement */
5747 repsize = PyUnicode_GET_SIZE(repunicode);
5748 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 x = charmapencode_output(*uni2, mapping, res, respos);
5750 if (x==enc_EXCEPTION) {
5751 return -1;
5752 }
5753 else if (x==enc_FAILED) {
5754 Py_DECREF(repunicode);
5755 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5756 return -1;
5757 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005758 }
5759 *inpos = newpos;
5760 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 }
5762 return 0;
5763}
5764
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 Py_ssize_t size,
5767 PyObject *mapping,
5768 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 /* output object */
5771 PyObject *res = NULL;
5772 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005773 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005775 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 PyObject *errorHandler = NULL;
5777 PyObject *exc = NULL;
5778 /* the following variable is used for caching string comparisons
5779 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5780 * 3=ignore, 4=xmlcharrefreplace */
5781 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782
5783 /* Default to Latin-1 */
5784 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787 /* allocate enough for a simple encoding without
5788 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005789 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 if (res == NULL)
5791 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005792 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 /* try to encode it */
5797 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5798 if (x==enc_EXCEPTION) /* error */
5799 goto onError;
5800 if (x==enc_FAILED) { /* unencodable character */
5801 if (charmap_encoding_error(p, size, &inpos, mapping,
5802 &exc,
5803 &known_errorHandler, &errorHandler, errors,
5804 &res, &respos)) {
5805 goto onError;
5806 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005807 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 else
5809 /* done with this character => adjust input position */
5810 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005813 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005814 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005815 if (_PyBytes_Resize(&res, respos) < 0)
5816 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 Py_XDECREF(exc);
5819 Py_XDECREF(errorHandler);
5820 return res;
5821
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 Py_XDECREF(res);
5824 Py_XDECREF(exc);
5825 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 return NULL;
5827}
5828
5829PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831{
5832 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 PyErr_BadArgument();
5834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 }
5836 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 PyUnicode_GET_SIZE(unicode),
5838 mapping,
5839 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840}
5841
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005842/* create or adjust a UnicodeTranslateError */
5843static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 const Py_UNICODE *unicode, Py_ssize_t size,
5845 Py_ssize_t startpos, Py_ssize_t endpos,
5846 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005849 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 }
5852 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5854 goto onError;
5855 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5856 goto onError;
5857 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5858 goto onError;
5859 return;
5860 onError:
5861 Py_DECREF(*exceptionObject);
5862 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 }
5864}
5865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866/* raises a UnicodeTranslateError */
5867static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 const Py_UNICODE *unicode, Py_ssize_t size,
5869 Py_ssize_t startpos, Py_ssize_t endpos,
5870 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871{
5872 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876}
5877
5878/* error handling callback helper:
5879 build arguments, call the callback and check the arguments,
5880 put the result into newpos and return the replacement string, which
5881 has to be freed by the caller */
5882static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 PyObject **errorHandler,
5884 const char *reason,
5885 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5886 Py_ssize_t startpos, Py_ssize_t endpos,
5887 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005889 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005891 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005892 PyObject *restuple;
5893 PyObject *resunicode;
5894
5895 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899 }
5900
5901 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905
5906 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005911 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 Py_DECREF(restuple);
5913 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005914 }
5915 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 &resunicode, &i_newpos)) {
5917 Py_DECREF(restuple);
5918 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005920 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005922 else
5923 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005924 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5926 Py_DECREF(restuple);
5927 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005928 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929 Py_INCREF(resunicode);
5930 Py_DECREF(restuple);
5931 return resunicode;
5932}
5933
5934/* Lookup the character ch in the mapping and put the result in result,
5935 which must be decrefed by the caller.
5936 Return 0 on success, -1 on error */
5937static
5938int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5939{
Christian Heimes217cfd12007-12-02 14:31:20 +00005940 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941 PyObject *x;
5942
5943 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005945 x = PyObject_GetItem(mapping, w);
5946 Py_DECREF(w);
5947 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5949 /* No mapping found means: use 1:1 mapping. */
5950 PyErr_Clear();
5951 *result = NULL;
5952 return 0;
5953 } else
5954 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005955 }
5956 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 *result = x;
5958 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005960 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 long value = PyLong_AS_LONG(x);
5962 long max = PyUnicode_GetMax();
5963 if (value < 0 || value > max) {
5964 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005965 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005966 Py_DECREF(x);
5967 return -1;
5968 }
5969 *result = x;
5970 return 0;
5971 }
5972 else if (PyUnicode_Check(x)) {
5973 *result = x;
5974 return 0;
5975 }
5976 else {
5977 /* wrong return value */
5978 PyErr_SetString(PyExc_TypeError,
5979 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005980 Py_DECREF(x);
5981 return -1;
5982 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983}
5984/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 if not reallocate and adjust various state variables.
5986 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987static
Walter Dörwald4894c302003-10-24 14:25:28 +00005988int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005990{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005991 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005992 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 /* remember old output position */
5994 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5995 /* exponentially overallocate to minimize reallocations */
5996 if (requiredsize < 2 * oldsize)
5997 requiredsize = 2 * oldsize;
5998 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5999 return -1;
6000 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006001 }
6002 return 0;
6003}
6004/* lookup the character, put the result in the output string and adjust
6005 various state variables. Return a new reference to the object that
6006 was put in the output buffer in *result, or Py_None, if the mapping was
6007 undefined (in which case no character was written).
6008 The called must decref result.
6009 Return 0 on success, -1 on error. */
6010static
Walter Dörwald4894c302003-10-24 14:25:28 +00006011int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6013 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014{
Walter Dörwald4894c302003-10-24 14:25:28 +00006015 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 /* not found => default to 1:1 mapping */
6019 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 }
6021 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006023 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 /* no overflow check, because we know that the space is enough */
6025 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 }
6027 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6029 if (repsize==1) {
6030 /* no overflow check, because we know that the space is enough */
6031 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6032 }
6033 else if (repsize!=0) {
6034 /* more than one character */
6035 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6036 (insize - (curinp-startinp)) +
6037 repsize - 1;
6038 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6039 return -1;
6040 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6041 *outp += repsize;
6042 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 }
6044 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 return 0;
6047}
6048
6049PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 Py_ssize_t size,
6051 PyObject *mapping,
6052 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 /* output object */
6055 PyObject *res = NULL;
6056 /* pointers to the beginning and end+1 of input */
6057 const Py_UNICODE *startp = p;
6058 const Py_UNICODE *endp = p + size;
6059 /* pointer into the output */
6060 Py_UNICODE *str;
6061 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006062 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006063 char *reason = "character maps to <undefined>";
6064 PyObject *errorHandler = NULL;
6065 PyObject *exc = NULL;
6066 /* the following variable is used for caching string comparisons
6067 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6068 * 3=ignore, 4=xmlcharrefreplace */
6069 int known_errorHandler = -1;
6070
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 PyErr_BadArgument();
6073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075
6076 /* allocate enough for a simple 1:1 translation without
6077 replacements, if we need more, we'll resize */
6078 res = PyUnicode_FromUnicode(NULL, size);
6079 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 /* try to encode it */
6087 PyObject *x = NULL;
6088 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6089 Py_XDECREF(x);
6090 goto onError;
6091 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006092 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 if (x!=Py_None) /* it worked => adjust input pointer */
6094 ++p;
6095 else { /* untranslatable character */
6096 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6097 Py_ssize_t repsize;
6098 Py_ssize_t newpos;
6099 Py_UNICODE *uni2;
6100 /* startpos for collecting untranslatable chars */
6101 const Py_UNICODE *collstart = p;
6102 const Py_UNICODE *collend = p+1;
6103 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 /* find all untranslatable characters */
6106 while (collend < endp) {
6107 if (charmaptranslate_lookup(*collend, mapping, &x))
6108 goto onError;
6109 Py_XDECREF(x);
6110 if (x!=Py_None)
6111 break;
6112 ++collend;
6113 }
6114 /* cache callback name lookup
6115 * (if not done yet, i.e. it's the first error) */
6116 if (known_errorHandler==-1) {
6117 if ((errors==NULL) || (!strcmp(errors, "strict")))
6118 known_errorHandler = 1;
6119 else if (!strcmp(errors, "replace"))
6120 known_errorHandler = 2;
6121 else if (!strcmp(errors, "ignore"))
6122 known_errorHandler = 3;
6123 else if (!strcmp(errors, "xmlcharrefreplace"))
6124 known_errorHandler = 4;
6125 else
6126 known_errorHandler = 0;
6127 }
6128 switch (known_errorHandler) {
6129 case 1: /* strict */
6130 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006131 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 case 2: /* replace */
6133 /* No need to check for space, this is a 1:1 replacement */
6134 for (coll = collstart; coll<collend; ++coll)
6135 *str++ = '?';
6136 /* fall through */
6137 case 3: /* ignore */
6138 p = collend;
6139 break;
6140 case 4: /* xmlcharrefreplace */
6141 /* generate replacement (temporarily (mis)uses p) */
6142 for (p = collstart; p < collend; ++p) {
6143 char buffer[2+29+1+1];
6144 char *cp;
6145 sprintf(buffer, "&#%d;", (int)*p);
6146 if (charmaptranslate_makespace(&res, &str,
6147 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6148 goto onError;
6149 for (cp = buffer; *cp; ++cp)
6150 *str++ = *cp;
6151 }
6152 p = collend;
6153 break;
6154 default:
6155 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6156 reason, startp, size, &exc,
6157 collstart-startp, collend-startp, &newpos);
6158 if (repunicode == NULL)
6159 goto onError;
6160 /* generate replacement */
6161 repsize = PyUnicode_GET_SIZE(repunicode);
6162 if (charmaptranslate_makespace(&res, &str,
6163 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6164 Py_DECREF(repunicode);
6165 goto onError;
6166 }
6167 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6168 *str++ = *uni2;
6169 p = startp + newpos;
6170 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006171 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006172 }
6173 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174 /* Resize if we allocated to much */
6175 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006176 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 if (PyUnicode_Resize(&res, respos) < 0)
6178 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 }
6180 Py_XDECREF(exc);
6181 Py_XDECREF(errorHandler);
6182 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 Py_XDECREF(res);
6186 Py_XDECREF(exc);
6187 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 return NULL;
6189}
6190
6191PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 PyObject *mapping,
6193 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194{
6195 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006196
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 str = PyUnicode_FromObject(str);
6198 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 PyUnicode_GET_SIZE(str),
6202 mapping,
6203 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 Py_DECREF(str);
6205 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006206
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 Py_XDECREF(str);
6209 return NULL;
6210}
Tim Petersced69f82003-09-16 20:30:58 +00006211
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006212PyObject *
6213PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6214 Py_ssize_t length)
6215{
6216 PyObject *result;
6217 Py_UNICODE *p; /* write pointer into result */
6218 Py_ssize_t i;
6219 /* Copy to a new string */
6220 result = (PyObject *)_PyUnicode_New(length);
6221 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6222 if (result == NULL)
6223 return result;
6224 p = PyUnicode_AS_UNICODE(result);
6225 /* Iterate over code points */
6226 for (i = 0; i < length; i++) {
6227 Py_UNICODE ch =s[i];
6228 if (ch > 127) {
6229 int decimal = Py_UNICODE_TODECIMAL(ch);
6230 if (decimal >= 0)
6231 p[i] = '0' + decimal;
6232 }
6233 }
6234 return result;
6235}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006236/* --- Decimal Encoder ---------------------------------------------------- */
6237
6238int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 Py_ssize_t length,
6240 char *output,
6241 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006242{
6243 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006244 PyObject *errorHandler = NULL;
6245 PyObject *exc = NULL;
6246 const char *encoding = "decimal";
6247 const char *reason = "invalid decimal Unicode string";
6248 /* the following variable is used for caching string comparisons
6249 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6250 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006251
6252 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 PyErr_BadArgument();
6254 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006255 }
6256
6257 p = s;
6258 end = s + length;
6259 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 register Py_UNICODE ch = *p;
6261 int decimal;
6262 PyObject *repunicode;
6263 Py_ssize_t repsize;
6264 Py_ssize_t newpos;
6265 Py_UNICODE *uni2;
6266 Py_UNICODE *collstart;
6267 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006268
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006270 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 ++p;
6272 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006273 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 decimal = Py_UNICODE_TODECIMAL(ch);
6275 if (decimal >= 0) {
6276 *output++ = '0' + decimal;
6277 ++p;
6278 continue;
6279 }
6280 if (0 < ch && ch < 256) {
6281 *output++ = (char)ch;
6282 ++p;
6283 continue;
6284 }
6285 /* All other characters are considered unencodable */
6286 collstart = p;
6287 collend = p+1;
6288 while (collend < end) {
6289 if ((0 < *collend && *collend < 256) ||
6290 !Py_UNICODE_ISSPACE(*collend) ||
6291 Py_UNICODE_TODECIMAL(*collend))
6292 break;
6293 }
6294 /* cache callback name lookup
6295 * (if not done yet, i.e. it's the first error) */
6296 if (known_errorHandler==-1) {
6297 if ((errors==NULL) || (!strcmp(errors, "strict")))
6298 known_errorHandler = 1;
6299 else if (!strcmp(errors, "replace"))
6300 known_errorHandler = 2;
6301 else if (!strcmp(errors, "ignore"))
6302 known_errorHandler = 3;
6303 else if (!strcmp(errors, "xmlcharrefreplace"))
6304 known_errorHandler = 4;
6305 else
6306 known_errorHandler = 0;
6307 }
6308 switch (known_errorHandler) {
6309 case 1: /* strict */
6310 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6311 goto onError;
6312 case 2: /* replace */
6313 for (p = collstart; p < collend; ++p)
6314 *output++ = '?';
6315 /* fall through */
6316 case 3: /* ignore */
6317 p = collend;
6318 break;
6319 case 4: /* xmlcharrefreplace */
6320 /* generate replacement (temporarily (mis)uses p) */
6321 for (p = collstart; p < collend; ++p)
6322 output += sprintf(output, "&#%d;", (int)*p);
6323 p = collend;
6324 break;
6325 default:
6326 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6327 encoding, reason, s, length, &exc,
6328 collstart-s, collend-s, &newpos);
6329 if (repunicode == NULL)
6330 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006331 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006332 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006333 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6334 Py_DECREF(repunicode);
6335 goto onError;
6336 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 /* generate replacement */
6338 repsize = PyUnicode_GET_SIZE(repunicode);
6339 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6340 Py_UNICODE ch = *uni2;
6341 if (Py_UNICODE_ISSPACE(ch))
6342 *output++ = ' ';
6343 else {
6344 decimal = Py_UNICODE_TODECIMAL(ch);
6345 if (decimal >= 0)
6346 *output++ = '0' + decimal;
6347 else if (0 < ch && ch < 256)
6348 *output++ = (char)ch;
6349 else {
6350 Py_DECREF(repunicode);
6351 raise_encode_exception(&exc, encoding,
6352 s, length, collstart-s, collend-s, reason);
6353 goto onError;
6354 }
6355 }
6356 }
6357 p = s + newpos;
6358 Py_DECREF(repunicode);
6359 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006360 }
6361 /* 0-terminate the output string */
6362 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363 Py_XDECREF(exc);
6364 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006365 return 0;
6366
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006368 Py_XDECREF(exc);
6369 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006370 return -1;
6371}
6372
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373/* --- Helpers ------------------------------------------------------------ */
6374
Eric Smith8c663262007-08-25 02:26:07 +00006375#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006376#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006377
Thomas Wouters477c8d52006-05-27 19:21:47 +00006378#include "stringlib/count.h"
6379#include "stringlib/find.h"
6380#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006381#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006382
Eric Smith5807c412008-05-11 21:00:57 +00006383#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006384#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006385#include "stringlib/localeutil.h"
6386
/* helper macro to fixup start/end slice values.

   start and end must be modifiable lvalues; they are updated in place:
   an end beyond len is clamped to len, negative values are taken relative
   to len and clamped to 0 if they are still negative afterwards.  start is
   not clamped against len.  Every argument may be evaluated more than
   once, so none of them should have side effects.

   The body is wrapped in do { } while (0) so that the macro behaves as one
   statement (for instance as the body of an if that has an else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006401
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006403 PyObject *substr,
6404 Py_ssize_t start,
6405 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006407 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006408 PyUnicodeObject* str_obj;
6409 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006410
Thomas Wouters477c8d52006-05-27 19:21:47 +00006411 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6412 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006414 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6415 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 Py_DECREF(str_obj);
6417 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 }
Tim Petersced69f82003-09-16 20:30:58 +00006419
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006420 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006421 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006422 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6423 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006424 );
6425
6426 Py_DECREF(sub_obj);
6427 Py_DECREF(str_obj);
6428
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 return result;
6430}
6431
Martin v. Löwis18e16552006-02-15 17:27:45 +00006432Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006433 PyObject *sub,
6434 Py_ssize_t start,
6435 Py_ssize_t end,
6436 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006438 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006439
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006441 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006443 sub = PyUnicode_FromObject(sub);
6444 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 Py_DECREF(str);
6446 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 }
Tim Petersced69f82003-09-16 20:30:58 +00006448
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449 if (direction > 0)
6450 result = stringlib_find_slice(
6451 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6452 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6453 start, end
6454 );
6455 else
6456 result = stringlib_rfind_slice(
6457 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6458 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6459 start, end
6460 );
6461
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006463 Py_DECREF(sub);
6464
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 return result;
6466}
6467
Tim Petersced69f82003-09-16 20:30:58 +00006468static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 PyUnicodeObject *substring,
6471 Py_ssize_t start,
6472 Py_ssize_t end,
6473 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 if (substring->length == 0)
6476 return 1;
6477
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006478 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 end -= substring->length;
6480 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482
6483 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 if (Py_UNICODE_MATCH(self, end, substring))
6485 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 } else {
6487 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 }
6490
6491 return 0;
6492}
6493
Martin v. Löwis18e16552006-02-15 17:27:45 +00006494Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 PyObject *substr,
6496 Py_ssize_t start,
6497 Py_ssize_t end,
6498 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006500 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006501
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 str = PyUnicode_FromObject(str);
6503 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 substr = PyUnicode_FromObject(substr);
6506 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 Py_DECREF(str);
6508 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 }
Tim Petersced69f82003-09-16 20:30:58 +00006510
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 (PyUnicodeObject *)substr,
6513 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 Py_DECREF(str);
6515 Py_DECREF(substr);
6516 return result;
6517}
6518
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519/* Apply fixfct filter to the Unicode object self and return a
6520 reference to the modified object */
6521
Tim Petersced69f82003-09-16 20:30:58 +00006522static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006524 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525{
6526
6527 PyUnicodeObject *u;
6528
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006529 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006532
6533 Py_UNICODE_COPY(u->str, self->str, self->length);
6534
Tim Peters7a29bd52001-09-12 03:03:31 +00006535 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 /* fixfct should return TRUE if it modified the buffer. If
6537 FALSE, return a reference to the original buffer instead
6538 (to save space, not time) */
6539 Py_INCREF(self);
6540 Py_DECREF(u);
6541 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 }
6543 return (PyObject*) u;
6544}
6545
Tim Petersced69f82003-09-16 20:30:58 +00006546static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547int fixupper(PyUnicodeObject *self)
6548{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006549 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 Py_UNICODE *s = self->str;
6551 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006552
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006555
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 ch = Py_UNICODE_TOUPPER(*s);
6557 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 *s = ch;
6560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 s++;
6562 }
6563
6564 return status;
6565}
6566
Tim Petersced69f82003-09-16 20:30:58 +00006567static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568int fixlower(PyUnicodeObject *self)
6569{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006570 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 Py_UNICODE *s = self->str;
6572 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006573
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006576
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 ch = Py_UNICODE_TOLOWER(*s);
6578 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006580 *s = ch;
6581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 s++;
6583 }
6584
6585 return status;
6586}
6587
Tim Petersced69f82003-09-16 20:30:58 +00006588static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589int fixswapcase(PyUnicodeObject *self)
6590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006591 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 Py_UNICODE *s = self->str;
6593 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006594
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 while (len-- > 0) {
6596 if (Py_UNICODE_ISUPPER(*s)) {
6597 *s = Py_UNICODE_TOLOWER(*s);
6598 status = 1;
6599 } else if (Py_UNICODE_ISLOWER(*s)) {
6600 *s = Py_UNICODE_TOUPPER(*s);
6601 status = 1;
6602 }
6603 s++;
6604 }
6605
6606 return status;
6607}
6608
Tim Petersced69f82003-09-16 20:30:58 +00006609static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610int fixcapitalize(PyUnicodeObject *self)
6611{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006612 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006613 Py_UNICODE *s = self->str;
6614 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006615
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006616 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006618 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 *s = Py_UNICODE_TOUPPER(*s);
6620 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006622 s++;
6623 while (--len > 0) {
6624 if (Py_UNICODE_ISUPPER(*s)) {
6625 *s = Py_UNICODE_TOLOWER(*s);
6626 status = 1;
6627 }
6628 s++;
6629 }
6630 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631}
6632
6633static
6634int fixtitle(PyUnicodeObject *self)
6635{
6636 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6637 register Py_UNICODE *e;
6638 int previous_is_cased;
6639
6640 /* Shortcut for single character strings */
6641 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6643 if (*p != ch) {
6644 *p = ch;
6645 return 1;
6646 }
6647 else
6648 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 }
Tim Petersced69f82003-09-16 20:30:58 +00006650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 e = p + PyUnicode_GET_SIZE(self);
6652 previous_is_cased = 0;
6653 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006655
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 if (previous_is_cased)
6657 *p = Py_UNICODE_TOLOWER(ch);
6658 else
6659 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006660
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 if (Py_UNICODE_ISLOWER(ch) ||
6662 Py_UNICODE_ISUPPER(ch) ||
6663 Py_UNICODE_ISTITLE(ch))
6664 previous_is_cased = 1;
6665 else
6666 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 }
6668 return 1;
6669}
6670
Tim Peters8ce9f162004-08-27 01:49:32 +00006671PyObject *
6672PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673{
Skip Montanaro6543b452004-09-16 03:28:13 +00006674 const Py_UNICODE blank = ' ';
6675 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006676 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006677 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006678 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6679 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006680 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6681 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006682 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006683 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
Tim Peters05eba1f2004-08-27 21:32:02 +00006685 fseq = PySequence_Fast(seq, "");
6686 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006687 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006688 }
6689
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006690 /* NOTE: the following code can't call back into Python code,
6691 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006692 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006693
Tim Peters05eba1f2004-08-27 21:32:02 +00006694 seqlen = PySequence_Fast_GET_SIZE(fseq);
6695 /* If empty sequence, return u"". */
6696 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006697 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6698 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006699 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006700 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006701 /* If singleton sequence with an exact Unicode, return that. */
6702 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 item = items[0];
6704 if (PyUnicode_CheckExact(item)) {
6705 Py_INCREF(item);
6706 res = (PyUnicodeObject *)item;
6707 goto Done;
6708 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006709 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006710 else {
6711 /* Set up sep and seplen */
6712 if (separator == NULL) {
6713 sep = &blank;
6714 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006715 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006716 else {
6717 if (!PyUnicode_Check(separator)) {
6718 PyErr_Format(PyExc_TypeError,
6719 "separator: expected str instance,"
6720 " %.80s found",
6721 Py_TYPE(separator)->tp_name);
6722 goto onError;
6723 }
6724 sep = PyUnicode_AS_UNICODE(separator);
6725 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006726 }
6727 }
6728
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006729 /* There are at least two things to join, or else we have a subclass
6730 * of str in the sequence.
6731 * Do a pre-pass to figure out the total amount of space we'll
6732 * need (sz), and see whether all argument are strings.
6733 */
6734 sz = 0;
6735 for (i = 0; i < seqlen; i++) {
6736 const Py_ssize_t old_sz = sz;
6737 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 if (!PyUnicode_Check(item)) {
6739 PyErr_Format(PyExc_TypeError,
6740 "sequence item %zd: expected str instance,"
6741 " %.80s found",
6742 i, Py_TYPE(item)->tp_name);
6743 goto onError;
6744 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006745 sz += PyUnicode_GET_SIZE(item);
6746 if (i != 0)
6747 sz += seplen;
6748 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6749 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006751 goto onError;
6752 }
6753 }
Tim Petersced69f82003-09-16 20:30:58 +00006754
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006755 res = _PyUnicode_New(sz);
6756 if (res == NULL)
6757 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006758
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006759 /* Catenate everything. */
6760 res_p = PyUnicode_AS_UNICODE(res);
6761 for (i = 0; i < seqlen; ++i) {
6762 Py_ssize_t itemlen;
6763 item = items[i];
6764 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 /* Copy item, and maybe the separator. */
6766 if (i) {
6767 Py_UNICODE_COPY(res_p, sep, seplen);
6768 res_p += seplen;
6769 }
6770 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6771 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006772 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006773
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006775 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 return (PyObject *)res;
6777
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006779 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006780 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 return NULL;
6782}
6783
Tim Petersced69f82003-09-16 20:30:58 +00006784static
6785PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 Py_ssize_t left,
6787 Py_ssize_t right,
6788 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789{
6790 PyUnicodeObject *u;
6791
6792 if (left < 0)
6793 left = 0;
6794 if (right < 0)
6795 right = 0;
6796
Tim Peters7a29bd52001-09-12 03:03:31 +00006797 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 Py_INCREF(self);
6799 return self;
6800 }
6801
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006802 if (left > PY_SSIZE_T_MAX - self->length ||
6803 right > PY_SSIZE_T_MAX - (left + self->length)) {
6804 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6805 return NULL;
6806 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 u = _PyUnicode_New(left + self->length + right);
6808 if (u) {
6809 if (left)
6810 Py_UNICODE_FILL(u->str, fill, left);
6811 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6812 if (right)
6813 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6814 }
6815
6816 return u;
6817}
6818
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006819PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
6823 string = PyUnicode_FromObject(string);
6824 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006827 list = stringlib_splitlines(
6828 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6829 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
6831 Py_DECREF(string);
6832 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833}
6834
Tim Petersced69f82003-09-16 20:30:58 +00006835static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 PyUnicodeObject *substring,
6838 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006841 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006844 return stringlib_split_whitespace(
6845 (PyObject*) self, self->str, self->length, maxcount
6846 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006848 return stringlib_split(
6849 (PyObject*) self, self->str, self->length,
6850 substring->str, substring->length,
6851 maxcount
6852 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853}
6854
Tim Petersced69f82003-09-16 20:30:58 +00006855static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006856PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 PyUnicodeObject *substring,
6858 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006859{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006860 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006861 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006862
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006863 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006864 return stringlib_rsplit_whitespace(
6865 (PyObject*) self, self->str, self->length, maxcount
6866 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006867
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006868 return stringlib_rsplit(
6869 (PyObject*) self, self->str, self->length,
6870 substring->str, substring->length,
6871 maxcount
6872 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006873}
6874
6875static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 PyUnicodeObject *str1,
6878 PyUnicodeObject *str2,
6879 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880{
6881 PyUnicodeObject *u;
6882
6883 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006885 else if (maxcount == 0 || self->length == 0)
6886 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887
Thomas Wouters477c8d52006-05-27 19:21:47 +00006888 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006889 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006890 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006891 if (str1->length == 0)
6892 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006893 if (str1->length == 1) {
6894 /* replace characters */
6895 Py_UNICODE u1, u2;
6896 if (!findchar(self->str, self->length, str1->str[0]))
6897 goto nothing;
6898 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6899 if (!u)
6900 return NULL;
6901 Py_UNICODE_COPY(u->str, self->str, self->length);
6902 u1 = str1->str[0];
6903 u2 = str2->str[0];
6904 for (i = 0; i < u->length; i++)
6905 if (u->str[i] == u1) {
6906 if (--maxcount < 0)
6907 break;
6908 u->str[i] = u2;
6909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006911 i = stringlib_find(
6912 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006914 if (i < 0)
6915 goto nothing;
6916 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6917 if (!u)
6918 return NULL;
6919 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006920
6921 /* change everything in-place, starting with this one */
6922 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6923 i += str1->length;
6924
6925 while ( --maxcount > 0) {
6926 i = stringlib_find(self->str+i, self->length-i,
6927 str1->str, str1->length,
6928 i);
6929 if (i == -1)
6930 break;
6931 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6932 i += str1->length;
6933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006936
Brett Cannonb94767f2011-02-22 20:15:44 +00006937 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006938 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 Py_UNICODE *p;
6940
6941 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006942 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6943 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006944 if (n == 0)
6945 goto nothing;
6946 /* new_size = self->length + n * (str2->length - str1->length)); */
6947 delta = (str2->length - str1->length);
6948 if (delta == 0) {
6949 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006951 product = n * (str2->length - str1->length);
6952 if ((product / (str2->length - str1->length)) != n) {
6953 PyErr_SetString(PyExc_OverflowError,
6954 "replace string is too long");
6955 return NULL;
6956 }
6957 new_size = self->length + product;
6958 if (new_size < 0) {
6959 PyErr_SetString(PyExc_OverflowError,
6960 "replace string is too long");
6961 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 }
6963 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006964 u = _PyUnicode_New(new_size);
6965 if (!u)
6966 return NULL;
6967 i = 0;
6968 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006969 if (str1->length > 0) {
6970 while (n-- > 0) {
6971 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006972 j = stringlib_find(self->str+i, self->length-i,
6973 str1->str, str1->length,
6974 i);
6975 if (j == -1)
6976 break;
6977 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006978 /* copy unchanged part [i:j] */
6979 Py_UNICODE_COPY(p, self->str+i, j-i);
6980 p += j - i;
6981 }
6982 /* copy substitution string */
6983 if (str2->length > 0) {
6984 Py_UNICODE_COPY(p, str2->str, str2->length);
6985 p += str2->length;
6986 }
6987 i = j + str1->length;
6988 }
6989 if (i < self->length)
6990 /* copy tail [i:] */
6991 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6992 } else {
6993 /* interleave */
6994 while (n > 0) {
6995 Py_UNICODE_COPY(p, str2->str, str2->length);
6996 p += str2->length;
6997 if (--n <= 0)
6998 break;
6999 *p++ = self->str[i++];
7000 }
7001 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007005
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007007 /* nothing to replace; return original string (when possible) */
7008 if (PyUnicode_CheckExact(self)) {
7009 Py_INCREF(self);
7010 return (PyObject *) self;
7011 }
7012 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007013}
7014
7015/* --- Unicode Object Methods --------------------------------------------- */
7016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007017PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019\n\
7020Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007021characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022
7023static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007024unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 return fixup(self, fixtitle);
7027}
7028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031\n\
7032Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007033have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034
7035static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007036unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 return fixup(self, fixcapitalize);
7039}
7040
#if 0
/* Disabled: this whole section is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word in the resulting list in
   place, then join the words again with single spaces.  Returns NULL if
   any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
7078
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007079/* Argument converter. Coerces to a single unicode character */
7080
7081static int
7082convert_uc(PyObject *obj, void *addr)
7083{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007084 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7085 PyObject *uniobj;
7086 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007087
Benjamin Peterson14339b62009-01-31 16:36:08 +00007088 uniobj = PyUnicode_FromObject(obj);
7089 if (uniobj == NULL) {
7090 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007092 return 0;
7093 }
7094 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7095 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007097 Py_DECREF(uniobj);
7098 return 0;
7099 }
7100 unistr = PyUnicode_AS_UNICODE(uniobj);
7101 *fillcharloc = unistr[0];
7102 Py_DECREF(uniobj);
7103 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007104}
7105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007106PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007109Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007110done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111
7112static PyObject *
7113unicode_center(PyUnicodeObject *self, PyObject *args)
7114{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007115 Py_ssize_t marg, left;
7116 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007117 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118
Thomas Woutersde017742006-02-16 19:34:37 +00007119 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120 return NULL;
7121
Tim Peters7a29bd52001-09-12 03:03:31 +00007122 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123 Py_INCREF(self);
7124 return (PyObject*) self;
7125 }
7126
7127 marg = width - self->length;
7128 left = marg / 2 + (marg & width & 1);
7129
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007130 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131}
7132
Marc-André Lemburge5034372000-08-08 08:04:29 +00007133#if 0
7134
7135/* This code should go into some future Unicode collation support
7136 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007137 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007138
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007139/* speedy UTF-16 code point order comparison */
7140/* gleaned from: */
7141/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7142
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007143static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007144{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007145 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007146 0, 0, 0, 0, 0, 0, 0, 0,
7147 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007148 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007149};
7150
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151static int
7152unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7153{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007154 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007155
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156 Py_UNICODE *s1 = str1->str;
7157 Py_UNICODE *s2 = str2->str;
7158
7159 len1 = str1->length;
7160 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007161
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007163 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007164
7165 c1 = *s1++;
7166 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007167
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 if (c1 > (1<<11) * 26)
7169 c1 += utf16Fixup[c1>>11];
7170 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007171 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007172 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007173
7174 if (c1 != c2)
7175 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007176
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007177 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 }
7179
7180 return (len1 < len2) ? -1 : (len1 != len2);
7181}
7182
Marc-André Lemburge5034372000-08-08 08:04:29 +00007183#else
7184
7185static int
7186unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7187{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007188 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007189
7190 Py_UNICODE *s1 = str1->str;
7191 Py_UNICODE *s2 = str2->str;
7192
7193 len1 = str1->length;
7194 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007195
Marc-André Lemburge5034372000-08-08 08:04:29 +00007196 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007197 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007198
Fredrik Lundh45714e92001-06-26 16:39:36 +00007199 c1 = *s1++;
7200 c2 = *s2++;
7201
7202 if (c1 != c2)
7203 return (c1 < c2) ? -1 : 1;
7204
Marc-André Lemburge5034372000-08-08 08:04:29 +00007205 len1--; len2--;
7206 }
7207
7208 return (len1 < len2) ? -1 : (len1 != len2);
7209}
7210
7211#endif
7212
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007216 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7217 return unicode_compare((PyUnicodeObject *)left,
7218 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007219 PyErr_Format(PyExc_TypeError,
7220 "Can't compare %.100s and %.100s",
7221 left->ob_type->tp_name,
7222 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223 return -1;
7224}
7225
Martin v. Löwis5b222132007-06-10 09:51:05 +00007226int
7227PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7228{
7229 int i;
7230 Py_UNICODE *id;
7231 assert(PyUnicode_Check(uni));
7232 id = PyUnicode_AS_UNICODE(uni);
7233 /* Compare Unicode string and source character set string */
7234 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 if (id[i] != str[i])
7236 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007237 /* This check keeps Python strings that end in '\0' from comparing equal
7238 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007239 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007241 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007243 return 0;
7244}
7245
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007246
Benjamin Peterson29060642009-01-31 22:14:21 +00007247#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007248 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007249
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007250PyObject *PyUnicode_RichCompare(PyObject *left,
7251 PyObject *right,
7252 int op)
7253{
7254 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007255
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007256 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7257 PyObject *v;
7258 if (((PyUnicodeObject *) left)->length !=
7259 ((PyUnicodeObject *) right)->length) {
7260 if (op == Py_EQ) {
7261 Py_INCREF(Py_False);
7262 return Py_False;
7263 }
7264 if (op == Py_NE) {
7265 Py_INCREF(Py_True);
7266 return Py_True;
7267 }
7268 }
7269 if (left == right)
7270 result = 0;
7271 else
7272 result = unicode_compare((PyUnicodeObject *)left,
7273 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007274
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007275 /* Convert the return value to a Boolean */
7276 switch (op) {
7277 case Py_EQ:
7278 v = TEST_COND(result == 0);
7279 break;
7280 case Py_NE:
7281 v = TEST_COND(result != 0);
7282 break;
7283 case Py_LE:
7284 v = TEST_COND(result <= 0);
7285 break;
7286 case Py_GE:
7287 v = TEST_COND(result >= 0);
7288 break;
7289 case Py_LT:
7290 v = TEST_COND(result == -1);
7291 break;
7292 case Py_GT:
7293 v = TEST_COND(result == 1);
7294 break;
7295 default:
7296 PyErr_BadArgument();
7297 return NULL;
7298 }
7299 Py_INCREF(v);
7300 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007302
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007303 Py_INCREF(Py_NotImplemented);
7304 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007305}
7306
Guido van Rossum403d68b2000-03-13 15:55:09 +00007307int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007309{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007310 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007311 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007312
7313 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007314 sub = PyUnicode_FromObject(element);
7315 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 PyErr_Format(PyExc_TypeError,
7317 "'in <string>' requires string as left operand, not %s",
7318 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007319 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007320 }
7321
Thomas Wouters477c8d52006-05-27 19:21:47 +00007322 str = PyUnicode_FromObject(container);
7323 if (!str) {
7324 Py_DECREF(sub);
7325 return -1;
7326 }
7327
7328 result = stringlib_contains_obj(str, sub);
7329
7330 Py_DECREF(str);
7331 Py_DECREF(sub);
7332
Guido van Rossum403d68b2000-03-13 15:55:09 +00007333 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007334}
7335
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336/* Concat to string or Unicode object giving a new Unicode object. */
7337
7338PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340{
7341 PyUnicodeObject *u = NULL, *v = NULL, *w;
7342
7343 /* Coerce the two arguments */
7344 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7345 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7348 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350
7351 /* Shortcuts */
7352 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 Py_DECREF(v);
7354 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 }
7356 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 Py_DECREF(u);
7358 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 }
7360
7361 /* Concat the two Unicode strings */
7362 w = _PyUnicode_New(u->length + v->length);
7363 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007364 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 Py_UNICODE_COPY(w->str, u->str, u->length);
7366 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7367
7368 Py_DECREF(u);
7369 Py_DECREF(v);
7370 return (PyObject *)w;
7371
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373 Py_XDECREF(u);
7374 Py_XDECREF(v);
7375 return NULL;
7376}
7377
Walter Dörwald1ab83302007-05-18 17:15:44 +00007378void
7379PyUnicode_Append(PyObject **pleft, PyObject *right)
7380{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007381 PyObject *new;
7382 if (*pleft == NULL)
7383 return;
7384 if (right == NULL || !PyUnicode_Check(*pleft)) {
7385 Py_DECREF(*pleft);
7386 *pleft = NULL;
7387 return;
7388 }
7389 new = PyUnicode_Concat(*pleft, right);
7390 Py_DECREF(*pleft);
7391 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007392}
7393
7394void
7395PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7396{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007397 PyUnicode_Append(pleft, right);
7398 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007399}
7400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007401PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007404Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007405string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007406interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
7408static PyObject *
7409unicode_count(PyUnicodeObject *self, PyObject *args)
7410{
7411 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007412 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007413 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414 PyObject *result;
7415
Guido van Rossumb8872e62000-05-09 14:14:27 +00007416 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 return NULL;
7419
7420 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007421 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007424
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007425 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007426 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007427 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007428 substring->str, substring->length,
7429 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007430 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431
7432 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007433
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 return result;
7435}
7436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007437PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007438 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007440Encode S using the codec registered for encoding. Default encoding\n\
7441is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007442handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007443a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7444'xmlcharrefreplace' as well as any other name registered with\n\
7445codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007448unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007450 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 char *encoding = NULL;
7452 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007453
Benjamin Peterson308d6372009-09-18 21:42:35 +00007454 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7455 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007457 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007458}
7459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007460PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462\n\
7463Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007464If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465
7466static PyObject*
7467unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7468{
7469 Py_UNICODE *e;
7470 Py_UNICODE *p;
7471 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007472 Py_UNICODE *qe;
7473 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 PyUnicodeObject *u;
7475 int tabsize = 8;
7476
7477 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
Thomas Wouters7e474022000-07-16 12:04:32 +00007480 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007481 i = 0; /* chars up to and including most recent \n or \r */
7482 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7483 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 for (p = self->str; p < e; p++)
7485 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 if (tabsize > 0) {
7487 incr = tabsize - (j % tabsize); /* cannot overflow */
7488 if (j > PY_SSIZE_T_MAX - incr)
7489 goto overflow1;
7490 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007491 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 if (j > PY_SSIZE_T_MAX - 1)
7495 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 j++;
7497 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 if (i > PY_SSIZE_T_MAX - j)
7499 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007501 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 }
7503 }
7504
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007505 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007507
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 /* Second pass: create output string and fill it */
7509 u = _PyUnicode_New(i + j);
7510 if (!u)
7511 return NULL;
7512
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007513 j = 0; /* same as in first pass */
7514 q = u->str; /* next output char */
7515 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516
7517 for (p = self->str; p < e; p++)
7518 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 if (tabsize > 0) {
7520 i = tabsize - (j % tabsize);
7521 j += i;
7522 while (i--) {
7523 if (q >= qe)
7524 goto overflow2;
7525 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007526 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 else {
7530 if (q >= qe)
7531 goto overflow2;
7532 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007533 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 if (*p == '\n' || *p == '\r')
7535 j = 0;
7536 }
7537
7538 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007539
7540 overflow2:
7541 Py_DECREF(u);
7542 overflow1:
7543 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545}
7546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007547PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549\n\
7550Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007551such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552arguments start and end are interpreted as in slice notation.\n\
7553\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007554Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555
7556static PyObject *
7557unicode_find(PyUnicodeObject *self, PyObject *args)
7558{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007559 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007560 Py_ssize_t start;
7561 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007562 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
Christian Heimes9cd17752007-11-18 19:35:23 +00007564 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566
Thomas Wouters477c8d52006-05-27 19:21:47 +00007567 result = stringlib_find_slice(
7568 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7569 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7570 start, end
7571 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
7573 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007574
Christian Heimes217cfd12007-12-02 14:31:20 +00007575 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576}
7577
7578static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007579unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580{
7581 if (index < 0 || index >= self->length) {
7582 PyErr_SetString(PyExc_IndexError, "string index out of range");
7583 return NULL;
7584 }
7585
7586 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7587}
7588
Guido van Rossumc2504932007-09-18 19:42:40 +00007589/* Believe it or not, this produces the same value for ASCII strings
7590 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007591static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007592unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593{
Guido van Rossumc2504932007-09-18 19:42:40 +00007594 Py_ssize_t len;
7595 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007596 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007597
7598 if (self->hash != -1)
7599 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007600 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007601 p = self->str;
7602 x = *p << 7;
7603 while (--len >= 0)
7604 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007605 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007606 if (x == -1)
7607 x = -2;
7608 self->hash = x;
7609 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610}
7611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007612PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616
7617static PyObject *
7618unicode_index(PyUnicodeObject *self, PyObject *args)
7619{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007620 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007621 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007622 Py_ssize_t start;
7623 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
Christian Heimes9cd17752007-11-18 19:35:23 +00007625 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
Thomas Wouters477c8d52006-05-27 19:21:47 +00007628 result = stringlib_find_slice(
7629 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7630 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7631 start, end
7632 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
7634 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007635
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 if (result < 0) {
7637 PyErr_SetString(PyExc_ValueError, "substring not found");
7638 return NULL;
7639 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007640
Christian Heimes217cfd12007-12-02 14:31:20 +00007641 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642}
7643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007644PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007647Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007648at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649
7650static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007651unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652{
7653 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7654 register const Py_UNICODE *e;
7655 int cased;
7656
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 /* Shortcut for single character strings */
7658 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007661 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007662 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007664
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665 e = p + PyUnicode_GET_SIZE(self);
7666 cased = 0;
7667 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007669
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7671 return PyBool_FromLong(0);
7672 else if (!cased && Py_UNICODE_ISLOWER(ch))
7673 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007675 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676}
7677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007678PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007681Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007682at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683
7684static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007685unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686{
7687 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7688 register const Py_UNICODE *e;
7689 int cased;
7690
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 /* Shortcut for single character strings */
7692 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007695 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007696 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007698
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 e = p + PyUnicode_GET_SIZE(self);
7700 cased = 0;
7701 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007703
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7705 return PyBool_FromLong(0);
7706 else if (!cased && Py_UNICODE_ISUPPER(ch))
7707 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007709 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710}
7711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007712PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007715Return True if S is a titlecased string and there is at least one\n\
7716character in S, i.e. upper- and titlecase characters may only\n\
7717follow uncased characters and lowercase characters only cased ones.\n\
7718Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
7720static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007721unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722{
7723 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7724 register const Py_UNICODE *e;
7725 int cased, previous_is_cased;
7726
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 /* Shortcut for single character strings */
7728 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7730 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007732 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007733 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007735
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736 e = p + PyUnicode_GET_SIZE(self);
7737 cased = 0;
7738 previous_is_cased = 0;
7739 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007741
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7743 if (previous_is_cased)
7744 return PyBool_FromLong(0);
7745 previous_is_cased = 1;
7746 cased = 1;
7747 }
7748 else if (Py_UNICODE_ISLOWER(ch)) {
7749 if (!previous_is_cased)
7750 return PyBool_FromLong(0);
7751 previous_is_cased = 1;
7752 cased = 1;
7753 }
7754 else
7755 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007757 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758}
7759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007763Return True if all characters in S are whitespace\n\
7764and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
7766static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007767unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768{
7769 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7770 register const Py_UNICODE *e;
7771
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 /* Shortcut for single character strings */
7773 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 Py_UNICODE_ISSPACE(*p))
7775 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007777 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007778 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007780
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781 e = p + PyUnicode_GET_SIZE(self);
7782 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 if (!Py_UNICODE_ISSPACE(*p))
7784 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007786 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787}
7788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007789PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007791\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007792Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007793and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007794
7795static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007796unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007797{
7798 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7799 register const Py_UNICODE *e;
7800
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007801 /* Shortcut for single character strings */
7802 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 Py_UNICODE_ISALPHA(*p))
7804 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007805
7806 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007807 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007809
7810 e = p + PyUnicode_GET_SIZE(self);
7811 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 if (!Py_UNICODE_ISALPHA(*p))
7813 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007814 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007815 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007816}
7817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007818PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007820\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007821Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007822and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007823
7824static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007825unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007826{
7827 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7828 register const Py_UNICODE *e;
7829
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007830 /* Shortcut for single character strings */
7831 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 Py_UNICODE_ISALNUM(*p))
7833 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007834
7835 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007836 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007838
7839 e = p + PyUnicode_GET_SIZE(self);
7840 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 if (!Py_UNICODE_ISALNUM(*p))
7842 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007843 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007844 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007845}
7846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007847PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007850Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007851False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852
7853static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007854unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855{
7856 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7857 register const Py_UNICODE *e;
7858
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 /* Shortcut for single character strings */
7860 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 Py_UNICODE_ISDECIMAL(*p))
7862 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007864 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007865 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007867
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 e = p + PyUnicode_GET_SIZE(self);
7869 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 if (!Py_UNICODE_ISDECIMAL(*p))
7871 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007873 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874}
7875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007876PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007879Return True if all characters in S are digits\n\
7880and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881
7882static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007883unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884{
7885 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7886 register const Py_UNICODE *e;
7887
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 /* Shortcut for single character strings */
7889 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 Py_UNICODE_ISDIGIT(*p))
7891 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007893 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007894 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007896
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 e = p + PyUnicode_GET_SIZE(self);
7898 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 if (!Py_UNICODE_ISDIGIT(*p))
7900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903}
7904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007905PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007908Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007909False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910
7911static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007912unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913{
7914 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7915 register const Py_UNICODE *e;
7916
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 /* Shortcut for single character strings */
7918 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 Py_UNICODE_ISNUMERIC(*p))
7920 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007922 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007923 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007925
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 e = p + PyUnicode_GET_SIZE(self);
7927 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 if (!Py_UNICODE_ISNUMERIC(*p))
7929 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007931 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932}
7933
Martin v. Löwis47383402007-08-15 07:32:56 +00007934int
7935PyUnicode_IsIdentifier(PyObject *self)
7936{
7937 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7938 register const Py_UNICODE *e;
7939
7940 /* Special case for empty strings */
7941 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007943
7944 /* PEP 3131 says that the first character must be in
7945 XID_Start and subsequent characters in XID_Continue,
7946 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007947 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007948 letters, digits, underscore). However, given the current
7949 definition of XID_Start and XID_Continue, it is sufficient
7950 to check just for these, except that _ must be allowed
7951 as starting an identifier. */
7952 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7953 return 0;
7954
7955 e = p + PyUnicode_GET_SIZE(self);
7956 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 if (!_PyUnicode_IsXidContinue(*p))
7958 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007959 }
7960 return 1;
7961}
7962
7963PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007965\n\
7966Return True if S is a valid identifier according\n\
7967to the language definition.");
7968
7969static PyObject*
7970unicode_isidentifier(PyObject *self)
7971{
7972 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7973}
7974
Georg Brandl559e5d72008-06-11 18:37:52 +00007975PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007977\n\
7978Return True if all characters in S are considered\n\
7979printable in repr() or S is empty, False otherwise.");
7980
7981static PyObject*
7982unicode_isprintable(PyObject *self)
7983{
7984 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7985 register const Py_UNICODE *e;
7986
7987 /* Shortcut for single character strings */
7988 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7989 Py_RETURN_TRUE;
7990 }
7991
7992 e = p + PyUnicode_GET_SIZE(self);
7993 for (; p < e; p++) {
7994 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7995 Py_RETURN_FALSE;
7996 }
7997 }
7998 Py_RETURN_TRUE;
7999}
8000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008001PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008002 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003\n\
8004Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008005iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006
8007static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008008unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008010 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011}
8012
Martin v. Löwis18e16552006-02-15 17:27:45 +00008013static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014unicode_length(PyUnicodeObject *self)
8015{
8016 return self->length;
8017}
8018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008019PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008022Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008023done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024
8025static PyObject *
8026unicode_ljust(PyUnicodeObject *self, PyObject *args)
8027{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008028 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008029 Py_UNICODE fillchar = ' ';
8030
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008031 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 return NULL;
8033
Tim Peters7a29bd52001-09-12 03:03:31 +00008034 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035 Py_INCREF(self);
8036 return (PyObject*) self;
8037 }
8038
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008039 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040}
8041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008042PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008045Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046
8047static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008048unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 return fixup(self, fixlower);
8051}
8052
/* Selector values for the strip helpers.  They double as indices into
   stripformat[] below, so the numbering must stay dense and start at 0. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its leading "|O:"
   (three characters) skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8061
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008062/* externally visible for str.strip(unicode) */
8063PyObject *
8064_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8065{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008066 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8067 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8068 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8069 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8070 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008071
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008073
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 i = 0;
8075 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8077 i++;
8078 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008080
Benjamin Peterson14339b62009-01-31 16:36:08 +00008081 j = len;
8082 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 do {
8084 j--;
8085 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8086 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008087 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008088
Benjamin Peterson14339b62009-01-31 16:36:08 +00008089 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 Py_INCREF(self);
8091 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008092 }
8093 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008095}
8096
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097
8098static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008099do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008101 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8102 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008103
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104 i = 0;
8105 if (striptype != RIGHTSTRIP) {
8106 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8107 i++;
8108 }
8109 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008110
Benjamin Peterson14339b62009-01-31 16:36:08 +00008111 j = len;
8112 if (striptype != LEFTSTRIP) {
8113 do {
8114 j--;
8115 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8116 j++;
8117 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008118
Benjamin Peterson14339b62009-01-31 16:36:08 +00008119 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8120 Py_INCREF(self);
8121 return (PyObject*)self;
8122 }
8123 else
8124 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125}
8126
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008127
8128static PyObject *
8129do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8130{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008131 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008132
Benjamin Peterson14339b62009-01-31 16:36:08 +00008133 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8134 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008135
Benjamin Peterson14339b62009-01-31 16:36:08 +00008136 if (sep != NULL && sep != Py_None) {
8137 if (PyUnicode_Check(sep))
8138 return _PyUnicode_XStrip(self, striptype, sep);
8139 else {
8140 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 "%s arg must be None or str",
8142 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008143 return NULL;
8144 }
8145 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008146
Benjamin Peterson14339b62009-01-31 16:36:08 +00008147 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008148}
8149
8150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008151PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008153\n\
8154Return a copy of the string S with leading and trailing\n\
8155whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008156If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008157
8158static PyObject *
8159unicode_strip(PyUnicodeObject *self, PyObject *args)
8160{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008161 if (PyTuple_GET_SIZE(args) == 0)
8162 return do_strip(self, BOTHSTRIP); /* Common case */
8163 else
8164 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008165}
8166
8167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008168PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008170\n\
8171Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008172If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008173
8174static PyObject *
8175unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8176{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008177 if (PyTuple_GET_SIZE(args) == 0)
8178 return do_strip(self, LEFTSTRIP); /* Common case */
8179 else
8180 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008181}
8182
8183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008184PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008186\n\
8187Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008188If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008189
8190static PyObject *
8191unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8192{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 if (PyTuple_GET_SIZE(args) == 0)
8194 return do_strip(self, RIGHTSTRIP); /* Common case */
8195 else
8196 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008197}
8198
8199
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008201unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202{
8203 PyUnicodeObject *u;
8204 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008205 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008206 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207
Georg Brandl222de0f2009-04-12 12:01:50 +00008208 if (len < 1) {
8209 Py_INCREF(unicode_empty);
8210 return (PyObject *)unicode_empty;
8211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212
Tim Peters7a29bd52001-09-12 03:03:31 +00008213 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214 /* no repeat, return original string */
8215 Py_INCREF(str);
8216 return (PyObject*) str;
8217 }
Tim Peters8f422462000-09-09 06:13:41 +00008218
8219 /* ensure # of chars needed doesn't overflow int and # of bytes
8220 * needed doesn't overflow size_t
8221 */
8222 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008223 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008224 PyErr_SetString(PyExc_OverflowError,
8225 "repeated string is too long");
8226 return NULL;
8227 }
8228 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8229 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8230 PyErr_SetString(PyExc_OverflowError,
8231 "repeated string is too long");
8232 return NULL;
8233 }
8234 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235 if (!u)
8236 return NULL;
8237
8238 p = u->str;
8239
Georg Brandl222de0f2009-04-12 12:01:50 +00008240 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008241 Py_UNICODE_FILL(p, str->str[0], len);
8242 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008243 Py_ssize_t done = str->length; /* number of characters copied this far */
8244 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008246 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008247 Py_UNICODE_COPY(p+done, p, n);
8248 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 }
8251
8252 return (PyObject*) u;
8253}
8254
8255PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 PyObject *subobj,
8257 PyObject *replobj,
8258 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259{
8260 PyObject *self;
8261 PyObject *str1;
8262 PyObject *str2;
8263 PyObject *result;
8264
8265 self = PyUnicode_FromObject(obj);
8266 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 str1 = PyUnicode_FromObject(subobj);
8269 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 Py_DECREF(self);
8271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
8273 str2 = PyUnicode_FromObject(replobj);
8274 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 Py_DECREF(self);
8276 Py_DECREF(str1);
8277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 }
Tim Petersced69f82003-09-16 20:30:58 +00008279 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 (PyUnicodeObject *)str1,
8281 (PyUnicodeObject *)str2,
8282 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 Py_DECREF(self);
8284 Py_DECREF(str1);
8285 Py_DECREF(str2);
8286 return result;
8287}
8288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008289PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008290 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291\n\
8292Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008293old replaced by new. If the optional argument count is\n\
8294given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295
8296static PyObject*
8297unicode_replace(PyUnicodeObject *self, PyObject *args)
8298{
8299 PyUnicodeObject *str1;
8300 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008301 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 PyObject *result;
8303
Martin v. Löwis18e16552006-02-15 17:27:45 +00008304 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 return NULL;
8306 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8307 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008310 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 Py_DECREF(str1);
8312 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314
8315 result = replace(self, str1, str2, maxcount);
8316
8317 Py_DECREF(str1);
8318 Py_DECREF(str2);
8319 return result;
8320}
8321
8322static
8323PyObject *unicode_repr(PyObject *unicode)
8324{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008325 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008326 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008327 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8328 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8329
8330 /* XXX(nnorwitz): rather than over-allocating, it would be
8331 better to choose a different scheme. Perhaps scan the
8332 first N-chars of the string and allocate based on that size.
8333 */
8334 /* Initial allocation is based on the longest-possible unichr
8335 escape.
8336
8337 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8338 unichr, so in this case it's the longest unichr escape. In
8339 narrow (UTF-16) builds this is five chars per source unichr
8340 since there are two unichrs in the surrogate pair, so in narrow
8341 (UTF-16) builds it's not the longest unichr escape.
8342
8343 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8344 so in the narrow (UTF-16) build case it's the longest unichr
8345 escape.
8346 */
8347
Walter Dörwald1ab83302007-05-18 17:15:44 +00008348 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008350#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008352#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008354#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008356 if (repr == NULL)
8357 return NULL;
8358
Walter Dörwald1ab83302007-05-18 17:15:44 +00008359 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008360
8361 /* Add quote */
8362 *p++ = (findchar(s, size, '\'') &&
8363 !findchar(s, size, '"')) ? '"' : '\'';
8364 while (size-- > 0) {
8365 Py_UNICODE ch = *s++;
8366
8367 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008368 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008369 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008370 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008371 continue;
8372 }
8373
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008375 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008376 *p++ = '\\';
8377 *p++ = 't';
8378 }
8379 else if (ch == '\n') {
8380 *p++ = '\\';
8381 *p++ = 'n';
8382 }
8383 else if (ch == '\r') {
8384 *p++ = '\\';
8385 *p++ = 'r';
8386 }
8387
8388 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008389 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008390 *p++ = '\\';
8391 *p++ = 'x';
8392 *p++ = hexdigits[(ch >> 4) & 0x000F];
8393 *p++ = hexdigits[ch & 0x000F];
8394 }
8395
Georg Brandl559e5d72008-06-11 18:37:52 +00008396 /* Copy ASCII characters as-is */
8397 else if (ch < 0x7F) {
8398 *p++ = ch;
8399 }
8400
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008402 else {
8403 Py_UCS4 ucs = ch;
8404
8405#ifndef Py_UNICODE_WIDE
8406 Py_UNICODE ch2 = 0;
8407 /* Get code point from surrogate pair */
8408 if (size > 0) {
8409 ch2 = *s;
8410 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008414 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008415 size--;
8416 }
8417 }
8418#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008419 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008420 (categories Z* and C* except ASCII space)
8421 */
8422 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8423 /* Map 8-bit characters to '\xhh' */
8424 if (ucs <= 0xff) {
8425 *p++ = '\\';
8426 *p++ = 'x';
8427 *p++ = hexdigits[(ch >> 4) & 0x000F];
8428 *p++ = hexdigits[ch & 0x000F];
8429 }
8430 /* Map 21-bit characters to '\U00xxxxxx' */
8431 else if (ucs >= 0x10000) {
8432 *p++ = '\\';
8433 *p++ = 'U';
8434 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8435 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8436 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8437 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8438 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8439 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8440 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8441 *p++ = hexdigits[ucs & 0x0000000F];
8442 }
8443 /* Map 16-bit characters to '\uxxxx' */
8444 else {
8445 *p++ = '\\';
8446 *p++ = 'u';
8447 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8448 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8449 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8450 *p++ = hexdigits[ucs & 0x000F];
8451 }
8452 }
8453 /* Copy characters as-is */
8454 else {
8455 *p++ = ch;
8456#ifndef Py_UNICODE_WIDE
8457 if (ucs >= 0x10000)
8458 *p++ = ch2;
8459#endif
8460 }
8461 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008462 }
8463 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008464 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008465
8466 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008467 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008468 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469}
8470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008471PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473\n\
8474Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008475such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476arguments start and end are interpreted as in slice notation.\n\
8477\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008478Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479
8480static PyObject *
8481unicode_rfind(PyUnicodeObject *self, PyObject *args)
8482{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008483 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008484 Py_ssize_t start;
8485 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008486 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487
Christian Heimes9cd17752007-11-18 19:35:23 +00008488 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490
Thomas Wouters477c8d52006-05-27 19:21:47 +00008491 result = stringlib_rfind_slice(
8492 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8493 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8494 start, end
8495 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496
8497 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008498
Christian Heimes217cfd12007-12-02 14:31:20 +00008499 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500}
8501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008502PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008505Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506
8507static PyObject *
8508unicode_rindex(PyUnicodeObject *self, PyObject *args)
8509{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008510 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008511 Py_ssize_t start;
8512 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008513 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514
Christian Heimes9cd17752007-11-18 19:35:23 +00008515 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517
Thomas Wouters477c8d52006-05-27 19:21:47 +00008518 result = stringlib_rfind_slice(
8519 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8520 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8521 start, end
8522 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523
8524 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008525
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526 if (result < 0) {
8527 PyErr_SetString(PyExc_ValueError, "substring not found");
8528 return NULL;
8529 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008530 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531}
8532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008533PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008536Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008537done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538
8539static PyObject *
8540unicode_rjust(PyUnicodeObject *self, PyObject *args)
8541{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008542 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008543 Py_UNICODE fillchar = ' ';
8544
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008545 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546 return NULL;
8547
Tim Peters7a29bd52001-09-12 03:03:31 +00008548 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549 Py_INCREF(self);
8550 return (PyObject*) self;
8551 }
8552
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008553 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554}
8555
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 PyObject *sep,
8558 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559{
8560 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008561
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 s = PyUnicode_FromObject(s);
8563 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008564 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 if (sep != NULL) {
8566 sep = PyUnicode_FromObject(sep);
8567 if (sep == NULL) {
8568 Py_DECREF(s);
8569 return NULL;
8570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 }
8572
8573 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8574
8575 Py_DECREF(s);
8576 Py_XDECREF(sep);
8577 return result;
8578}
8579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008580PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582\n\
8583Return a list of the words in S, using sep as the\n\
8584delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008585splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008586whitespace string is a separator and empty strings are\n\
8587removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
8589static PyObject*
8590unicode_split(PyUnicodeObject *self, PyObject *args)
8591{
8592 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008593 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594
Martin v. Löwis18e16552006-02-15 17:27:45 +00008595 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 return NULL;
8597
8598 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604}
8605
Thomas Wouters477c8d52006-05-27 19:21:47 +00008606PyObject *
8607PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8608{
8609 PyObject* str_obj;
8610 PyObject* sep_obj;
8611 PyObject* out;
8612
8613 str_obj = PyUnicode_FromObject(str_in);
8614 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008616 sep_obj = PyUnicode_FromObject(sep_in);
8617 if (!sep_obj) {
8618 Py_DECREF(str_obj);
8619 return NULL;
8620 }
8621
8622 out = stringlib_partition(
8623 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8624 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8625 );
8626
8627 Py_DECREF(sep_obj);
8628 Py_DECREF(str_obj);
8629
8630 return out;
8631}
8632
8633
8634PyObject *
8635PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8636{
8637 PyObject* str_obj;
8638 PyObject* sep_obj;
8639 PyObject* out;
8640
8641 str_obj = PyUnicode_FromObject(str_in);
8642 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008644 sep_obj = PyUnicode_FromObject(sep_in);
8645 if (!sep_obj) {
8646 Py_DECREF(str_obj);
8647 return NULL;
8648 }
8649
8650 out = stringlib_rpartition(
8651 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8652 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8653 );
8654
8655 Py_DECREF(sep_obj);
8656 Py_DECREF(str_obj);
8657
8658 return out;
8659}
8660
8661PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008663\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008664Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008665the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008666found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008667
8668static PyObject*
8669unicode_partition(PyUnicodeObject *self, PyObject *separator)
8670{
8671 return PyUnicode_Partition((PyObject *)self, separator);
8672}
8673
8674PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008675 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008676\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008677Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008678the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008679separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008680
8681static PyObject*
8682unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8683{
8684 return PyUnicode_RPartition((PyObject *)self, separator);
8685}
8686
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008687PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 PyObject *sep,
8689 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008690{
8691 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008692
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008693 s = PyUnicode_FromObject(s);
8694 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008695 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 if (sep != NULL) {
8697 sep = PyUnicode_FromObject(sep);
8698 if (sep == NULL) {
8699 Py_DECREF(s);
8700 return NULL;
8701 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008702 }
8703
8704 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8705
8706 Py_DECREF(s);
8707 Py_XDECREF(sep);
8708 return result;
8709}
8710
8711PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008713\n\
8714Return a list of the words in S, using sep as the\n\
8715delimiter string, starting at the end of the string and\n\
8716working to the front. If maxsplit is given, at most maxsplit\n\
8717splits are done. If sep is not specified, any whitespace string\n\
8718is a separator.");
8719
8720static PyObject*
8721unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8722{
8723 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008724 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008725
Martin v. Löwis18e16552006-02-15 17:27:45 +00008726 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008727 return NULL;
8728
8729 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008731 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008733 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008735}
8736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008737PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739\n\
8740Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008741Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008742is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743
8744static PyObject*
8745unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8746{
Guido van Rossum86662912000-04-11 15:38:46 +00008747 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748
Guido van Rossum86662912000-04-11 15:38:46 +00008749 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 return NULL;
8751
Guido van Rossum86662912000-04-11 15:38:46 +00008752 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753}
8754
8755static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008756PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757{
Walter Dörwald346737f2007-05-31 10:44:43 +00008758 if (PyUnicode_CheckExact(self)) {
8759 Py_INCREF(self);
8760 return self;
8761 } else
8762 /* Subtype -- return genuine unicode string with the same value. */
8763 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8764 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765}
8766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008767PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769\n\
8770Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008771and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772
8773static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008774unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 return fixup(self, fixswapcase);
8777}
8778
Georg Brandlceee0772007-11-27 23:48:05 +00008779PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008781\n\
8782Return a translation table usable for str.translate().\n\
8783If there is only one argument, it must be a dictionary mapping Unicode\n\
8784ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008785Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008786If there are two arguments, they must be strings of equal length, and\n\
8787in the resulting dictionary, each character in x will be mapped to the\n\
8788character at the same position in y. If there is a third argument, it\n\
8789must be a string, whose characters will be mapped to None in the result.");
8790
8791static PyObject*
8792unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8793{
8794 PyObject *x, *y = NULL, *z = NULL;
8795 PyObject *new = NULL, *key, *value;
8796 Py_ssize_t i = 0;
8797 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008798
Georg Brandlceee0772007-11-27 23:48:05 +00008799 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8800 return NULL;
8801 new = PyDict_New();
8802 if (!new)
8803 return NULL;
8804 if (y != NULL) {
8805 /* x must be a string too, of equal length */
8806 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8807 if (!PyUnicode_Check(x)) {
8808 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8809 "be a string if there is a second argument");
8810 goto err;
8811 }
8812 if (PyUnicode_GET_SIZE(x) != ylen) {
8813 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8814 "arguments must have equal length");
8815 goto err;
8816 }
8817 /* create entries for translating chars in x to those in y */
8818 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008819 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8820 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008821 if (!key || !value)
8822 goto err;
8823 res = PyDict_SetItem(new, key, value);
8824 Py_DECREF(key);
8825 Py_DECREF(value);
8826 if (res < 0)
8827 goto err;
8828 }
8829 /* create entries for deleting chars in z */
8830 if (z != NULL) {
8831 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008832 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008833 if (!key)
8834 goto err;
8835 res = PyDict_SetItem(new, key, Py_None);
8836 Py_DECREF(key);
8837 if (res < 0)
8838 goto err;
8839 }
8840 }
8841 } else {
8842 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008843 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008844 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8845 "to maketrans it must be a dict");
8846 goto err;
8847 }
8848 /* copy entries into the new dict, converting string keys to int keys */
8849 while (PyDict_Next(x, &i, &key, &value)) {
8850 if (PyUnicode_Check(key)) {
8851 /* convert string keys to integer keys */
8852 PyObject *newkey;
8853 if (PyUnicode_GET_SIZE(key) != 1) {
8854 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8855 "table must be of length 1");
8856 goto err;
8857 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008858 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008859 if (!newkey)
8860 goto err;
8861 res = PyDict_SetItem(new, newkey, value);
8862 Py_DECREF(newkey);
8863 if (res < 0)
8864 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008865 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008866 /* just keep integer keys */
8867 if (PyDict_SetItem(new, key, value) < 0)
8868 goto err;
8869 } else {
8870 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8871 "be strings or integers");
8872 goto err;
8873 }
8874 }
8875 }
8876 return new;
8877 err:
8878 Py_DECREF(new);
8879 return NULL;
8880}
8881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008882PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884\n\
8885Return a copy of the string S, where all characters have been mapped\n\
8886through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008887Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008888Unmapped characters are left untouched. Characters mapped to None\n\
8889are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890
8891static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008892unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893{
Georg Brandlceee0772007-11-27 23:48:05 +00008894 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895}
8896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008897PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008900Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901
8902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008903unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 return fixup(self, fixupper);
8906}
8907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008908PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008911Pad a numeric string S with zeros on the left, to fill a field\n\
8912of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913
8914static PyObject *
8915unicode_zfill(PyUnicodeObject *self, PyObject *args)
8916{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008917 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918 PyUnicodeObject *u;
8919
Martin v. Löwis18e16552006-02-15 17:27:45 +00008920 Py_ssize_t width;
8921 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922 return NULL;
8923
8924 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008925 if (PyUnicode_CheckExact(self)) {
8926 Py_INCREF(self);
8927 return (PyObject*) self;
8928 }
8929 else
8930 return PyUnicode_FromUnicode(
8931 PyUnicode_AS_UNICODE(self),
8932 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 }
8935
8936 fill = width - self->length;
8937
8938 u = pad(self, fill, 0, '0');
8939
Walter Dörwald068325e2002-04-15 13:36:47 +00008940 if (u == NULL)
8941 return NULL;
8942
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 if (u->str[fill] == '+' || u->str[fill] == '-') {
8944 /* move sign to beginning of string */
8945 u->str[0] = u->str[fill];
8946 u->str[fill] = '0';
8947 }
8948
8949 return (PyObject*) u;
8950}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951
#if 0
/* Disabled helper: returns the file-level numfree counter as a Python
   int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Disabled helper: passes the buffer and size of self to
   PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
                                             PyUnicode_GET_SIZE(self));
}
#endif
8966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008967PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008970Return True if S starts with the specified prefix, False otherwise.\n\
8971With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008972With optional end, stop comparing S at that position.\n\
8973prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974
8975static PyObject *
8976unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008977 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008979 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008981 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008982 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008983 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008985 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8987 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008988 if (PyTuple_Check(subobj)) {
8989 Py_ssize_t i;
8990 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8991 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008993 if (substring == NULL)
8994 return NULL;
8995 result = tailmatch(self, substring, start, end, -1);
8996 Py_DECREF(substring);
8997 if (result) {
8998 Py_RETURN_TRUE;
8999 }
9000 }
9001 /* nothing matched */
9002 Py_RETURN_FALSE;
9003 }
9004 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009007 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009009 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010}
9011
9012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009013PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009016Return True if S ends with the specified suffix, False otherwise.\n\
9017With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009018With optional end, stop comparing S at that position.\n\
9019suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020
9021static PyObject *
9022unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009025 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009027 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009028 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009029 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009031 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9033 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009034 if (PyTuple_Check(subobj)) {
9035 Py_ssize_t i;
9036 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9037 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009038 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009039 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009041 result = tailmatch(self, substring, start, end, +1);
9042 Py_DECREF(substring);
9043 if (result) {
9044 Py_RETURN_TRUE;
9045 }
9046 }
9047 Py_RETURN_FALSE;
9048 }
9049 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009053 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009054 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009055 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056}
9057
Eric Smith8c663262007-08-25 02:26:07 +00009058#include "stringlib/string_format.h"
9059
9060PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009062\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009063Return a formatted version of S, using substitutions from args and kwargs.\n\
9064The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009065
Eric Smith27bbca62010-11-04 17:06:58 +00009066PyDoc_STRVAR(format_map__doc__,
9067 "S.format_map(mapping) -> str\n\
9068\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009069Return a formatted version of S, using substitutions from mapping.\n\
9070The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009071
Eric Smith4a7d76d2008-05-30 18:10:19 +00009072static PyObject *
9073unicode__format__(PyObject* self, PyObject* args)
9074{
9075 PyObject *format_spec;
9076
9077 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9078 return NULL;
9079
9080 return _PyUnicode_FormatAdvanced(self,
9081 PyUnicode_AS_UNICODE(format_spec),
9082 PyUnicode_GET_SIZE(format_spec));
9083}
9084
Eric Smith8c663262007-08-25 02:26:07 +00009085PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009087\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009088Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009089
9090static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009091unicode__sizeof__(PyUnicodeObject *v)
9092{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009093 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9094 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009095}
9096
9097PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009098 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009099
9100static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009101unicode_getnewargs(PyUnicodeObject *v)
9102{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009103 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009104}
9105
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106static PyMethodDef unicode_methods[] = {
9107
9108 /* Order is according to common usage: often used methods should
9109 appear first, since lookup is done sequentially. */
9110
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009111 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009112 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9113 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009114 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009115 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9116 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9117 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9118 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9119 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9120 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9121 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009122 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009123 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9124 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9125 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009126 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009127 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9128 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9129 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009130 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009131 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009132 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009133 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009134 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9135 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9136 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9137 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9138 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9139 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9140 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9141 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9142 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9143 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9144 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9145 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9146 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9147 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009148 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009149 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009150 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009151 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009152 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009153 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009154 {"maketrans", (PyCFunction) unicode_maketrans,
9155 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009156 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009157#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009158 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159#endif
9160
9161#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009162 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009163 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009164 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165#endif
9166
Benjamin Peterson14339b62009-01-31 16:36:08 +00009167 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168 {NULL, NULL}
9169};
9170
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009171static PyObject *
9172unicode_mod(PyObject *v, PyObject *w)
9173{
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 if (!PyUnicode_Check(v)) {
9175 Py_INCREF(Py_NotImplemented);
9176 return Py_NotImplemented;
9177 }
9178 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009179}
9180
9181static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009182 0, /*nb_add*/
9183 0, /*nb_subtract*/
9184 0, /*nb_multiply*/
9185 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009186};
9187
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009189 (lenfunc) unicode_length, /* sq_length */
9190 PyUnicode_Concat, /* sq_concat */
9191 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9192 (ssizeargfunc) unicode_getitem, /* sq_item */
9193 0, /* sq_slice */
9194 0, /* sq_ass_item */
9195 0, /* sq_ass_slice */
9196 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009197};
9198
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009199static PyObject*
9200unicode_subscript(PyUnicodeObject* self, PyObject* item)
9201{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009202 if (PyIndex_Check(item)) {
9203 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009204 if (i == -1 && PyErr_Occurred())
9205 return NULL;
9206 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009207 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009208 return unicode_getitem(self, i);
9209 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009210 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009211 Py_UNICODE* source_buf;
9212 Py_UNICODE* result_buf;
9213 PyObject* result;
9214
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009215 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009216 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009217 return NULL;
9218 }
9219
9220 if (slicelength <= 0) {
9221 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009222 } else if (start == 0 && step == 1 && slicelength == self->length &&
9223 PyUnicode_CheckExact(self)) {
9224 Py_INCREF(self);
9225 return (PyObject *)self;
9226 } else if (step == 1) {
9227 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009228 } else {
9229 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009230 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9231 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009232
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 if (result_buf == NULL)
9234 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009235
9236 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9237 result_buf[i] = source_buf[cur];
9238 }
Tim Petersced69f82003-09-16 20:30:58 +00009239
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009240 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009241 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009242 return result;
9243 }
9244 } else {
9245 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9246 return NULL;
9247 }
9248}
9249
9250static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009251 (lenfunc)unicode_length, /* mp_length */
9252 (binaryfunc)unicode_subscript, /* mp_subscript */
9253 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009254};
9255
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257/* Helpers for PyUnicode_Format() */
9258
9259static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009260getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009262 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009264 (*p_argidx)++;
9265 if (arglen < 0)
9266 return args;
9267 else
9268 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269 }
9270 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009271 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009272 return NULL;
9273}
9274
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009275/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009277static PyObject *
9278formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009280 char *p;
9281 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009283
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284 x = PyFloat_AsDouble(v);
9285 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009286 return NULL;
9287
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009289 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009290
Eric Smith0923d1d2009-04-16 20:16:10 +00009291 p = PyOS_double_to_string(x, type, prec,
9292 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009293 if (p == NULL)
9294 return NULL;
9295 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009296 PyMem_Free(p);
9297 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298}
9299
Tim Peters38fd5b62000-09-21 05:43:11 +00009300static PyObject*
9301formatlong(PyObject *val, int flags, int prec, int type)
9302{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009303 char *buf;
9304 int len;
9305 PyObject *str; /* temporary string object. */
9306 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009307
Benjamin Peterson14339b62009-01-31 16:36:08 +00009308 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9309 if (!str)
9310 return NULL;
9311 result = PyUnicode_FromStringAndSize(buf, len);
9312 Py_DECREF(str);
9313 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009314}
9315
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316static int
9317formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009318 size_t buflen,
9319 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009321 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009322 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 if (PyUnicode_GET_SIZE(v) == 1) {
9324 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9325 buf[1] = '\0';
9326 return 1;
9327 }
9328#ifndef Py_UNICODE_WIDE
9329 if (PyUnicode_GET_SIZE(v) == 2) {
9330 /* Decode a valid surrogate pair */
9331 int c0 = PyUnicode_AS_UNICODE(v)[0];
9332 int c1 = PyUnicode_AS_UNICODE(v)[1];
9333 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9334 0xDC00 <= c1 && c1 <= 0xDFFF) {
9335 buf[0] = c0;
9336 buf[1] = c1;
9337 buf[2] = '\0';
9338 return 2;
9339 }
9340 }
9341#endif
9342 goto onError;
9343 }
9344 else {
9345 /* Integer input truncated to a character */
9346 long x;
9347 x = PyLong_AsLong(v);
9348 if (x == -1 && PyErr_Occurred())
9349 goto onError;
9350
9351 if (x < 0 || x > 0x10ffff) {
9352 PyErr_SetString(PyExc_OverflowError,
9353 "%c arg not in range(0x110000)");
9354 return -1;
9355 }
9356
9357#ifndef Py_UNICODE_WIDE
9358 if (x > 0xffff) {
9359 x -= 0x10000;
9360 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9361 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9362 return 2;
9363 }
9364#endif
9365 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009366 buf[1] = '\0';
9367 return 1;
9368 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009369
Benjamin Peterson29060642009-01-31 22:14:21 +00009370 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009371 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009373 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374}
9375
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009376/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009377 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009378*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009379#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009380
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383{
9384 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009385 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 int args_owned = 0;
9387 PyUnicodeObject *result = NULL;
9388 PyObject *dict = NULL;
9389 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009390
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 PyErr_BadInternalCall();
9393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394 }
9395 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009396 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398 fmt = PyUnicode_AS_UNICODE(uformat);
9399 fmtcnt = PyUnicode_GET_SIZE(uformat);
9400
9401 reslen = rescnt = fmtcnt + 100;
9402 result = _PyUnicode_New(reslen);
9403 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405 res = PyUnicode_AS_UNICODE(result);
9406
9407 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 arglen = PyTuple_Size(args);
9409 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410 }
9411 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009412 arglen = -1;
9413 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009415 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009416 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418
9419 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009420 if (*fmt != '%') {
9421 if (--rescnt < 0) {
9422 rescnt = fmtcnt + 100;
9423 reslen += rescnt;
9424 if (_PyUnicode_Resize(&result, reslen) < 0)
9425 goto onError;
9426 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9427 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009428 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009430 }
9431 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009432 /* Got a format specifier */
9433 int flags = 0;
9434 Py_ssize_t width = -1;
9435 int prec = -1;
9436 Py_UNICODE c = '\0';
9437 Py_UNICODE fill;
9438 int isnumok;
9439 PyObject *v = NULL;
9440 PyObject *temp = NULL;
9441 Py_UNICODE *pbuf;
9442 Py_UNICODE sign;
9443 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009444 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445
Benjamin Peterson29060642009-01-31 22:14:21 +00009446 fmt++;
9447 if (*fmt == '(') {
9448 Py_UNICODE *keystart;
9449 Py_ssize_t keylen;
9450 PyObject *key;
9451 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009452
Benjamin Peterson29060642009-01-31 22:14:21 +00009453 if (dict == NULL) {
9454 PyErr_SetString(PyExc_TypeError,
9455 "format requires a mapping");
9456 goto onError;
9457 }
9458 ++fmt;
9459 --fmtcnt;
9460 keystart = fmt;
9461 /* Skip over balanced parentheses */
9462 while (pcount > 0 && --fmtcnt >= 0) {
9463 if (*fmt == ')')
9464 --pcount;
9465 else if (*fmt == '(')
9466 ++pcount;
9467 fmt++;
9468 }
9469 keylen = fmt - keystart - 1;
9470 if (fmtcnt < 0 || pcount > 0) {
9471 PyErr_SetString(PyExc_ValueError,
9472 "incomplete format key");
9473 goto onError;
9474 }
9475#if 0
9476 /* keys are converted to strings using UTF-8 and
9477 then looked up since Python uses strings to hold
9478 variables names etc. in its namespaces and we
9479 wouldn't want to break common idioms. */
9480 key = PyUnicode_EncodeUTF8(keystart,
9481 keylen,
9482 NULL);
9483#else
9484 key = PyUnicode_FromUnicode(keystart, keylen);
9485#endif
9486 if (key == NULL)
9487 goto onError;
9488 if (args_owned) {
9489 Py_DECREF(args);
9490 args_owned = 0;
9491 }
9492 args = PyObject_GetItem(dict, key);
9493 Py_DECREF(key);
9494 if (args == NULL) {
9495 goto onError;
9496 }
9497 args_owned = 1;
9498 arglen = -1;
9499 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009500 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 while (--fmtcnt >= 0) {
9502 switch (c = *fmt++) {
9503 case '-': flags |= F_LJUST; continue;
9504 case '+': flags |= F_SIGN; continue;
9505 case ' ': flags |= F_BLANK; continue;
9506 case '#': flags |= F_ALT; continue;
9507 case '0': flags |= F_ZERO; continue;
9508 }
9509 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009510 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 if (c == '*') {
9512 v = getnextarg(args, arglen, &argidx);
9513 if (v == NULL)
9514 goto onError;
9515 if (!PyLong_Check(v)) {
9516 PyErr_SetString(PyExc_TypeError,
9517 "* wants int");
9518 goto onError;
9519 }
9520 width = PyLong_AsLong(v);
9521 if (width == -1 && PyErr_Occurred())
9522 goto onError;
9523 if (width < 0) {
9524 flags |= F_LJUST;
9525 width = -width;
9526 }
9527 if (--fmtcnt >= 0)
9528 c = *fmt++;
9529 }
9530 else if (c >= '0' && c <= '9') {
9531 width = c - '0';
9532 while (--fmtcnt >= 0) {
9533 c = *fmt++;
9534 if (c < '0' || c > '9')
9535 break;
9536 if ((width*10) / 10 != width) {
9537 PyErr_SetString(PyExc_ValueError,
9538 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009539 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 }
9541 width = width*10 + (c - '0');
9542 }
9543 }
9544 if (c == '.') {
9545 prec = 0;
9546 if (--fmtcnt >= 0)
9547 c = *fmt++;
9548 if (c == '*') {
9549 v = getnextarg(args, arglen, &argidx);
9550 if (v == NULL)
9551 goto onError;
9552 if (!PyLong_Check(v)) {
9553 PyErr_SetString(PyExc_TypeError,
9554 "* wants int");
9555 goto onError;
9556 }
9557 prec = PyLong_AsLong(v);
9558 if (prec == -1 && PyErr_Occurred())
9559 goto onError;
9560 if (prec < 0)
9561 prec = 0;
9562 if (--fmtcnt >= 0)
9563 c = *fmt++;
9564 }
9565 else if (c >= '0' && c <= '9') {
9566 prec = c - '0';
9567 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009568 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 if (c < '0' || c > '9')
9570 break;
9571 if ((prec*10) / 10 != prec) {
9572 PyErr_SetString(PyExc_ValueError,
9573 "prec too big");
9574 goto onError;
9575 }
9576 prec = prec*10 + (c - '0');
9577 }
9578 }
9579 } /* prec */
9580 if (fmtcnt >= 0) {
9581 if (c == 'h' || c == 'l' || c == 'L') {
9582 if (--fmtcnt >= 0)
9583 c = *fmt++;
9584 }
9585 }
9586 if (fmtcnt < 0) {
9587 PyErr_SetString(PyExc_ValueError,
9588 "incomplete format");
9589 goto onError;
9590 }
9591 if (c != '%') {
9592 v = getnextarg(args, arglen, &argidx);
9593 if (v == NULL)
9594 goto onError;
9595 }
9596 sign = 0;
9597 fill = ' ';
9598 switch (c) {
9599
9600 case '%':
9601 pbuf = formatbuf;
9602 /* presume that buffer length is at least 1 */
9603 pbuf[0] = '%';
9604 len = 1;
9605 break;
9606
9607 case 's':
9608 case 'r':
9609 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009610 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 temp = v;
9612 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009613 }
9614 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009615 if (c == 's')
9616 temp = PyObject_Str(v);
9617 else if (c == 'r')
9618 temp = PyObject_Repr(v);
9619 else
9620 temp = PyObject_ASCII(v);
9621 if (temp == NULL)
9622 goto onError;
9623 if (PyUnicode_Check(temp))
9624 /* nothing to do */;
9625 else {
9626 Py_DECREF(temp);
9627 PyErr_SetString(PyExc_TypeError,
9628 "%s argument has non-string str()");
9629 goto onError;
9630 }
9631 }
9632 pbuf = PyUnicode_AS_UNICODE(temp);
9633 len = PyUnicode_GET_SIZE(temp);
9634 if (prec >= 0 && len > prec)
9635 len = prec;
9636 break;
9637
9638 case 'i':
9639 case 'd':
9640 case 'u':
9641 case 'o':
9642 case 'x':
9643 case 'X':
9644 if (c == 'i')
9645 c = 'd';
9646 isnumok = 0;
9647 if (PyNumber_Check(v)) {
9648 PyObject *iobj=NULL;
9649
9650 if (PyLong_Check(v)) {
9651 iobj = v;
9652 Py_INCREF(iobj);
9653 }
9654 else {
9655 iobj = PyNumber_Long(v);
9656 }
9657 if (iobj!=NULL) {
9658 if (PyLong_Check(iobj)) {
9659 isnumok = 1;
9660 temp = formatlong(iobj, flags, prec, c);
9661 Py_DECREF(iobj);
9662 if (!temp)
9663 goto onError;
9664 pbuf = PyUnicode_AS_UNICODE(temp);
9665 len = PyUnicode_GET_SIZE(temp);
9666 sign = 1;
9667 }
9668 else {
9669 Py_DECREF(iobj);
9670 }
9671 }
9672 }
9673 if (!isnumok) {
9674 PyErr_Format(PyExc_TypeError,
9675 "%%%c format: a number is required, "
9676 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9677 goto onError;
9678 }
9679 if (flags & F_ZERO)
9680 fill = '0';
9681 break;
9682
9683 case 'e':
9684 case 'E':
9685 case 'f':
9686 case 'F':
9687 case 'g':
9688 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009689 temp = formatfloat(v, flags, prec, c);
9690 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009691 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009692 pbuf = PyUnicode_AS_UNICODE(temp);
9693 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 sign = 1;
9695 if (flags & F_ZERO)
9696 fill = '0';
9697 break;
9698
9699 case 'c':
9700 pbuf = formatbuf;
9701 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9702 if (len < 0)
9703 goto onError;
9704 break;
9705
9706 default:
9707 PyErr_Format(PyExc_ValueError,
9708 "unsupported format character '%c' (0x%x) "
9709 "at index %zd",
9710 (31<=c && c<=126) ? (char)c : '?',
9711 (int)c,
9712 (Py_ssize_t)(fmt - 1 -
9713 PyUnicode_AS_UNICODE(uformat)));
9714 goto onError;
9715 }
9716 if (sign) {
9717 if (*pbuf == '-' || *pbuf == '+') {
9718 sign = *pbuf++;
9719 len--;
9720 }
9721 else if (flags & F_SIGN)
9722 sign = '+';
9723 else if (flags & F_BLANK)
9724 sign = ' ';
9725 else
9726 sign = 0;
9727 }
9728 if (width < len)
9729 width = len;
9730 if (rescnt - (sign != 0) < width) {
9731 reslen -= rescnt;
9732 rescnt = width + fmtcnt + 100;
9733 reslen += rescnt;
9734 if (reslen < 0) {
9735 Py_XDECREF(temp);
9736 PyErr_NoMemory();
9737 goto onError;
9738 }
9739 if (_PyUnicode_Resize(&result, reslen) < 0) {
9740 Py_XDECREF(temp);
9741 goto onError;
9742 }
9743 res = PyUnicode_AS_UNICODE(result)
9744 + reslen - rescnt;
9745 }
9746 if (sign) {
9747 if (fill != ' ')
9748 *res++ = sign;
9749 rescnt--;
9750 if (width > len)
9751 width--;
9752 }
9753 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9754 assert(pbuf[0] == '0');
9755 assert(pbuf[1] == c);
9756 if (fill != ' ') {
9757 *res++ = *pbuf++;
9758 *res++ = *pbuf++;
9759 }
9760 rescnt -= 2;
9761 width -= 2;
9762 if (width < 0)
9763 width = 0;
9764 len -= 2;
9765 }
9766 if (width > len && !(flags & F_LJUST)) {
9767 do {
9768 --rescnt;
9769 *res++ = fill;
9770 } while (--width > len);
9771 }
9772 if (fill == ' ') {
9773 if (sign)
9774 *res++ = sign;
9775 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9776 assert(pbuf[0] == '0');
9777 assert(pbuf[1] == c);
9778 *res++ = *pbuf++;
9779 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009780 }
9781 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009782 Py_UNICODE_COPY(res, pbuf, len);
9783 res += len;
9784 rescnt -= len;
9785 while (--width >= len) {
9786 --rescnt;
9787 *res++ = ' ';
9788 }
9789 if (dict && (argidx < arglen) && c != '%') {
9790 PyErr_SetString(PyExc_TypeError,
9791 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009792 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009793 goto onError;
9794 }
9795 Py_XDECREF(temp);
9796 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797 } /* until end */
9798 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009799 PyErr_SetString(PyExc_TypeError,
9800 "not all arguments converted during string formatting");
9801 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802 }
9803
Thomas Woutersa96affe2006-03-12 00:29:36 +00009804 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009805 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009807 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808 }
9809 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810 return (PyObject *)result;
9811
Benjamin Peterson29060642009-01-31 22:14:21 +00009812 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813 Py_XDECREF(result);
9814 Py_DECREF(uformat);
9815 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009816 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817 }
9818 return NULL;
9819}
9820
Jeremy Hylton938ace62002-07-17 16:30:39 +00009821static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009822unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9823
Tim Peters6d6c1a32001-08-02 04:15:00 +00009824static PyObject *
9825unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9826{
Benjamin Peterson29060642009-01-31 22:14:21 +00009827 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009828 static char *kwlist[] = {"object", "encoding", "errors", 0};
9829 char *encoding = NULL;
9830 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009831
Benjamin Peterson14339b62009-01-31 16:36:08 +00009832 if (type != &PyUnicode_Type)
9833 return unicode_subtype_new(type, args, kwds);
9834 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009835 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009836 return NULL;
9837 if (x == NULL)
9838 return (PyObject *)_PyUnicode_New(0);
9839 if (encoding == NULL && errors == NULL)
9840 return PyObject_Str(x);
9841 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009842 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009843}
9844
Guido van Rossume023fe02001-08-30 03:12:59 +00009845static PyObject *
9846unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9847{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848 PyUnicodeObject *tmp, *pnew;
9849 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009850
Benjamin Peterson14339b62009-01-31 16:36:08 +00009851 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9852 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9853 if (tmp == NULL)
9854 return NULL;
9855 assert(PyUnicode_Check(tmp));
9856 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9857 if (pnew == NULL) {
9858 Py_DECREF(tmp);
9859 return NULL;
9860 }
9861 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9862 if (pnew->str == NULL) {
9863 _Py_ForgetReference((PyObject *)pnew);
9864 PyObject_Del(pnew);
9865 Py_DECREF(tmp);
9866 return PyErr_NoMemory();
9867 }
9868 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9869 pnew->length = n;
9870 pnew->hash = tmp->hash;
9871 Py_DECREF(tmp);
9872 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009873}
9874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009875PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009876 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009877\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009878Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009879encoding defaults to the current default string encoding.\n\
9880errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009881
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009882static PyObject *unicode_iter(PyObject *seq);
9883
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009885 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009886 "str", /* tp_name */
9887 sizeof(PyUnicodeObject), /* tp_size */
9888 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009890 (destructor)unicode_dealloc, /* tp_dealloc */
9891 0, /* tp_print */
9892 0, /* tp_getattr */
9893 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009894 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009895 unicode_repr, /* tp_repr */
9896 &unicode_as_number, /* tp_as_number */
9897 &unicode_as_sequence, /* tp_as_sequence */
9898 &unicode_as_mapping, /* tp_as_mapping */
9899 (hashfunc) unicode_hash, /* tp_hash*/
9900 0, /* tp_call*/
9901 (reprfunc) unicode_str, /* tp_str */
9902 PyObject_GenericGetAttr, /* tp_getattro */
9903 0, /* tp_setattro */
9904 0, /* tp_as_buffer */
9905 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009906 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009907 unicode_doc, /* tp_doc */
9908 0, /* tp_traverse */
9909 0, /* tp_clear */
9910 PyUnicode_RichCompare, /* tp_richcompare */
9911 0, /* tp_weaklistoffset */
9912 unicode_iter, /* tp_iter */
9913 0, /* tp_iternext */
9914 unicode_methods, /* tp_methods */
9915 0, /* tp_members */
9916 0, /* tp_getset */
9917 &PyBaseObject_Type, /* tp_base */
9918 0, /* tp_dict */
9919 0, /* tp_descr_get */
9920 0, /* tp_descr_set */
9921 0, /* tp_dictoffset */
9922 0, /* tp_init */
9923 0, /* tp_alloc */
9924 unicode_new, /* tp_new */
9925 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926};
9927
9928/* Initialize the Unicode implementation */
9929
Thomas Wouters78890102000-07-22 19:25:51 +00009930void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009932 int i;
9933
Thomas Wouters477c8d52006-05-27 19:21:47 +00009934 /* XXX - move this array to unicodectype.c ? */
9935 Py_UNICODE linebreak[] = {
9936 0x000A, /* LINE FEED */
9937 0x000D, /* CARRIAGE RETURN */
9938 0x001C, /* FILE SEPARATOR */
9939 0x001D, /* GROUP SEPARATOR */
9940 0x001E, /* RECORD SEPARATOR */
9941 0x0085, /* NEXT LINE */
9942 0x2028, /* LINE SEPARATOR */
9943 0x2029, /* PARAGRAPH SEPARATOR */
9944 };
9945
Fred Drakee4315f52000-05-09 19:53:39 +00009946 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009947 free_list = NULL;
9948 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009950 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009951 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009952
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009953 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009954 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009955 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009956 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009957
9958 /* initialize the linebreak bloom filter */
9959 bloom_linebreak = make_bloom_mask(
9960 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9961 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009962
9963 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964}
9965
9966/* Finalize the Unicode implementation */
9967
Christian Heimesa156e092008-02-16 07:38:31 +00009968int
9969PyUnicode_ClearFreeList(void)
9970{
9971 int freelist_size = numfree;
9972 PyUnicodeObject *u;
9973
9974 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009975 PyUnicodeObject *v = u;
9976 u = *(PyUnicodeObject **)u;
9977 if (v->str)
9978 PyObject_DEL(v->str);
9979 Py_XDECREF(v->defenc);
9980 PyObject_Del(v);
9981 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009982 }
9983 free_list = NULL;
9984 assert(numfree == 0);
9985 return freelist_size;
9986}
9987
Guido van Rossumd57fd912000-03-10 22:53:23 +00009988void
Thomas Wouters78890102000-07-22 19:25:51 +00009989_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009991 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009993 Py_XDECREF(unicode_empty);
9994 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009995
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009996 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 if (unicode_latin1[i]) {
9998 Py_DECREF(unicode_latin1[i]);
9999 unicode_latin1[i] = NULL;
10000 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010001 }
Christian Heimesa156e092008-02-16 07:38:31 +000010002 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010004
Walter Dörwald16807132007-05-25 13:52:07 +000010005void
10006PyUnicode_InternInPlace(PyObject **p)
10007{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010008 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10009 PyObject *t;
10010 if (s == NULL || !PyUnicode_Check(s))
10011 Py_FatalError(
10012 "PyUnicode_InternInPlace: unicode strings only please!");
10013 /* If it's a subclass, we don't really know what putting
10014 it in the interned dict might do. */
10015 if (!PyUnicode_CheckExact(s))
10016 return;
10017 if (PyUnicode_CHECK_INTERNED(s))
10018 return;
10019 if (interned == NULL) {
10020 interned = PyDict_New();
10021 if (interned == NULL) {
10022 PyErr_Clear(); /* Don't leave an exception */
10023 return;
10024 }
10025 }
10026 /* It might be that the GetItem call fails even
10027 though the key is present in the dictionary,
10028 namely when this happens during a stack overflow. */
10029 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010031 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010032
Benjamin Peterson29060642009-01-31 22:14:21 +000010033 if (t) {
10034 Py_INCREF(t);
10035 Py_DECREF(*p);
10036 *p = t;
10037 return;
10038 }
Walter Dörwald16807132007-05-25 13:52:07 +000010039
Benjamin Peterson14339b62009-01-31 16:36:08 +000010040 PyThreadState_GET()->recursion_critical = 1;
10041 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10042 PyErr_Clear();
10043 PyThreadState_GET()->recursion_critical = 0;
10044 return;
10045 }
10046 PyThreadState_GET()->recursion_critical = 0;
10047 /* The two references in interned are not counted by refcnt.
10048 The deallocator will take care of this */
10049 Py_REFCNT(s) -= 2;
10050 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010051}
10052
10053void
10054PyUnicode_InternImmortal(PyObject **p)
10055{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010056 PyUnicode_InternInPlace(p);
10057 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10058 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10059 Py_INCREF(*p);
10060 }
Walter Dörwald16807132007-05-25 13:52:07 +000010061}
10062
10063PyObject *
10064PyUnicode_InternFromString(const char *cp)
10065{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010066 PyObject *s = PyUnicode_FromString(cp);
10067 if (s == NULL)
10068 return NULL;
10069 PyUnicode_InternInPlace(&s);
10070 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010071}
10072
10073void _Py_ReleaseInternedUnicodeStrings(void)
10074{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010075 PyObject *keys;
10076 PyUnicodeObject *s;
10077 Py_ssize_t i, n;
10078 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010079
Benjamin Peterson14339b62009-01-31 16:36:08 +000010080 if (interned == NULL || !PyDict_Check(interned))
10081 return;
10082 keys = PyDict_Keys(interned);
10083 if (keys == NULL || !PyList_Check(keys)) {
10084 PyErr_Clear();
10085 return;
10086 }
Walter Dörwald16807132007-05-25 13:52:07 +000010087
Benjamin Peterson14339b62009-01-31 16:36:08 +000010088 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10089 detector, interned unicode strings are not forcibly deallocated;
10090 rather, we give them their stolen references back, and then clear
10091 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010092
Benjamin Peterson14339b62009-01-31 16:36:08 +000010093 n = PyList_GET_SIZE(keys);
10094 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010095 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010096 for (i = 0; i < n; i++) {
10097 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10098 switch (s->state) {
10099 case SSTATE_NOT_INTERNED:
10100 /* XXX Shouldn't happen */
10101 break;
10102 case SSTATE_INTERNED_IMMORTAL:
10103 Py_REFCNT(s) += 1;
10104 immortal_size += s->length;
10105 break;
10106 case SSTATE_INTERNED_MORTAL:
10107 Py_REFCNT(s) += 2;
10108 mortal_size += s->length;
10109 break;
10110 default:
10111 Py_FatalError("Inconsistent interned string state.");
10112 }
10113 s->state = SSTATE_NOT_INTERNED;
10114 }
10115 fprintf(stderr, "total size of all interned strings: "
10116 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10117 "mortal/immortal\n", mortal_size, immortal_size);
10118 Py_DECREF(keys);
10119 PyDict_Clear(interned);
10120 Py_DECREF(interned);
10121 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010122}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010123
10124
10125/********************* Unicode Iterator **************************/
10126
10127typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010128 PyObject_HEAD
10129 Py_ssize_t it_index;
10130 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010131} unicodeiterobject;
10132
10133static void
10134unicodeiter_dealloc(unicodeiterobject *it)
10135{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010136 _PyObject_GC_UNTRACK(it);
10137 Py_XDECREF(it->it_seq);
10138 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010139}
10140
10141static int
10142unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010144 Py_VISIT(it->it_seq);
10145 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010146}
10147
10148static PyObject *
10149unicodeiter_next(unicodeiterobject *it)
10150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010151 PyUnicodeObject *seq;
10152 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010153
Benjamin Peterson14339b62009-01-31 16:36:08 +000010154 assert(it != NULL);
10155 seq = it->it_seq;
10156 if (seq == NULL)
10157 return NULL;
10158 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010159
Benjamin Peterson14339b62009-01-31 16:36:08 +000010160 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10161 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010162 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010163 if (item != NULL)
10164 ++it->it_index;
10165 return item;
10166 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010167
Benjamin Peterson14339b62009-01-31 16:36:08 +000010168 Py_DECREF(seq);
10169 it->it_seq = NULL;
10170 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010171}
10172
10173static PyObject *
10174unicodeiter_len(unicodeiterobject *it)
10175{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010176 Py_ssize_t len = 0;
10177 if (it->it_seq)
10178 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10179 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010180}
10181
10182PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10183
10184static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010185 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010186 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010187 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010188};
10189
10190PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010191 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10192 "str_iterator", /* tp_name */
10193 sizeof(unicodeiterobject), /* tp_basicsize */
10194 0, /* tp_itemsize */
10195 /* methods */
10196 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10197 0, /* tp_print */
10198 0, /* tp_getattr */
10199 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010200 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201 0, /* tp_repr */
10202 0, /* tp_as_number */
10203 0, /* tp_as_sequence */
10204 0, /* tp_as_mapping */
10205 0, /* tp_hash */
10206 0, /* tp_call */
10207 0, /* tp_str */
10208 PyObject_GenericGetAttr, /* tp_getattro */
10209 0, /* tp_setattro */
10210 0, /* tp_as_buffer */
10211 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10212 0, /* tp_doc */
10213 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10214 0, /* tp_clear */
10215 0, /* tp_richcompare */
10216 0, /* tp_weaklistoffset */
10217 PyObject_SelfIter, /* tp_iter */
10218 (iternextfunc)unicodeiter_next, /* tp_iternext */
10219 unicodeiter_methods, /* tp_methods */
10220 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010221};
10222
10223static PyObject *
10224unicode_iter(PyObject *seq)
10225{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010226 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010227
Benjamin Peterson14339b62009-01-31 16:36:08 +000010228 if (!PyUnicode_Check(seq)) {
10229 PyErr_BadInternalCall();
10230 return NULL;
10231 }
10232 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10233 if (it == NULL)
10234 return NULL;
10235 it->it_index = 0;
10236 Py_INCREF(seq);
10237 it->it_seq = (PyUnicodeObject *)seq;
10238 _PyObject_GC_TRACK(it);
10239 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010240}
10241
/* Return the number of Py_UNICODE units in u before the terminating NUL.
   The count is kept in a size_t, matching the return type, so it cannot
   overflow the way an int counter would on very long strings. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
10250
/* Copy the NUL-terminated string s2, terminator included, to s1.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
10258
/* Copy at most n Py_UNICODE units from s2 to s1, stopping after the
   terminating NUL if it is reached first.  Returns s1.

   Unlike C's strncpy() the remainder of s1 is not padded with NULs, and
   (like strncpy) s1 is left unterminated when s2 holds n or more
   characters.  With n == 0 nothing is written. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Test the budget before each store so no more than n units are
       ever written. */
    while (n-- > 0) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
10268
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010269Py_UNICODE*
10270Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10271{
10272 Py_UNICODE *u1 = s1;
10273 u1 += Py_UNICODE_strlen(u1);
10274 Py_UNICODE_strcpy(u1, s2);
10275 return s1;
10276}
10277
Martin v. Löwis5b222132007-06-10 09:51:05 +000010278int
10279Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10280{
10281 while (*s1 && *s2 && *s1 == *s2)
10282 s1++, s2++;
10283 if (*s1 && *s2)
10284 return (*s1 < *s2) ? -1 : +1;
10285 if (*s1)
10286 return 1;
10287 if (*s2)
10288 return -1;
10289 return 0;
10290}
10291
Victor Stinneref8d95c2010-08-16 22:03:11 +000010292int
10293Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10294{
10295 register Py_UNICODE u1, u2;
10296 for (; n != 0; n--) {
10297 u1 = *s1;
10298 u2 = *s2;
10299 if (u1 != u2)
10300 return (u1 < u2) ? -1 : +1;
10301 if (u1 == '\0')
10302 return 0;
10303 s1++;
10304 s2++;
10305 }
10306 return 0;
10307}
10308
Martin v. Löwis5b222132007-06-10 09:51:05 +000010309Py_UNICODE*
10310Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10311{
10312 const Py_UNICODE *p;
10313 for (p = s; *p; p++)
10314 if (*p == c)
10315 return (Py_UNICODE*)p;
10316 return NULL;
10317}
10318
Victor Stinner331ea922010-08-10 16:37:20 +000010319Py_UNICODE*
10320Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10321{
10322 const Py_UNICODE *p;
10323 p = s + Py_UNICODE_strlen(s);
10324 while (p != s) {
10325 p--;
10326 if (*p == c)
10327 return (Py_UNICODE*)p;
10328 }
10329 return NULL;
10330}
10331
Victor Stinner71133ff2010-09-01 23:43:53 +000010332Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010333PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010334{
10335 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10336 Py_UNICODE *copy;
10337 Py_ssize_t size;
10338
10339 /* Ensure we won't overflow the size. */
10340 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10341 PyErr_NoMemory();
10342 return NULL;
10343 }
10344 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10345 size *= sizeof(Py_UNICODE);
10346 copy = PyMem_Malloc(size);
10347 if (copy == NULL) {
10348 PyErr_NoMemory();
10349 return NULL;
10350 }
10351 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10352 return copy;
10353}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010354
Georg Brandl66c221e2010-10-14 07:04:07 +000010355/* A _string module, to export formatter_parser and formatter_field_name_split
10356 to the string.Formatter class implemented in Python. */
10357
10358static PyMethodDef _string_methods[] = {
10359 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10360 METH_O, PyDoc_STR("split the argument as a field name")},
10361 {"formatter_parser", (PyCFunction) formatter_parser,
10362 METH_O, PyDoc_STR("parse the argument as a format string")},
10363 {NULL, NULL}
10364};
10365
10366static struct PyModuleDef _string_module = {
10367 PyModuleDef_HEAD_INIT,
10368 "_string",
10369 PyDoc_STR("string helper module"),
10370 0,
10371 _string_methods,
10372 NULL,
10373 NULL,
10374 NULL,
10375 NULL
10376};
10377
10378PyMODINIT_FUNC
10379PyInit__string(void)
10380{
10381 return PyModule_Create(&_string_module);
10382}
10383
10384
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010385#ifdef __cplusplus
10386}
10387#endif