blob: 423a53383ae169f40a4c4ff1f6fadfc40ea87768 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters: entry i is 1
   when ASCII code i counts as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
144
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000145static PyObject *unicode_encode_call_errorhandler(const char *errors,
146 PyObject **errorHandler,const char *encoding, const char *reason,
147 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
148 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
149
Victor Stinner31be90b2010-04-22 19:38:16 +0000150static void raise_encode_exception(PyObject **exceptionObject,
151 const char *encoding,
152 const Py_UNICODE *unicode, Py_ssize_t size,
153 Py_ssize_t startpos, Py_ssize_t endpos,
154 const char *reason);
155
/* Same for linebreaks: entry i is 1 when ASCII code i is a line break. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   bits of each Unicode character (its value modulo BLOOM_WIDTH) as the
   bit index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
Antoine Pitrouf068f942010-01-13 14:19:12 +0000205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask used by BLOOM_LINEBREAK as a pre-filter for non-ASCII
   line break characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by the low
   bits of `ch` (ch modulo BLOOM_WIDTH).  `mask` is parenthesized so that
   compound expressions such as `a | b` can be passed safely; for
   BLOOM_ADD it must be a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: ASCII characters are looked up in ascii_linebreak;
   everything else is pre-filtered through bloom_linebreak before the
   Py_UNICODE_ISLINEBREAK check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* Membership test with a bloom pre-filter: the exact unicode_member()
   scan only runs when the bit for chr is set in mask.  The expansion is
   wrapped in parentheses so it stays a single operand when combined with
   operators such as `!` or `||`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
305/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000306 Ux0000 terminated; some code (e.g. new_identifier)
307 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308
309 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000310 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld. 21 characters
715 allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +0000816 {
817#ifndef Py_UNICODE_WIDE
818 int ordinal = va_arg(count, int);
819 if (ordinal > 0xffff)
820 n += 2;
821 else
822 n++;
823#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000824 (void)va_arg(count, int);
Victor Stinner659eb842011-02-23 12:14:22 +0000825 n++;
826#endif
827 break;
828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 case '%':
830 n++;
831 break;
832 case 'd': case 'u': case 'i': case 'x':
833 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834#ifdef HAVE_LONG_LONG
835 if (longlongflag) {
836 if (width < MAX_LONG_LONG_CHARS)
837 width = MAX_LONG_LONG_CHARS;
838 }
839 else
840#endif
841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842 including sign. Decimal takes the most space. This
843 isn't enough for octal. If a width is specified we
844 need more (which we allocate later). */
845 if (width < MAX_LONG_CHARS)
846 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000848 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000849 if (abuffersize < width)
850 abuffersize = width;
851 break;
852 case 's':
853 {
854 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000855 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857 if (!str)
858 goto fail;
859 n += PyUnicode_GET_SIZE(str);
860 /* Remember the str and switch to the next slot */
861 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000862 break;
863 }
864 case 'U':
865 {
866 PyObject *obj = va_arg(count, PyObject *);
867 assert(obj && PyUnicode_Check(obj));
868 n += PyUnicode_GET_SIZE(obj);
869 break;
870 }
871 case 'V':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 const char *str = va_arg(count, const char *);
875 assert(obj || str);
876 assert(!obj || PyUnicode_Check(obj));
877 if (obj)
878 n += PyUnicode_GET_SIZE(obj);
879 else
880 n += strlen(str);
881 break;
882 }
883 case 'S':
884 {
885 PyObject *obj = va_arg(count, PyObject *);
886 PyObject *str;
887 assert(obj);
888 str = PyObject_Str(obj);
889 if (!str)
890 goto fail;
891 n += PyUnicode_GET_SIZE(str);
892 /* Remember the str and switch to the next slot */
893 *callresult++ = str;
894 break;
895 }
896 case 'R':
897 {
898 PyObject *obj = va_arg(count, PyObject *);
899 PyObject *repr;
900 assert(obj);
901 repr = PyObject_Repr(obj);
902 if (!repr)
903 goto fail;
904 n += PyUnicode_GET_SIZE(repr);
905 /* Remember the repr and switch to the next slot */
906 *callresult++ = repr;
907 break;
908 }
909 case 'A':
910 {
911 PyObject *obj = va_arg(count, PyObject *);
912 PyObject *ascii;
913 assert(obj);
914 ascii = PyObject_ASCII(obj);
915 if (!ascii)
916 goto fail;
917 n += PyUnicode_GET_SIZE(ascii);
918 /* Remember the repr and switch to the next slot */
919 *callresult++ = ascii;
920 break;
921 }
922 case 'p':
923 (void) va_arg(count, int);
924 /* maximum 64-bit pointer representation:
925 * 0xffffffffffffffff
926 * so 19 characters is enough.
927 * XXX I count 18 -- what's the extra for?
928 */
929 n += 19;
930 break;
931 default:
932 /* if we stumble upon an unknown
933 formatting code, copy the rest of
934 the format string to the output
935 string. (we cannot just skip the
936 code, since there's no way to know
937 what's in the argument list) */
938 n += strlen(p);
939 goto expand;
940 }
941 } else
942 n++;
943 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000945 if (abuffersize > ITEM_BUFFER_LEN) {
946 /* add 1 for sprintf's trailing null byte */
947 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000948 if (!abuffer) {
949 PyErr_NoMemory();
950 goto fail;
951 }
952 realbuffer = abuffer;
953 }
954 else
955 realbuffer = buffer;
956 /* step 4: fill the buffer */
957 /* Since we've analyzed how much space we need for the worst case,
958 we don't have to resize the string.
959 There can be no errors beyond this point. */
960 string = PyUnicode_FromUnicode(NULL, n);
961 if (!string)
962 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000963
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 s = PyUnicode_AS_UNICODE(string);
965 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000966
Benjamin Peterson14339b62009-01-31 16:36:08 +0000967 for (f = format; *f; f++) {
968 if (*f == '%') {
969 const char* p = f++;
970 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000971 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000972 int size_tflag = 0;
973 zeropad = (*f == '0');
974 /* parse the width.precision part */
975 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000976 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000977 width = (width*10) + *f++ - '0';
978 precision = 0;
979 if (*f == '.') {
980 f++;
David Malcolm96960882010-11-05 17:23:41 +0000981 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 precision = (precision*10) + *f++ - '0';
983 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000984 /* Handle %ld, %lu, %lld and %llu. */
985 if (*f == 'l') {
986 if (f[1] == 'd' || f[1] == 'u') {
987 longflag = 1;
988 ++f;
989 }
990#ifdef HAVE_LONG_LONG
991 else if (f[1] == 'l' &&
992 (f[2] == 'd' || f[2] == 'u')) {
993 longlongflag = 1;
994 f += 2;
995 }
996#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000997 }
998 /* handle the size_t flag. */
999 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1000 size_tflag = 1;
1001 ++f;
1002 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001003
Benjamin Peterson14339b62009-01-31 16:36:08 +00001004 switch (*f) {
1005 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +00001006 {
1007 int ordinal = va_arg(vargs, int);
1008#ifndef Py_UNICODE_WIDE
1009 if (ordinal > 0xffff) {
1010 ordinal -= 0x10000;
1011 *s++ = 0xD800 | (ordinal >> 10);
1012 *s++ = 0xDC00 | (ordinal & 0x3FF);
1013 } else
1014#endif
1015 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001016 break;
Victor Stinner659eb842011-02-23 12:14:22 +00001017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001019 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1020 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 if (longflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001023#ifdef HAVE_LONG_LONG
1024 else if (longlongflag)
1025 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1026#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 else if (size_tflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1029 else
1030 sprintf(realbuffer, fmt, va_arg(vargs, int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1035 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001036 if (longflag)
1037 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001038#ifdef HAVE_LONG_LONG
1039 else if (longlongflag)
1040 sprintf(realbuffer, fmt, va_arg(vargs,
1041 unsigned PY_LONG_LONG));
1042#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 else if (size_tflag)
1044 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1045 else
1046 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1047 appendstring(realbuffer);
1048 break;
1049 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001050 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 sprintf(realbuffer, fmt, va_arg(vargs, int));
1052 appendstring(realbuffer);
1053 break;
1054 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001055 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001056 sprintf(realbuffer, fmt, va_arg(vargs, int));
1057 appendstring(realbuffer);
1058 break;
1059 case 's':
1060 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001061 /* unused, since we already have the result */
1062 (void) va_arg(vargs, char *);
1063 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1064 PyUnicode_GET_SIZE(*callresult));
1065 s += PyUnicode_GET_SIZE(*callresult);
1066 /* We're done with the unicode()/repr() => forget it */
1067 Py_DECREF(*callresult);
1068 /* switch to next unicode()/repr() result */
1069 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001070 break;
1071 }
1072 case 'U':
1073 {
1074 PyObject *obj = va_arg(vargs, PyObject *);
1075 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1076 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1077 s += size;
1078 break;
1079 }
1080 case 'V':
1081 {
1082 PyObject *obj = va_arg(vargs, PyObject *);
1083 const char *str = va_arg(vargs, const char *);
1084 if (obj) {
1085 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1086 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1087 s += size;
1088 } else {
1089 appendstring(str);
1090 }
1091 break;
1092 }
1093 case 'S':
1094 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001095 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001096 {
1097 Py_UNICODE *ucopy;
1098 Py_ssize_t usize;
1099 Py_ssize_t upos;
1100 /* unused, since we already have the result */
1101 (void) va_arg(vargs, PyObject *);
1102 ucopy = PyUnicode_AS_UNICODE(*callresult);
1103 usize = PyUnicode_GET_SIZE(*callresult);
1104 for (upos = 0; upos<usize;)
1105 *s++ = ucopy[upos++];
1106 /* We're done with the unicode()/repr() => forget it */
1107 Py_DECREF(*callresult);
1108 /* switch to next unicode()/repr() result */
1109 ++callresult;
1110 break;
1111 }
1112 case 'p':
1113 sprintf(buffer, "%p", va_arg(vargs, void*));
1114 /* %p is ill-defined: ensure leading 0x. */
1115 if (buffer[1] == 'X')
1116 buffer[1] = 'x';
1117 else if (buffer[1] != 'x') {
1118 memmove(buffer+2, buffer, strlen(buffer)+1);
1119 buffer[0] = '0';
1120 buffer[1] = 'x';
1121 }
1122 appendstring(buffer);
1123 break;
1124 case '%':
1125 *s++ = '%';
1126 break;
1127 default:
1128 appendstring(p);
1129 goto end;
1130 }
Victor Stinner1205f272010-09-11 00:54:47 +00001131 }
Victor Stinner1205f272010-09-11 00:54:47 +00001132 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001133 *s++ = *f;
1134 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001135
Benjamin Peterson29060642009-01-31 22:14:21 +00001136 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001137 if (callresults)
1138 PyObject_Free(callresults);
1139 if (abuffer)
1140 PyObject_Free(abuffer);
1141 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1142 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001143 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 if (callresults) {
1145 PyObject **callresult2 = callresults;
1146 while (callresult2 < callresult) {
1147 Py_DECREF(*callresult2);
1148 ++callresult2;
1149 }
1150 PyObject_Free(callresults);
1151 }
1152 if (abuffer)
1153 PyObject_Free(abuffer);
1154 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
1157#undef appendstring
1158
1159PyObject *
1160PyUnicode_FromFormat(const char *format, ...)
1161{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 PyObject* ret;
1163 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
1165#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001168 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001169#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001170 ret = PyUnicode_FromFormatV(format, vargs);
1171 va_end(vargs);
1172 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001173}
1174
Victor Stinner5593d8a2010-10-02 11:11:27 +00001175/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1176 convert a Unicode object to a wide character string.
1177
1178 - If w is NULL: return the number of wide characters (including the nul
1179 character) required to convert the unicode object. Ignore size argument.
1180
1181 - Otherwise: return the number of wide characters (excluding the nul
1182 character) written into w. Write at most size wide characters (including
1183 the nul character). */
1184static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001185unicode_aswidechar(PyUnicodeObject *unicode,
1186 wchar_t *w,
1187 Py_ssize_t size)
1188{
1189#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001190 Py_ssize_t res;
1191 if (w != NULL) {
1192 res = PyUnicode_GET_SIZE(unicode);
1193 if (size > res)
1194 size = res + 1;
1195 else
1196 res = size;
1197 memcpy(w, unicode->str, size * sizeof(wchar_t));
1198 return res;
1199 }
1200 else
1201 return PyUnicode_GET_SIZE(unicode) + 1;
1202#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1203 register const Py_UNICODE *u;
1204 const Py_UNICODE *uend;
1205 const wchar_t *worig, *wend;
1206 Py_ssize_t nchar;
1207
Victor Stinner137c34c2010-09-29 10:25:54 +00001208 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001209 uend = u + PyUnicode_GET_SIZE(unicode);
1210 if (w != NULL) {
1211 worig = w;
1212 wend = w + size;
1213 while (u != uend && w != wend) {
1214 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216 {
1217 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1218 u += 2;
1219 }
1220 else {
1221 *w = *u;
1222 u++;
1223 }
1224 w++;
1225 }
1226 if (w != wend)
1227 *w = L'\0';
1228 return w - worig;
1229 }
1230 else {
1231 nchar = 1; /* nul character at the end */
1232 while (u != uend) {
1233 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1234 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1235 u += 2;
1236 else
1237 u++;
1238 nchar++;
1239 }
1240 }
1241 return nchar;
1242#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1243 register Py_UNICODE *u, *uend, ordinal;
1244 register Py_ssize_t i;
1245 wchar_t *worig, *wend;
1246 Py_ssize_t nchar;
1247
1248 u = PyUnicode_AS_UNICODE(unicode);
1249 uend = u + PyUnicode_GET_SIZE(u);
1250 if (w != NULL) {
1251 worig = w;
1252 wend = w + size;
1253 while (u != uend && w != wend) {
1254 ordinal = *u;
1255 if (ordinal > 0xffff) {
1256 ordinal -= 0x10000;
1257 *w++ = 0xD800 | (ordinal >> 10);
1258 *w++ = 0xDC00 | (ordinal & 0x3FF);
1259 }
1260 else
1261 *w++ = ordinal;
1262 u++;
1263 }
1264 if (w != wend)
1265 *w = 0;
1266 return w - worig;
1267 }
1268 else {
1269 nchar = 1; /* nul character */
1270 while (u != uend) {
1271 if (*u > 0xffff)
1272 nchar += 2;
1273 else
1274 nchar++;
1275 u++;
1276 }
1277 return nchar;
1278 }
1279#else
1280# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001281#endif
1282}
1283
1284Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001285PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001286 wchar_t *w,
1287 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288{
1289 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001290 PyErr_BadInternalCall();
1291 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001293 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
Victor Stinner137c34c2010-09-29 10:25:54 +00001296wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001297PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001298 Py_ssize_t *size)
1299{
1300 wchar_t* buffer;
1301 Py_ssize_t buflen;
1302
1303 if (unicode == NULL) {
1304 PyErr_BadInternalCall();
1305 return NULL;
1306 }
1307
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001308 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001309 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001310 PyErr_NoMemory();
1311 return NULL;
1312 }
1313
Victor Stinner137c34c2010-09-29 10:25:54 +00001314 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1315 if (buffer == NULL) {
1316 PyErr_NoMemory();
1317 return NULL;
1318 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001319 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001320 if (size != NULL)
1321 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001322 return buffer;
1323}
1324
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325#endif
1326
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001327PyObject *PyUnicode_FromOrdinal(int ordinal)
1328{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001329 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001330
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001331 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001332 PyErr_SetString(PyExc_ValueError,
1333 "chr() arg not in range(0x110000)");
1334 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001335 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001336
1337#ifndef Py_UNICODE_WIDE
1338 if (ordinal > 0xffff) {
1339 ordinal -= 0x10000;
1340 s[0] = 0xD800 | (ordinal >> 10);
1341 s[1] = 0xDC00 | (ordinal & 0x3FF);
1342 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 }
1344#endif
1345
Hye-Shik Chang40574832004-04-06 07:24:51 +00001346 s[0] = (Py_UNICODE)ordinal;
1347 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001348}
1349
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350PyObject *PyUnicode_FromObject(register PyObject *obj)
1351{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001352 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001354 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001355 Py_INCREF(obj);
1356 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001357 }
1358 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 /* For a Unicode subtype that's not a Unicode object,
1360 return a true Unicode object with the same data. */
1361 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1362 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001363 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001364 PyErr_Format(PyExc_TypeError,
1365 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001366 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001367 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001368}
1369
1370PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 const char *encoding,
1372 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001373{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001374 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001375 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001376
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001378 PyErr_BadInternalCall();
1379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001381
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001382 /* Decoding bytes objects is the most common case and should be fast */
1383 if (PyBytes_Check(obj)) {
1384 if (PyBytes_GET_SIZE(obj) == 0) {
1385 Py_INCREF(unicode_empty);
1386 v = (PyObject *) unicode_empty;
1387 }
1388 else {
1389 v = PyUnicode_Decode(
1390 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1391 encoding, errors);
1392 }
1393 return v;
1394 }
1395
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001396 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 PyErr_SetString(PyExc_TypeError,
1398 "decoding str is not supported");
1399 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001400 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001401
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001402 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1403 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1404 PyErr_Format(PyExc_TypeError,
1405 "coercing to str: need bytes, bytearray "
1406 "or buffer-like object, %.80s found",
1407 Py_TYPE(obj)->tp_name);
1408 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001409 }
Tim Petersced69f82003-09-16 20:30:58 +00001410
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001411 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001412 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001413 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 }
Tim Petersced69f82003-09-16 20:30:58 +00001415 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001416 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001417
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001418 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001419 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420}
1421
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   are converted to lower case and '_' is replaced with '-', in order to
   catch e.g. UTF_8.  `lower` is a buffer of `lower_len` bytes.

   Return 1 on success (lower is nul-terminated), 0 if the encoding name is
   longer than lower_len-1 characters (lower is then left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last usable slot, reserved for the terminating nul */
    char *const last = &lower[lower_len - 1];

    for (; *src != '\0'; src++) {
        char ch;

        if (dst == last)
            return 0;

        ch = *src;
        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1454
1455PyObject *PyUnicode_Decode(const char *s,
1456 Py_ssize_t size,
1457 const char *encoding,
1458 const char *errors)
1459{
1460 PyObject *buffer = NULL, *unicode;
1461 Py_buffer info;
1462 char lower[11]; /* Enough for any encoding shortcut */
1463
1464 if (encoding == NULL)
1465 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001466
1467 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001468 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1469 if (strcmp(lower, "utf-8") == 0)
1470 return PyUnicode_DecodeUTF8(s, size, errors);
1471 else if ((strcmp(lower, "latin-1") == 0) ||
1472 (strcmp(lower, "iso-8859-1") == 0))
1473 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001474#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001475 else if (strcmp(lower, "mbcs") == 0)
1476 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001477#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001478 else if (strcmp(lower, "ascii") == 0)
1479 return PyUnicode_DecodeASCII(s, size, errors);
1480 else if (strcmp(lower, "utf-16") == 0)
1481 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1482 else if (strcmp(lower, "utf-32") == 0)
1483 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485
1486 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001487 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001488 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001489 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001490 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 if (buffer == NULL)
1492 goto onError;
1493 unicode = PyCodec_Decode(buffer, encoding, errors);
1494 if (unicode == NULL)
1495 goto onError;
1496 if (!PyUnicode_Check(unicode)) {
1497 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001498 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001499 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 Py_DECREF(unicode);
1501 goto onError;
1502 }
1503 Py_DECREF(buffer);
1504 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001505
Benjamin Peterson29060642009-01-31 22:14:21 +00001506 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 Py_XDECREF(buffer);
1508 return NULL;
1509}
1510
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001511PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1512 const char *encoding,
1513 const char *errors)
1514{
1515 PyObject *v;
1516
1517 if (!PyUnicode_Check(unicode)) {
1518 PyErr_BadArgument();
1519 goto onError;
1520 }
1521
1522 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001524
1525 /* Decode via the codec registry */
1526 v = PyCodec_Decode(unicode, encoding, errors);
1527 if (v == NULL)
1528 goto onError;
1529 return v;
1530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001532 return NULL;
1533}
1534
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001535PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1536 const char *encoding,
1537 const char *errors)
1538{
1539 PyObject *v;
1540
1541 if (!PyUnicode_Check(unicode)) {
1542 PyErr_BadArgument();
1543 goto onError;
1544 }
1545
1546 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001547 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001548
1549 /* Decode via the codec registry */
1550 v = PyCodec_Decode(unicode, encoding, errors);
1551 if (v == NULL)
1552 goto onError;
1553 if (!PyUnicode_Check(v)) {
1554 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001555 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001556 Py_TYPE(v)->tp_name);
1557 Py_DECREF(v);
1558 goto onError;
1559 }
1560 return v;
1561
Benjamin Peterson29060642009-01-31 22:14:21 +00001562 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001563 return NULL;
1564}
1565
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 Py_ssize_t size,
1568 const char *encoding,
1569 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570{
1571 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001572
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573 unicode = PyUnicode_FromUnicode(s, size);
1574 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001576 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1577 Py_DECREF(unicode);
1578 return v;
1579}
1580
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001581PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1582 const char *encoding,
1583 const char *errors)
1584{
1585 PyObject *v;
1586
1587 if (!PyUnicode_Check(unicode)) {
1588 PyErr_BadArgument();
1589 goto onError;
1590 }
1591
1592 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001594
1595 /* Encode via the codec registry */
1596 v = PyCodec_Encode(unicode, encoding, errors);
1597 if (v == NULL)
1598 goto onError;
1599 return v;
1600
Benjamin Peterson29060642009-01-31 22:14:21 +00001601 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001602 return NULL;
1603}
1604
Victor Stinnerad158722010-10-27 00:25:46 +00001605PyObject *
1606PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001607{
Victor Stinner313a1202010-06-11 23:56:51 +00001608#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001609 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1610 PyUnicode_GET_SIZE(unicode),
1611 NULL);
1612#elif defined(__APPLE__)
1613 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1614 PyUnicode_GET_SIZE(unicode),
1615 "surrogateescape");
1616#else
1617 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001618 return PyUnicode_AsEncodedString(unicode,
1619 Py_FileSystemDefaultEncoding,
1620 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001621 }
1622 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001623 /* locale encoding with surrogateescape */
1624 wchar_t *wchar;
1625 char *bytes;
1626 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001627 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001628
1629 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1630 if (wchar == NULL)
1631 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001632 bytes = _Py_wchar2char(wchar, &error_pos);
1633 if (bytes == NULL) {
1634 if (error_pos != (size_t)-1) {
1635 char *errmsg = strerror(errno);
1636 PyObject *exc = NULL;
1637 if (errmsg == NULL)
1638 errmsg = "Py_wchar2char() failed";
1639 raise_encode_exception(&exc,
1640 "filesystemencoding",
1641 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1642 error_pos, error_pos+1,
1643 errmsg);
1644 Py_XDECREF(exc);
1645 }
1646 else
1647 PyErr_NoMemory();
1648 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001649 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001650 }
1651 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001652
1653 bytes_obj = PyBytes_FromString(bytes);
1654 PyMem_Free(bytes);
1655 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001656 }
Victor Stinnerad158722010-10-27 00:25:46 +00001657#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001658}
1659
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1661 const char *encoding,
1662 const char *errors)
1663{
1664 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001665 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001666
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667 if (!PyUnicode_Check(unicode)) {
1668 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 }
Fred Drakee4315f52000-05-09 19:53:39 +00001671
Tim Petersced69f82003-09-16 20:30:58 +00001672 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001673 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001674
1675 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001676 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1677 if (strcmp(lower, "utf-8") == 0)
1678 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1679 PyUnicode_GET_SIZE(unicode),
1680 errors);
1681 else if ((strcmp(lower, "latin-1") == 0) ||
1682 (strcmp(lower, "iso-8859-1") == 0))
1683 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1684 PyUnicode_GET_SIZE(unicode),
1685 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001686#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001687 else if (strcmp(lower, "mbcs") == 0)
1688 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1689 PyUnicode_GET_SIZE(unicode),
1690 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001691#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001692 else if (strcmp(lower, "ascii") == 0)
1693 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1694 PyUnicode_GET_SIZE(unicode),
1695 errors);
1696 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001697 /* During bootstrap, we may need to find the encodings
1698 package, to load the file system encoding, and require the
1699 file system encoding in order to load the encodings
1700 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001701
Victor Stinner59e62db2010-05-15 13:14:32 +00001702 Break out of this dependency by assuming that the path to
1703 the encodings module is ASCII-only. XXX could try wcstombs
1704 instead, if the file system encoding is the locale's
1705 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001706 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001707 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1708 !PyThreadState_GET()->interp->codecs_initialized)
1709 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1710 PyUnicode_GET_SIZE(unicode),
1711 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712
1713 /* Encode via the codec registry */
1714 v = PyCodec_Encode(unicode, encoding, errors);
1715 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001716 return NULL;
1717
1718 /* The normal path */
1719 if (PyBytes_Check(v))
1720 return v;
1721
1722 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001723 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001724 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001725 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001726
1727 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1728 "encoder %s returned bytearray instead of bytes",
1729 encoding);
1730 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001731 Py_DECREF(v);
1732 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001733 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001734
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001735 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1736 Py_DECREF(v);
1737 return b;
1738 }
1739
1740 PyErr_Format(PyExc_TypeError,
1741 "encoder did not return a bytes object (type=%.400s)",
1742 Py_TYPE(v)->tp_name);
1743 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001744 return NULL;
1745}
1746
1747PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1748 const char *encoding,
1749 const char *errors)
1750{
1751 PyObject *v;
1752
1753 if (!PyUnicode_Check(unicode)) {
1754 PyErr_BadArgument();
1755 goto onError;
1756 }
1757
1758 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001759 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001760
1761 /* Encode via the codec registry */
1762 v = PyCodec_Encode(unicode, encoding, errors);
1763 if (v == NULL)
1764 goto onError;
1765 if (!PyUnicode_Check(v)) {
1766 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001767 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001768 Py_TYPE(v)->tp_name);
1769 Py_DECREF(v);
1770 goto onError;
1771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001773
Benjamin Peterson29060642009-01-31 22:14:21 +00001774 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 return NULL;
1776}
1777
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001778PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001779 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001780{
1781 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001782 if (v)
1783 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001784 if (errors != NULL)
1785 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001786 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001787 PyUnicode_GET_SIZE(unicode),
1788 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001789 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001790 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001791 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001792 return v;
1793}
1794
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001795PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001796PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001797 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001798 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1799}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001800
Christian Heimes5894ba72007-11-04 11:43:14 +00001801PyObject*
1802PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1803{
Victor Stinnerad158722010-10-27 00:25:46 +00001804#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1805 return PyUnicode_DecodeMBCS(s, size, NULL);
1806#elif defined(__APPLE__)
1807 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1808#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001809 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1810 can be undefined. If it is case, decode using UTF-8. The following assumes
1811 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1812 bootstrapping process where the codecs aren't ready yet.
1813 */
1814 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001815 return PyUnicode_Decode(s, size,
1816 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001817 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001818 }
1819 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001820 /* locale encoding with surrogateescape */
1821 wchar_t *wchar;
1822 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001823 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001824
1825 if (s[size] != '\0' || size != strlen(s)) {
1826 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1827 return NULL;
1828 }
1829
Victor Stinner168e1172010-10-16 23:16:16 +00001830 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001831 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001832 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001833
Victor Stinner168e1172010-10-16 23:16:16 +00001834 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001835 PyMem_Free(wchar);
1836 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001837 }
Victor Stinnerad158722010-10-27 00:25:46 +00001838#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001839}
1840
Martin v. Löwis011e8422009-05-05 04:43:17 +00001841
1842int
1843PyUnicode_FSConverter(PyObject* arg, void* addr)
1844{
1845 PyObject *output = NULL;
1846 Py_ssize_t size;
1847 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001848 if (arg == NULL) {
1849 Py_DECREF(*(PyObject**)addr);
1850 return 1;
1851 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001852 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001853 output = arg;
1854 Py_INCREF(output);
1855 }
1856 else {
1857 arg = PyUnicode_FromObject(arg);
1858 if (!arg)
1859 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001860 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001861 Py_DECREF(arg);
1862 if (!output)
1863 return 0;
1864 if (!PyBytes_Check(output)) {
1865 Py_DECREF(output);
1866 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1867 return 0;
1868 }
1869 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001870 size = PyBytes_GET_SIZE(output);
1871 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001872 if (size != strlen(data)) {
1873 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1874 Py_DECREF(output);
1875 return 0;
1876 }
1877 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001878 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001879}
1880
1881
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001882int
1883PyUnicode_FSDecoder(PyObject* arg, void* addr)
1884{
1885 PyObject *output = NULL;
1886 Py_ssize_t size;
1887 void *data;
1888 if (arg == NULL) {
1889 Py_DECREF(*(PyObject**)addr);
1890 return 1;
1891 }
1892 if (PyUnicode_Check(arg)) {
1893 output = arg;
1894 Py_INCREF(output);
1895 }
1896 else {
1897 arg = PyBytes_FromObject(arg);
1898 if (!arg)
1899 return 0;
1900 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1901 PyBytes_GET_SIZE(arg));
1902 Py_DECREF(arg);
1903 if (!output)
1904 return 0;
1905 if (!PyUnicode_Check(output)) {
1906 Py_DECREF(output);
1907 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1908 return 0;
1909 }
1910 }
1911 size = PyUnicode_GET_SIZE(output);
1912 data = PyUnicode_AS_UNICODE(output);
1913 if (size != Py_UNICODE_strlen(data)) {
1914 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1915 Py_DECREF(output);
1916 return 0;
1917 }
1918 *(PyObject**)addr = output;
1919 return Py_CLEANUP_SUPPORTED;
1920}
1921
1922
Martin v. Löwis5b222132007-06-10 09:51:05 +00001923char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001924_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001925{
Christian Heimesf3863112007-11-22 07:46:41 +00001926 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001927 if (!PyUnicode_Check(unicode)) {
1928 PyErr_BadArgument();
1929 return NULL;
1930 }
Christian Heimesf3863112007-11-22 07:46:41 +00001931 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1932 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001933 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001934 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001935 *psize = PyBytes_GET_SIZE(bytes);
1936 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001937}
1938
1939char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001940_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001941{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001942 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001943}
1944
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1946{
1947 if (!PyUnicode_Check(unicode)) {
1948 PyErr_BadArgument();
1949 goto onError;
1950 }
1951 return PyUnicode_AS_UNICODE(unicode);
1952
Benjamin Peterson29060642009-01-31 22:14:21 +00001953 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 return NULL;
1955}
1956
Martin v. Löwis18e16552006-02-15 17:27:45 +00001957Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001958{
1959 if (!PyUnicode_Check(unicode)) {
1960 PyErr_BadArgument();
1961 goto onError;
1962 }
1963 return PyUnicode_GET_SIZE(unicode);
1964
Benjamin Peterson29060642009-01-31 22:14:21 +00001965 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 return -1;
1967}
1968
/* Return the name of the default encoding, which is fixed to "utf-8".
   The returned string has static storage and must not be modified. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1973
Victor Stinner554f3f02010-06-16 23:33:54 +00001974/* create or adjust a UnicodeDecodeError */
1975static void
1976make_decode_exception(PyObject **exceptionObject,
1977 const char *encoding,
1978 const char *input, Py_ssize_t length,
1979 Py_ssize_t startpos, Py_ssize_t endpos,
1980 const char *reason)
1981{
1982 if (*exceptionObject == NULL) {
1983 *exceptionObject = PyUnicodeDecodeError_Create(
1984 encoding, input, length, startpos, endpos, reason);
1985 }
1986 else {
1987 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1988 goto onError;
1989 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1990 goto onError;
1991 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1992 goto onError;
1993 }
1994 return;
1995
1996onError:
1997 Py_DECREF(*exceptionObject);
1998 *exceptionObject = NULL;
1999}
2000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002001/* error handling callback helper:
2002 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002003 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004 and adjust various state variables.
2005 return 0 on success, -1 on error
2006*/
2007
2008static
2009int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002010 const char *encoding, const char *reason,
2011 const char **input, const char **inend, Py_ssize_t *startinpos,
2012 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2013 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002014{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002015 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016
2017 PyObject *restuple = NULL;
2018 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002019 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002020 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002021 Py_ssize_t requiredsize;
2022 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002023 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002024 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002025 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026 int res = -1;
2027
2028 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002029 *errorHandler = PyCodec_LookupError(errors);
2030 if (*errorHandler == NULL)
2031 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032 }
2033
Victor Stinner554f3f02010-06-16 23:33:54 +00002034 make_decode_exception(exceptionObject,
2035 encoding,
2036 *input, *inend - *input,
2037 *startinpos, *endinpos,
2038 reason);
2039 if (*exceptionObject == NULL)
2040 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041
2042 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2043 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002044 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002046 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 }
2049 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002050 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002051
2052 /* Copy back the bytes variables, which might have been modified by the
2053 callback */
2054 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2055 if (!inputobj)
2056 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002057 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002058 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002059 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002060 *input = PyBytes_AS_STRING(inputobj);
2061 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002062 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002063 /* we can DECREF safely, as the exception has another reference,
2064 so the object won't go away. */
2065 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002068 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002069 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002070 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2071 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002072 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002073
2074 /* need more space? (at least enough for what we
2075 have+the replacement+the rest of the string (starting
2076 at the new input position), so we won't have to check space
2077 when there are no errors in the rest of the string) */
2078 repptr = PyUnicode_AS_UNICODE(repunicode);
2079 repsize = PyUnicode_GET_SIZE(repunicode);
2080 requiredsize = *outpos + repsize + insize-newpos;
2081 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 if (requiredsize<2*outsize)
2083 requiredsize = 2*outsize;
2084 if (_PyUnicode_Resize(output, requiredsize) < 0)
2085 goto onError;
2086 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002087 }
2088 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002089 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002090 Py_UNICODE_COPY(*outptr, repptr, repsize);
2091 *outptr += repsize;
2092 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002093
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 /* we made it! */
2095 res = 0;
2096
Benjamin Peterson29060642009-01-31 22:14:21 +00002097 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002098 Py_XDECREF(restuple);
2099 return res;
2100}
2101
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002102/* --- UTF-7 Codec -------------------------------------------------------- */
2103
Antoine Pitrou244651a2009-05-04 18:56:13 +00002104/* See RFC2152 for details. We encode conservatively and decode liberally. */
2105
/* Three simple macros defining base-64, plus the direct-decoding test.
   Every macro evaluates its argument more than once (TO_BASE64 excepted),
   so arguments must be free of side effects. */

/* Is c a base-64 character?  (A-Z, a-z, 0-9, '+' or '/') */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ((c) >= '0' && (c) <= '9') ||              \
     ((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z'))

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else (i.e. '/') -> 63 */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
2136
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127).
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        character code; only 1..127 can ever be direct.
 * directO:  non-zero to emit Set O characters as themselves.
 * directWS: non-zero to emit whitespace characters as themselves.
 *
 * directO and directWS are parenthesized in the expansion so that
 * expression arguments (e.g. `a || b`) bind as a unit instead of being
 * split by the surrounding `&&`.  c is evaluated several times, so it
 * must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002182
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002183PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002184 Py_ssize_t size,
2185 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002186{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002187 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2188}
2189
Antoine Pitrou244651a2009-05-04 18:56:13 +00002190/* The decoder. The only state we preserve is our read position,
2191 * i.e. how many characters we have consumed. So if we end in the
2192 * middle of a shift sequence we have to back off the read position
2193 * and the output to the beginning of the sequence, otherwise we lose
2194 * all the shift state (seen bits, number of bits seen, high
2195 * surrogate). */
2196
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002197PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002198 Py_ssize_t size,
2199 const char *errors,
2200 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002201{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002202 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002203 Py_ssize_t startinpos;
2204 Py_ssize_t endinpos;
2205 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206 const char *e;
2207 PyUnicodeObject *unicode;
2208 Py_UNICODE *p;
2209 const char *errmsg = "";
2210 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002211 Py_UNICODE *shiftOutStart;
2212 unsigned int base64bits = 0;
2213 unsigned long base64buffer = 0;
2214 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002215 PyObject *errorHandler = NULL;
2216 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002217
2218 unicode = _PyUnicode_New(size);
2219 if (!unicode)
2220 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002221 if (size == 0) {
2222 if (consumed)
2223 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002224 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002225 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226
2227 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002228 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002229 e = s + size;
2230
2231 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002232 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002233 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002234 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002235
Antoine Pitrou244651a2009-05-04 18:56:13 +00002236 if (inShift) { /* in a base-64 section */
2237 if (IS_BASE64(ch)) { /* consume a base-64 character */
2238 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2239 base64bits += 6;
2240 s++;
2241 if (base64bits >= 16) {
2242 /* we have enough bits for a UTF-16 value */
2243 Py_UNICODE outCh = (Py_UNICODE)
2244 (base64buffer >> (base64bits-16));
2245 base64bits -= 16;
2246 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2247 if (surrogate) {
2248 /* expecting a second surrogate */
2249 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2250#ifdef Py_UNICODE_WIDE
2251 *p++ = (((surrogate & 0x3FF)<<10)
2252 | (outCh & 0x3FF)) + 0x10000;
2253#else
2254 *p++ = surrogate;
2255 *p++ = outCh;
2256#endif
2257 surrogate = 0;
2258 }
2259 else {
2260 surrogate = 0;
2261 errmsg = "second surrogate missing";
2262 goto utf7Error;
2263 }
2264 }
2265 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2266 /* first surrogate */
2267 surrogate = outCh;
2268 }
2269 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2270 errmsg = "unexpected second surrogate";
2271 goto utf7Error;
2272 }
2273 else {
2274 *p++ = outCh;
2275 }
2276 }
2277 }
2278 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002279 inShift = 0;
2280 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002281 if (surrogate) {
2282 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002283 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002284 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002285 if (base64bits > 0) { /* left-over bits */
2286 if (base64bits >= 6) {
2287 /* We've seen at least one base-64 character */
2288 errmsg = "partial character in shift sequence";
2289 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002290 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002291 else {
2292 /* Some bits remain; they should be zero */
2293 if (base64buffer != 0) {
2294 errmsg = "non-zero padding bits in shift sequence";
2295 goto utf7Error;
2296 }
2297 }
2298 }
2299 if (ch != '-') {
2300 /* '-' is absorbed; other terminating
2301 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002302 *p++ = ch;
2303 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002304 }
2305 }
2306 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002307 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002308 s++; /* consume '+' */
2309 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002310 s++;
2311 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002312 }
2313 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002314 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002315 shiftOutStart = p;
2316 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002317 }
2318 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002319 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002320 *p++ = ch;
2321 s++;
2322 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002323 else {
2324 startinpos = s-starts;
2325 s++;
2326 errmsg = "unexpected special character";
2327 goto utf7Error;
2328 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002329 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002330utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002331 outpos = p-PyUnicode_AS_UNICODE(unicode);
2332 endinpos = s-starts;
2333 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002334 errors, &errorHandler,
2335 "utf7", errmsg,
2336 &starts, &e, &startinpos, &endinpos, &exc, &s,
2337 &unicode, &outpos, &p))
2338 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002339 }
2340
Antoine Pitrou244651a2009-05-04 18:56:13 +00002341 /* end of string */
2342
2343 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2344 /* if we're in an inconsistent state, that's an error */
2345 if (surrogate ||
2346 (base64bits >= 6) ||
2347 (base64bits > 0 && base64buffer != 0)) {
2348 outpos = p-PyUnicode_AS_UNICODE(unicode);
2349 endinpos = size;
2350 if (unicode_decode_call_errorhandler(
2351 errors, &errorHandler,
2352 "utf7", "unterminated shift sequence",
2353 &starts, &e, &startinpos, &endinpos, &exc, &s,
2354 &unicode, &outpos, &p))
2355 goto onError;
2356 if (s < e)
2357 goto restart;
2358 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002359 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002360
2361 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002362 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002363 if (inShift) {
2364 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002365 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002366 }
2367 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002368 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002369 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002370 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002371
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002372 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002373 goto onError;
2374
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002375 Py_XDECREF(errorHandler);
2376 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002377 return (PyObject *)unicode;
2378
Benjamin Peterson29060642009-01-31 22:14:21 +00002379 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002380 Py_XDECREF(errorHandler);
2381 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002382 Py_DECREF(unicode);
2383 return NULL;
2384}
2385
2386
2387PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002388 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002389 int base64SetO,
2390 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002391 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002392{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002393 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002394 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002395 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002396 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002397 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002398 unsigned int base64bits = 0;
2399 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002400 char * out;
2401 char * start;
2402
2403 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002404 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002405
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002406 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002407 return PyErr_NoMemory();
2408
Antoine Pitrou244651a2009-05-04 18:56:13 +00002409 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002410 if (v == NULL)
2411 return NULL;
2412
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002413 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002414 for (;i < size; ++i) {
2415 Py_UNICODE ch = s[i];
2416
Antoine Pitrou244651a2009-05-04 18:56:13 +00002417 if (inShift) {
2418 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2419 /* shifting out */
2420 if (base64bits) { /* output remaining bits */
2421 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2422 base64buffer = 0;
2423 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002424 }
2425 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002426 /* Characters not in the BASE64 set implicitly unshift the sequence
2427 so no '-' is required, except if the character is itself a '-' */
2428 if (IS_BASE64(ch) || ch == '-') {
2429 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002430 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002431 *out++ = (char) ch;
2432 }
2433 else {
2434 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002435 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002436 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002437 else { /* not in a shift sequence */
2438 if (ch == '+') {
2439 *out++ = '+';
2440 *out++ = '-';
2441 }
2442 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2443 *out++ = (char) ch;
2444 }
2445 else {
2446 *out++ = '+';
2447 inShift = 1;
2448 goto encode_char;
2449 }
2450 }
2451 continue;
2452encode_char:
2453#ifdef Py_UNICODE_WIDE
2454 if (ch >= 0x10000) {
2455 /* code first surrogate */
2456 base64bits += 16;
2457 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2458 while (base64bits >= 6) {
2459 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2460 base64bits -= 6;
2461 }
2462 /* prepare second surrogate */
2463 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2464 }
2465#endif
2466 base64bits += 16;
2467 base64buffer = (base64buffer << 16) | ch;
2468 while (base64bits >= 6) {
2469 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2470 base64bits -= 6;
2471 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002472 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002473 if (base64bits)
2474 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2475 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002476 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002477 if (_PyBytes_Resize(&v, out - start) < 0)
2478 return NULL;
2479 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002480}
2481
Antoine Pitrou244651a2009-05-04 18:56:13 +00002482#undef IS_BASE64
2483#undef FROM_BASE64
2484#undef TO_BASE64
2485#undef DECODE_DIRECT
2486#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002487
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488/* --- UTF-8 Codec -------------------------------------------------------- */
2489
/* Map a UTF-8 encoded prefix (first) byte to the total length of the
   sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong lead bytes C0-C1, and F5-FF.  See RFC 3629 for
   details.  The table is read-only, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2511
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002513 Py_ssize_t size,
2514 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515{
Walter Dörwald69652032004-09-07 20:24:22 +00002516 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2517}
2518
Antoine Pitrouab868312009-01-10 15:40:25 +00002519/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2520#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2521
2522/* Mask to quickly check whether a C 'long' contains a
2523 non-ASCII, UTF8-encoded char. */
2524#if (SIZEOF_LONG == 8)
2525# define ASCII_CHAR_MASK 0x8080808080808080L
2526#elif (SIZEOF_LONG == 4)
2527# define ASCII_CHAR_MASK 0x80808080L
2528#else
2529# error C 'long' size should be either 4 or 8!
2530#endif
2531
Walter Dörwald69652032004-09-07 20:24:22 +00002532PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 Py_ssize_t size,
2534 const char *errors,
2535 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002537 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002539 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002540 Py_ssize_t startinpos;
2541 Py_ssize_t endinpos;
2542 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002543 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544 PyUnicodeObject *unicode;
2545 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002546 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002547 PyObject *errorHandler = NULL;
2548 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549
2550 /* Note: size will always be longer than the resulting Unicode
2551 character count */
2552 unicode = _PyUnicode_New(size);
2553 if (!unicode)
2554 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002555 if (size == 0) {
2556 if (consumed)
2557 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560
2561 /* Unpack UTF-8 encoded data */
2562 p = unicode->str;
2563 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002564 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002565
2566 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002567 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568
2569 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002570 /* Fast path for runs of ASCII characters. Given that common UTF-8
2571 input will consist of an overwhelming majority of ASCII
2572 characters, we try to optimize for this case by checking
2573 as many characters as a C 'long' can contain.
2574 First, check if we can do an aligned read, as most CPUs have
2575 a penalty for unaligned reads.
2576 */
2577 if (!((size_t) s & LONG_PTR_MASK)) {
2578 /* Help register allocation */
2579 register const char *_s = s;
2580 register Py_UNICODE *_p = p;
2581 while (_s < aligned_end) {
2582 /* Read a whole long at a time (either 4 or 8 bytes),
2583 and do a fast unrolled copy if it only contains ASCII
2584 characters. */
2585 unsigned long data = *(unsigned long *) _s;
2586 if (data & ASCII_CHAR_MASK)
2587 break;
2588 _p[0] = (unsigned char) _s[0];
2589 _p[1] = (unsigned char) _s[1];
2590 _p[2] = (unsigned char) _s[2];
2591 _p[3] = (unsigned char) _s[3];
2592#if (SIZEOF_LONG == 8)
2593 _p[4] = (unsigned char) _s[4];
2594 _p[5] = (unsigned char) _s[5];
2595 _p[6] = (unsigned char) _s[6];
2596 _p[7] = (unsigned char) _s[7];
2597#endif
2598 _s += SIZEOF_LONG;
2599 _p += SIZEOF_LONG;
2600 }
2601 s = _s;
2602 p = _p;
2603 if (s == e)
2604 break;
2605 ch = (unsigned char)*s;
2606 }
2607 }
2608
2609 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002610 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 s++;
2612 continue;
2613 }
2614
2615 n = utf8_code_length[ch];
2616
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002617 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002618 if (consumed)
2619 break;
2620 else {
2621 errmsg = "unexpected end of data";
2622 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002623 endinpos = startinpos+1;
2624 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2625 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002626 goto utf8Error;
2627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629
2630 switch (n) {
2631
2632 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002633 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002634 startinpos = s-starts;
2635 endinpos = startinpos+1;
2636 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637
2638 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002639 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002640 startinpos = s-starts;
2641 endinpos = startinpos+1;
2642 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643
2644 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002645 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002646 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002647 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002648 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002649 goto utf8Error;
2650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002652 assert ((ch > 0x007F) && (ch <= 0x07FF));
2653 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 break;
2655
2656 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002657 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2658 will result in surrogates in range d800-dfff. Surrogates are
2659 not valid UTF-8 so they are rejected.
2660 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2661 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002662 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002663 (s[2] & 0xc0) != 0x80 ||
2664 ((unsigned char)s[0] == 0xE0 &&
2665 (unsigned char)s[1] < 0xA0) ||
2666 ((unsigned char)s[0] == 0xED &&
2667 (unsigned char)s[1] > 0x9F)) {
2668 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002669 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002670 endinpos = startinpos + 1;
2671
2672 /* if s[1] first two bits are 1 and 0, then the invalid
2673 continuation byte is s[2], so increment endinpos by 1,
2674 if not, s[1] is invalid and endinpos doesn't need to
2675 be incremented. */
2676 if ((s[1] & 0xC0) == 0x80)
2677 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002678 goto utf8Error;
2679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002681 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2682 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002683 break;
2684
2685 case 4:
2686 if ((s[1] & 0xc0) != 0x80 ||
2687 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002688 (s[3] & 0xc0) != 0x80 ||
2689 ((unsigned char)s[0] == 0xF0 &&
2690 (unsigned char)s[1] < 0x90) ||
2691 ((unsigned char)s[0] == 0xF4 &&
2692 (unsigned char)s[1] > 0x8F)) {
2693 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002695 endinpos = startinpos + 1;
2696 if ((s[1] & 0xC0) == 0x80) {
2697 endinpos++;
2698 if ((s[2] & 0xC0) == 0x80)
2699 endinpos++;
2700 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002701 goto utf8Error;
2702 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002703 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002704 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2705 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2706
Fredrik Lundh8f455852001-06-27 18:59:43 +00002707#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002708 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002709#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002710 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002711
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002712 /* translate from 10000..10FFFF to 0..FFFF */
2713 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002714
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002715 /* high surrogate = top 10 bits added to D800 */
2716 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002717
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002718 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002719 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002720#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 }
2723 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002724 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002725
Benjamin Peterson29060642009-01-31 22:14:21 +00002726 utf8Error:
2727 outpos = p-PyUnicode_AS_UNICODE(unicode);
2728 if (unicode_decode_call_errorhandler(
2729 errors, &errorHandler,
2730 "utf8", errmsg,
2731 &starts, &e, &startinpos, &endinpos, &exc, &s,
2732 &unicode, &outpos, &p))
2733 goto onError;
2734 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 }
Walter Dörwald69652032004-09-07 20:24:22 +00002736 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738
2739 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002740 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 goto onError;
2742
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 Py_XDECREF(errorHandler);
2744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 return (PyObject *)unicode;
2746
Benjamin Peterson29060642009-01-31 22:14:21 +00002747 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 Py_XDECREF(errorHandler);
2749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 Py_DECREF(unicode);
2751 return NULL;
2752}
2753
Antoine Pitrouab868312009-01-10 15:40:25 +00002754#undef ASCII_CHAR_MASK
2755
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002756#ifdef __APPLE__
2757
2758/* Simplified UTF-8 decoder using surrogateescape error handler,
2759 used to decode the command line arguments on Mac OS X. */
2760
2761wchar_t*
2762_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2763{
2764 int n;
2765 const char *e;
2766 wchar_t *unicode, *p;
2767
2768 /* Note: size will always be longer than the resulting Unicode
2769 character count */
2770 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2771 PyErr_NoMemory();
2772 return NULL;
2773 }
2774 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2775 if (!unicode)
2776 return NULL;
2777
2778 /* Unpack UTF-8 encoded data */
2779 p = unicode;
2780 e = s + size;
2781 while (s < e) {
2782 Py_UCS4 ch = (unsigned char)*s;
2783
2784 if (ch < 0x80) {
2785 *p++ = (wchar_t)ch;
2786 s++;
2787 continue;
2788 }
2789
2790 n = utf8_code_length[ch];
2791 if (s + n > e) {
2792 goto surrogateescape;
2793 }
2794
2795 switch (n) {
2796 case 0:
2797 case 1:
2798 goto surrogateescape;
2799
2800 case 2:
2801 if ((s[1] & 0xc0) != 0x80)
2802 goto surrogateescape;
2803 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2804 assert ((ch > 0x007F) && (ch <= 0x07FF));
2805 *p++ = (wchar_t)ch;
2806 break;
2807
2808 case 3:
2809 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2810 will result in surrogates in range d800-dfff. Surrogates are
2811 not valid UTF-8 so they are rejected.
2812 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2813 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2814 if ((s[1] & 0xc0) != 0x80 ||
2815 (s[2] & 0xc0) != 0x80 ||
2816 ((unsigned char)s[0] == 0xE0 &&
2817 (unsigned char)s[1] < 0xA0) ||
2818 ((unsigned char)s[0] == 0xED &&
2819 (unsigned char)s[1] > 0x9F)) {
2820
2821 goto surrogateescape;
2822 }
2823 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2824 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2825 *p++ = (Py_UNICODE)ch;
2826 break;
2827
2828 case 4:
2829 if ((s[1] & 0xc0) != 0x80 ||
2830 (s[2] & 0xc0) != 0x80 ||
2831 (s[3] & 0xc0) != 0x80 ||
2832 ((unsigned char)s[0] == 0xF0 &&
2833 (unsigned char)s[1] < 0x90) ||
2834 ((unsigned char)s[0] == 0xF4 &&
2835 (unsigned char)s[1] > 0x8F)) {
2836 goto surrogateescape;
2837 }
2838 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2839 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2840 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2841
2842#if SIZEOF_WCHAR_T == 4
2843 *p++ = (wchar_t)ch;
2844#else
2845 /* compute and append the two surrogates: */
2846
2847 /* translate from 10000..10FFFF to 0..FFFF */
2848 ch -= 0x10000;
2849
2850 /* high surrogate = top 10 bits added to D800 */
2851 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2852
2853 /* low surrogate = bottom 10 bits added to DC00 */
2854 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2855#endif
2856 break;
2857 }
2858 s += n;
2859 continue;
2860
2861 surrogateescape:
2862 *p++ = 0xDC00 + ch;
2863 s++;
2864 }
2865 *p = L'\0';
2866 return unicode;
2867}
2868
2869#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002870
Tim Peters602f7402002-04-27 18:03:26 +00002871/* Allocation strategy: if the string is short, convert into a stack buffer
2872 and allocate exactly as much space needed at the end. Else allocate the
2873 maximum possible needed (4 result bytes per Unicode character), and return
2874 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002875*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002876PyObject *
2877PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 Py_ssize_t size,
2879 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880{
Tim Peters602f7402002-04-27 18:03:26 +00002881#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002882
Guido van Rossum98297ee2007-11-06 21:34:58 +00002883 Py_ssize_t i; /* index into s of next input byte */
2884 PyObject *result; /* result string object */
2885 char *p; /* next free byte in output buffer */
2886 Py_ssize_t nallocated; /* number of result bytes allocated */
2887 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002888 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002889 PyObject *errorHandler = NULL;
2890 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002891
Tim Peters602f7402002-04-27 18:03:26 +00002892 assert(s != NULL);
2893 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894
Tim Peters602f7402002-04-27 18:03:26 +00002895 if (size <= MAX_SHORT_UNICHARS) {
2896 /* Write into the stack buffer; nallocated can't overflow.
2897 * At the end, we'll allocate exactly as much heap space as it
2898 * turns out we need.
2899 */
2900 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002901 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002902 p = stackbuf;
2903 }
2904 else {
2905 /* Overallocate on the heap, and give the excess back at the end. */
2906 nallocated = size * 4;
2907 if (nallocated / 4 != size) /* overflow! */
2908 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002909 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002910 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002911 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002912 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002913 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002914
Tim Peters602f7402002-04-27 18:03:26 +00002915 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002916 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002917
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002918 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002919 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002921
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002923 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002924 *p++ = (char)(0xc0 | (ch >> 6));
2925 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002926 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002927#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002928 /* Special case: check for high and low surrogate */
2929 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2930 Py_UCS4 ch2 = s[i];
2931 /* Combine the two surrogates to form a UCS4 value */
2932 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2933 i++;
2934
2935 /* Encode UCS4 Unicode ordinals */
2936 *p++ = (char)(0xf0 | (ch >> 18));
2937 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002938 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2939 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002940 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002941#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002942 Py_ssize_t newpos;
2943 PyObject *rep;
2944 Py_ssize_t repsize, k;
2945 rep = unicode_encode_call_errorhandler
2946 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2947 s, size, &exc, i-1, i, &newpos);
2948 if (!rep)
2949 goto error;
2950
2951 if (PyBytes_Check(rep))
2952 repsize = PyBytes_GET_SIZE(rep);
2953 else
2954 repsize = PyUnicode_GET_SIZE(rep);
2955
2956 if (repsize > 4) {
2957 Py_ssize_t offset;
2958
2959 if (result == NULL)
2960 offset = p - stackbuf;
2961 else
2962 offset = p - PyBytes_AS_STRING(result);
2963
2964 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2965 /* integer overflow */
2966 PyErr_NoMemory();
2967 goto error;
2968 }
2969 nallocated += repsize - 4;
2970 if (result != NULL) {
2971 if (_PyBytes_Resize(&result, nallocated) < 0)
2972 goto error;
2973 } else {
2974 result = PyBytes_FromStringAndSize(NULL, nallocated);
2975 if (result == NULL)
2976 goto error;
2977 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2978 }
2979 p = PyBytes_AS_STRING(result) + offset;
2980 }
2981
2982 if (PyBytes_Check(rep)) {
2983 char *prep = PyBytes_AS_STRING(rep);
2984 for(k = repsize; k > 0; k--)
2985 *p++ = *prep++;
2986 } else /* rep is unicode */ {
2987 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2988 Py_UNICODE c;
2989
2990 for(k=0; k<repsize; k++) {
2991 c = prep[k];
2992 if (0x80 <= c) {
2993 raise_encode_exception(&exc, "utf-8", s, size,
2994 i-1, i, "surrogates not allowed");
2995 goto error;
2996 }
2997 *p++ = (char)prep[k];
2998 }
2999 }
3000 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003001#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003002 }
Victor Stinner445a6232010-04-22 20:01:57 +00003003#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003004 } else if (ch < 0x10000) {
3005 *p++ = (char)(0xe0 | (ch >> 12));
3006 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3007 *p++ = (char)(0x80 | (ch & 0x3f));
3008 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003009 /* Encode UCS4 Unicode ordinals */
3010 *p++ = (char)(0xf0 | (ch >> 18));
3011 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3012 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3013 *p++ = (char)(0x80 | (ch & 0x3f));
3014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003016
Guido van Rossum98297ee2007-11-06 21:34:58 +00003017 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003018 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003019 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003020 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003021 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003022 }
3023 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003024 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003025 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003026 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003027 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003028 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003029 Py_XDECREF(errorHandler);
3030 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003031 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003032 error:
3033 Py_XDECREF(errorHandler);
3034 Py_XDECREF(exc);
3035 Py_XDECREF(result);
3036 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003037
Tim Peters602f7402002-04-27 18:03:26 +00003038#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039}
3040
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3042{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 if (!PyUnicode_Check(unicode)) {
3044 PyErr_BadArgument();
3045 return NULL;
3046 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003047 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003048 PyUnicode_GET_SIZE(unicode),
3049 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050}
3051
Walter Dörwald41980ca2007-08-16 21:55:45 +00003052/* --- UTF-32 Codec ------------------------------------------------------- */
3053
3054PyObject *
3055PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 Py_ssize_t size,
3057 const char *errors,
3058 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003059{
3060 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3061}
3062
3063PyObject *
3064PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 Py_ssize_t size,
3066 const char *errors,
3067 int *byteorder,
3068 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003069{
3070 const char *starts = s;
3071 Py_ssize_t startinpos;
3072 Py_ssize_t endinpos;
3073 Py_ssize_t outpos;
3074 PyUnicodeObject *unicode;
3075 Py_UNICODE *p;
3076#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003077 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003078 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003079#else
3080 const int pairs = 0;
3081#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003082 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003083 int bo = 0; /* assume native ordering by default */
3084 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003085 /* Offsets from q for retrieving bytes in the right order. */
3086#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3087 int iorder[] = {0, 1, 2, 3};
3088#else
3089 int iorder[] = {3, 2, 1, 0};
3090#endif
3091 PyObject *errorHandler = NULL;
3092 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003093
Walter Dörwald41980ca2007-08-16 21:55:45 +00003094 q = (unsigned char *)s;
3095 e = q + size;
3096
3097 if (byteorder)
3098 bo = *byteorder;
3099
3100 /* Check for BOM marks (U+FEFF) in the input and adjust current
3101 byte order setting accordingly. In native mode, the leading BOM
3102 mark is skipped, in all other modes, it is copied to the output
3103 stream as-is (giving a ZWNBSP character). */
3104 if (bo == 0) {
3105 if (size >= 4) {
3106 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003107 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003108#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003109 if (bom == 0x0000FEFF) {
3110 q += 4;
3111 bo = -1;
3112 }
3113 else if (bom == 0xFFFE0000) {
3114 q += 4;
3115 bo = 1;
3116 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003117#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003118 if (bom == 0x0000FEFF) {
3119 q += 4;
3120 bo = 1;
3121 }
3122 else if (bom == 0xFFFE0000) {
3123 q += 4;
3124 bo = -1;
3125 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003126#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003128 }
3129
3130 if (bo == -1) {
3131 /* force LE */
3132 iorder[0] = 0;
3133 iorder[1] = 1;
3134 iorder[2] = 2;
3135 iorder[3] = 3;
3136 }
3137 else if (bo == 1) {
3138 /* force BE */
3139 iorder[0] = 3;
3140 iorder[1] = 2;
3141 iorder[2] = 1;
3142 iorder[3] = 0;
3143 }
3144
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003145 /* On narrow builds we split characters outside the BMP into two
3146 codepoints => count how much extra space we need. */
3147#ifndef Py_UNICODE_WIDE
3148 for (qq = q; qq < e; qq += 4)
3149 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3150 pairs++;
3151#endif
3152
3153 /* This might be one to much, because of a BOM */
3154 unicode = _PyUnicode_New((size+3)/4+pairs);
3155 if (!unicode)
3156 return NULL;
3157 if (size == 0)
3158 return (PyObject *)unicode;
3159
3160 /* Unpack UTF-32 encoded data */
3161 p = unicode->str;
3162
Walter Dörwald41980ca2007-08-16 21:55:45 +00003163 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 Py_UCS4 ch;
3165 /* remaining bytes at the end? (size should be divisible by 4) */
3166 if (e-q<4) {
3167 if (consumed)
3168 break;
3169 errmsg = "truncated data";
3170 startinpos = ((const char *)q)-starts;
3171 endinpos = ((const char *)e)-starts;
3172 goto utf32Error;
3173 /* The remaining input chars are ignored if the callback
3174 chooses to skip the input */
3175 }
3176 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3177 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003178
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 if (ch >= 0x110000)
3180 {
3181 errmsg = "codepoint not in range(0x110000)";
3182 startinpos = ((const char *)q)-starts;
3183 endinpos = startinpos+4;
3184 goto utf32Error;
3185 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003186#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 if (ch >= 0x10000)
3188 {
3189 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3190 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3191 }
3192 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003193#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003194 *p++ = ch;
3195 q += 4;
3196 continue;
3197 utf32Error:
3198 outpos = p-PyUnicode_AS_UNICODE(unicode);
3199 if (unicode_decode_call_errorhandler(
3200 errors, &errorHandler,
3201 "utf32", errmsg,
3202 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3203 &unicode, &outpos, &p))
3204 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003205 }
3206
3207 if (byteorder)
3208 *byteorder = bo;
3209
3210 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003211 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003212
3213 /* Adjust length */
3214 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3215 goto onError;
3216
3217 Py_XDECREF(errorHandler);
3218 Py_XDECREF(exc);
3219 return (PyObject *)unicode;
3220
Benjamin Peterson29060642009-01-31 22:14:21 +00003221 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003222 Py_DECREF(unicode);
3223 Py_XDECREF(errorHandler);
3224 Py_XDECREF(exc);
3225 return NULL;
3226}
3227
3228PyObject *
3229PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 Py_ssize_t size,
3231 const char *errors,
3232 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003233{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003234 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003235 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003236 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003237#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003238 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003239#else
3240 const int pairs = 0;
3241#endif
3242 /* Offsets from p for storing byte pairs in the right order. */
3243#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3244 int iorder[] = {0, 1, 2, 3};
3245#else
3246 int iorder[] = {3, 2, 1, 0};
3247#endif
3248
Benjamin Peterson29060642009-01-31 22:14:21 +00003249#define STORECHAR(CH) \
3250 do { \
3251 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3252 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3253 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3254 p[iorder[0]] = (CH) & 0xff; \
3255 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003256 } while(0)
3257
3258 /* In narrow builds we can output surrogate pairs as one codepoint,
3259 so we need less space. */
3260#ifndef Py_UNICODE_WIDE
3261 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3263 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3264 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003265#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003266 nsize = (size - pairs + (byteorder == 0));
3267 bytesize = nsize * 4;
3268 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003269 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003270 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003271 if (v == NULL)
3272 return NULL;
3273
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003274 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003275 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003276 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003277 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003278 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003279
3280 if (byteorder == -1) {
3281 /* force LE */
3282 iorder[0] = 0;
3283 iorder[1] = 1;
3284 iorder[2] = 2;
3285 iorder[3] = 3;
3286 }
3287 else if (byteorder == 1) {
3288 /* force BE */
3289 iorder[0] = 3;
3290 iorder[1] = 2;
3291 iorder[2] = 1;
3292 iorder[3] = 0;
3293 }
3294
3295 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003296 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3299 Py_UCS4 ch2 = *s;
3300 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3301 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3302 s++;
3303 size--;
3304 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003305 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003306#endif
3307 STORECHAR(ch);
3308 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003309
3310 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003311 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003312#undef STORECHAR
3313}
3314
3315PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3316{
3317 if (!PyUnicode_Check(unicode)) {
3318 PyErr_BadArgument();
3319 return NULL;
3320 }
3321 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 PyUnicode_GET_SIZE(unicode),
3323 NULL,
3324 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003325}
3326
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327/* --- UTF-16 Codec ------------------------------------------------------- */
3328
Tim Peters772747b2001-08-09 22:21:55 +00003329PyObject *
3330PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003331 Py_ssize_t size,
3332 const char *errors,
3333 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334{
Walter Dörwald69652032004-09-07 20:24:22 +00003335 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3336}
3337
Antoine Pitrouab868312009-01-10 15:40:25 +00003338/* Two masks for fast checking of whether a C 'long' may contain
3339 UTF16-encoded surrogate characters. This is an efficient heuristic,
3340 assuming that non-surrogate characters with a code point >= 0x8000 are
3341 rare in most input.
3342 FAST_CHAR_MASK is used when the input is in native byte ordering,
3343 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003344*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003345#if (SIZEOF_LONG == 8)
3346# define FAST_CHAR_MASK 0x8000800080008000L
3347# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3348#elif (SIZEOF_LONG == 4)
3349# define FAST_CHAR_MASK 0x80008000L
3350# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3351#else
3352# error C 'long' size should be either 4 or 8!
3353#endif
3354
Walter Dörwald69652032004-09-07 20:24:22 +00003355PyObject *
3356PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003357 Py_ssize_t size,
3358 const char *errors,
3359 int *byteorder,
3360 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003361{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003362 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003363 Py_ssize_t startinpos;
3364 Py_ssize_t endinpos;
3365 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 PyUnicodeObject *unicode;
3367 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003368 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003369 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003370 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003371 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003372 /* Offsets from q for retrieving byte pairs in the right order. */
3373#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3374 int ihi = 1, ilo = 0;
3375#else
3376 int ihi = 0, ilo = 1;
3377#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 PyObject *errorHandler = NULL;
3379 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003380
3381 /* Note: size will always be longer than the resulting Unicode
3382 character count */
3383 unicode = _PyUnicode_New(size);
3384 if (!unicode)
3385 return NULL;
3386 if (size == 0)
3387 return (PyObject *)unicode;
3388
3389 /* Unpack UTF-16 encoded data */
3390 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003391 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003392 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393
3394 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003395 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003397 /* Check for BOM marks (U+FEFF) in the input and adjust current
3398 byte order setting accordingly. In native mode, the leading BOM
3399 mark is skipped, in all other modes, it is copied to the output
3400 stream as-is (giving a ZWNBSP character). */
3401 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003402 if (size >= 2) {
3403 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003404#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003405 if (bom == 0xFEFF) {
3406 q += 2;
3407 bo = -1;
3408 }
3409 else if (bom == 0xFFFE) {
3410 q += 2;
3411 bo = 1;
3412 }
Tim Petersced69f82003-09-16 20:30:58 +00003413#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003414 if (bom == 0xFEFF) {
3415 q += 2;
3416 bo = 1;
3417 }
3418 else if (bom == 0xFFFE) {
3419 q += 2;
3420 bo = -1;
3421 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003422#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003423 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425
Tim Peters772747b2001-08-09 22:21:55 +00003426 if (bo == -1) {
3427 /* force LE */
3428 ihi = 1;
3429 ilo = 0;
3430 }
3431 else if (bo == 1) {
3432 /* force BE */
3433 ihi = 0;
3434 ilo = 1;
3435 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003436#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3437 native_ordering = ilo < ihi;
3438#else
3439 native_ordering = ilo > ihi;
3440#endif
Tim Peters772747b2001-08-09 22:21:55 +00003441
Antoine Pitrouab868312009-01-10 15:40:25 +00003442 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003443 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003444 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003445 /* First check for possible aligned read of a C 'long'. Unaligned
3446 reads are more expensive, better to defer to another iteration. */
3447 if (!((size_t) q & LONG_PTR_MASK)) {
3448 /* Fast path for runs of non-surrogate chars. */
3449 register const unsigned char *_q = q;
3450 Py_UNICODE *_p = p;
3451 if (native_ordering) {
3452 /* Native ordering is simple: as long as the input cannot
3453 possibly contain a surrogate char, do an unrolled copy
3454 of several 16-bit code points to the target object.
3455 The non-surrogate check is done on several input bytes
3456 at a time (as many as a C 'long' can contain). */
3457 while (_q < aligned_end) {
3458 unsigned long data = * (unsigned long *) _q;
3459 if (data & FAST_CHAR_MASK)
3460 break;
3461 _p[0] = ((unsigned short *) _q)[0];
3462 _p[1] = ((unsigned short *) _q)[1];
3463#if (SIZEOF_LONG == 8)
3464 _p[2] = ((unsigned short *) _q)[2];
3465 _p[3] = ((unsigned short *) _q)[3];
3466#endif
3467 _q += SIZEOF_LONG;
3468 _p += SIZEOF_LONG / 2;
3469 }
3470 }
3471 else {
3472 /* Byteswapped ordering is similar, but we must decompose
3473 the copy bytewise, and take care of zero'ing out the
3474 upper bytes if the target object is in 32-bit units
3475 (that is, in UCS-4 builds). */
3476 while (_q < aligned_end) {
3477 unsigned long data = * (unsigned long *) _q;
3478 if (data & SWAPPED_FAST_CHAR_MASK)
3479 break;
3480 /* Zero upper bytes in UCS-4 builds */
3481#if (Py_UNICODE_SIZE > 2)
3482 _p[0] = 0;
3483 _p[1] = 0;
3484#if (SIZEOF_LONG == 8)
3485 _p[2] = 0;
3486 _p[3] = 0;
3487#endif
3488#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003489 /* Issue #4916; UCS-4 builds on big endian machines must
3490 fill the two last bytes of each 4-byte unit. */
3491#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3492# define OFF 2
3493#else
3494# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003495#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003496 ((unsigned char *) _p)[OFF + 1] = _q[0];
3497 ((unsigned char *) _p)[OFF + 0] = _q[1];
3498 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3499 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3500#if (SIZEOF_LONG == 8)
3501 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3502 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3503 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3504 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3505#endif
3506#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003507 _q += SIZEOF_LONG;
3508 _p += SIZEOF_LONG / 2;
3509 }
3510 }
3511 p = _p;
3512 q = _q;
3513 if (q >= e)
3514 break;
3515 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003516 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517
Benjamin Peterson14339b62009-01-31 16:36:08 +00003518 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003519
3520 if (ch < 0xD800 || ch > 0xDFFF) {
3521 *p++ = ch;
3522 continue;
3523 }
3524
3525 /* UTF-16 code pair: */
3526 if (q > e) {
3527 errmsg = "unexpected end of data";
3528 startinpos = (((const char *)q) - 2) - starts;
3529 endinpos = ((const char *)e) + 1 - starts;
3530 goto utf16Error;
3531 }
3532 if (0xD800 <= ch && ch <= 0xDBFF) {
3533 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3534 q += 2;
3535 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003536#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 *p++ = ch;
3538 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003539#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003540 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003541#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003542 continue;
3543 }
3544 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003545 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003546 startinpos = (((const char *)q)-4)-starts;
3547 endinpos = startinpos+2;
3548 goto utf16Error;
3549 }
3550
Benjamin Peterson14339b62009-01-31 16:36:08 +00003551 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 errmsg = "illegal encoding";
3553 startinpos = (((const char *)q)-2)-starts;
3554 endinpos = startinpos+2;
3555 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003556
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 utf16Error:
3558 outpos = p - PyUnicode_AS_UNICODE(unicode);
3559 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003560 errors,
3561 &errorHandler,
3562 "utf16", errmsg,
3563 &starts,
3564 (const char **)&e,
3565 &startinpos,
3566 &endinpos,
3567 &exc,
3568 (const char **)&q,
3569 &unicode,
3570 &outpos,
3571 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003572 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003574 /* remaining byte at the end? (size should be even) */
3575 if (e == q) {
3576 if (!consumed) {
3577 errmsg = "truncated data";
3578 startinpos = ((const char *)q) - starts;
3579 endinpos = ((const char *)e) + 1 - starts;
3580 outpos = p - PyUnicode_AS_UNICODE(unicode);
3581 if (unicode_decode_call_errorhandler(
3582 errors,
3583 &errorHandler,
3584 "utf16", errmsg,
3585 &starts,
3586 (const char **)&e,
3587 &startinpos,
3588 &endinpos,
3589 &exc,
3590 (const char **)&q,
3591 &unicode,
3592 &outpos,
3593 &p))
3594 goto onError;
3595 /* The remaining input chars are ignored if the callback
3596 chooses to skip the input */
3597 }
3598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599
3600 if (byteorder)
3601 *byteorder = bo;
3602
Walter Dörwald69652032004-09-07 20:24:22 +00003603 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003604 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003605
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003607 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 goto onError;
3609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 Py_XDECREF(errorHandler);
3611 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 return (PyObject *)unicode;
3613
Benjamin Peterson29060642009-01-31 22:14:21 +00003614 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 Py_XDECREF(errorHandler);
3617 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 return NULL;
3619}
3620
Antoine Pitrouab868312009-01-10 15:40:25 +00003621#undef FAST_CHAR_MASK
3622#undef SWAPPED_FAST_CHAR_MASK
3623
Tim Peters772747b2001-08-09 22:21:55 +00003624PyObject *
3625PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003626 Py_ssize_t size,
3627 const char *errors,
3628 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003630 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003631 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003632 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003633#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003634 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003635#else
3636 const int pairs = 0;
3637#endif
Tim Peters772747b2001-08-09 22:21:55 +00003638 /* Offsets from p for storing byte pairs in the right order. */
3639#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3640 int ihi = 1, ilo = 0;
3641#else
3642 int ihi = 0, ilo = 1;
3643#endif
3644
Benjamin Peterson29060642009-01-31 22:14:21 +00003645#define STORECHAR(CH) \
3646 do { \
3647 p[ihi] = ((CH) >> 8) & 0xff; \
3648 p[ilo] = (CH) & 0xff; \
3649 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003650 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003652#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003653 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003654 if (s[i] >= 0x10000)
3655 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003656#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003657 /* 2 * (size + pairs + (byteorder == 0)) */
3658 if (size > PY_SSIZE_T_MAX ||
3659 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003661 nsize = size + pairs + (byteorder == 0);
3662 bytesize = nsize * 2;
3663 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003665 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 if (v == NULL)
3667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003669 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003672 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003673 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003674
3675 if (byteorder == -1) {
3676 /* force LE */
3677 ihi = 1;
3678 ilo = 0;
3679 }
3680 else if (byteorder == 1) {
3681 /* force BE */
3682 ihi = 0;
3683 ilo = 1;
3684 }
3685
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003686 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 Py_UNICODE ch = *s++;
3688 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003689#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003690 if (ch >= 0x10000) {
3691 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3692 ch = 0xD800 | ((ch-0x10000) >> 10);
3693 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003694#endif
Tim Peters772747b2001-08-09 22:21:55 +00003695 STORECHAR(ch);
3696 if (ch2)
3697 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003698 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003699
3700 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003701 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003702#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703}
3704
3705PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3706{
3707 if (!PyUnicode_Check(unicode)) {
3708 PyErr_BadArgument();
3709 return NULL;
3710 }
3711 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 PyUnicode_GET_SIZE(unicode),
3713 NULL,
3714 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715}
3716
3717/* --- Unicode Escape Codec ----------------------------------------------- */
3718
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably the Unicode character-name lookup API, bound
   on first use by the escape decoder below -- confirm against the code
   that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003720
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003722 Py_ssize_t size,
3723 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003726 Py_ssize_t startinpos;
3727 Py_ssize_t endinpos;
3728 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003732 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003733 char* message;
3734 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003735 PyObject *errorHandler = NULL;
3736 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003737
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 /* Escaped strings will always be longer than the resulting
3739 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 length after conversion to the true value.
3741 (but if the error callback returns a long replacement string
3742 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 v = _PyUnicode_New(size);
3744 if (v == NULL)
3745 goto onError;
3746 if (size == 0)
3747 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003748
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003751
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 while (s < end) {
3753 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003754 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756
3757 /* Non-escape characters are interpreted as Unicode ordinals */
3758 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003759 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 continue;
3761 }
3762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 /* \ - Escapes */
3765 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003766 c = *s++;
3767 if (s > end)
3768 c = '\0'; /* Invalid after \ */
3769 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770
Benjamin Peterson29060642009-01-31 22:14:21 +00003771 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 case '\n': break;
3773 case '\\': *p++ = '\\'; break;
3774 case '\'': *p++ = '\''; break;
3775 case '\"': *p++ = '\"'; break;
3776 case 'b': *p++ = '\b'; break;
3777 case 'f': *p++ = '\014'; break; /* FF */
3778 case 't': *p++ = '\t'; break;
3779 case 'n': *p++ = '\n'; break;
3780 case 'r': *p++ = '\r'; break;
3781 case 'v': *p++ = '\013'; break; /* VT */
3782 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3783
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 case '0': case '1': case '2': case '3':
3786 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003787 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003788 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003789 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003790 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003791 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003793 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794 break;
3795
Benjamin Peterson29060642009-01-31 22:14:21 +00003796 /* hex escapes */
3797 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003799 digits = 2;
3800 message = "truncated \\xXX escape";
3801 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802
Benjamin Peterson29060642009-01-31 22:14:21 +00003803 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003805 digits = 4;
3806 message = "truncated \\uXXXX escape";
3807 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808
Benjamin Peterson29060642009-01-31 22:14:21 +00003809 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003810 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003811 digits = 8;
3812 message = "truncated \\UXXXXXXXX escape";
3813 hexescape:
3814 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 outpos = p-PyUnicode_AS_UNICODE(v);
3816 if (s+digits>end) {
3817 endinpos = size;
3818 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 errors, &errorHandler,
3820 "unicodeescape", "end of string in escape sequence",
3821 &starts, &end, &startinpos, &endinpos, &exc, &s,
3822 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 goto onError;
3824 goto nextByte;
3825 }
3826 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003827 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003828 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829 endinpos = (s+i+1)-starts;
3830 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003831 errors, &errorHandler,
3832 "unicodeescape", message,
3833 &starts, &end, &startinpos, &endinpos, &exc, &s,
3834 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003835 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003837 }
3838 chr = (chr<<4) & ~0xF;
3839 if (c >= '0' && c <= '9')
3840 chr += c - '0';
3841 else if (c >= 'a' && c <= 'f')
3842 chr += 10 + c - 'a';
3843 else
3844 chr += 10 + c - 'A';
3845 }
3846 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003847 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848 /* _decoding_error will have already written into the
3849 target buffer. */
3850 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003851 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003852 /* when we get here, chr is a 32-bit unicode character */
3853 if (chr <= 0xffff)
3854 /* UCS-2 character */
3855 *p++ = (Py_UNICODE) chr;
3856 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003857 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003858 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003859#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003860 *p++ = chr;
3861#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003862 chr -= 0x10000L;
3863 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003864 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003865#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003866 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 endinpos = s-starts;
3868 outpos = p-PyUnicode_AS_UNICODE(v);
3869 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003870 errors, &errorHandler,
3871 "unicodeescape", "illegal Unicode character",
3872 &starts, &end, &startinpos, &endinpos, &exc, &s,
3873 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003874 goto onError;
3875 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003876 break;
3877
Benjamin Peterson29060642009-01-31 22:14:21 +00003878 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003879 case 'N':
3880 message = "malformed \\N character escape";
3881 if (ucnhash_CAPI == NULL) {
3882 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003883 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003884 if (ucnhash_CAPI == NULL)
3885 goto ucnhashError;
3886 }
3887 if (*s == '{') {
3888 const char *start = s+1;
3889 /* look for the closing brace */
3890 while (*s != '}' && s < end)
3891 s++;
3892 if (s > start && s < end && *s == '}') {
3893 /* found a name. look it up in the unicode database */
3894 message = "unknown Unicode character name";
3895 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003896 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003897 goto store;
3898 }
3899 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003900 endinpos = s-starts;
3901 outpos = p-PyUnicode_AS_UNICODE(v);
3902 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003903 errors, &errorHandler,
3904 "unicodeescape", message,
3905 &starts, &end, &startinpos, &endinpos, &exc, &s,
3906 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003907 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003908 break;
3909
3910 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003911 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 message = "\\ at end of string";
3913 s--;
3914 endinpos = s-starts;
3915 outpos = p-PyUnicode_AS_UNICODE(v);
3916 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 errors, &errorHandler,
3918 "unicodeescape", message,
3919 &starts, &end, &startinpos, &endinpos, &exc, &s,
3920 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003921 goto onError;
3922 }
3923 else {
3924 *p++ = '\\';
3925 *p++ = (unsigned char)s[-1];
3926 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003927 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003932 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003934 Py_XDECREF(errorHandler);
3935 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003937
Benjamin Peterson29060642009-01-31 22:14:21 +00003938 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003939 PyErr_SetString(
3940 PyExc_UnicodeError,
3941 "\\N escapes not supported (can't load unicodedata module)"
3942 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003943 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 Py_XDECREF(errorHandler);
3945 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003946 return NULL;
3947
Benjamin Peterson29060642009-01-31 22:14:21 +00003948 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003950 Py_XDECREF(errorHandler);
3951 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 return NULL;
3953}
3954
/* Return a Unicode-Escape string version of the Unicode object.

   The result is the bare escaped byte string; it is not enclosed in
   quotes (the encoder takes no quoting argument).

*/
3961
Thomas Wouters477c8d52006-05-27 19:21:47 +00003962Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 Py_ssize_t size,
3964 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003965{
3966 /* like wcschr, but doesn't stop at NULL characters */
3967
3968 while (size-- > 0) {
3969 if (*s == ch)
3970 return s;
3971 s++;
3972 }
3973
3974 return NULL;
3975}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003976
Walter Dörwald79e913e2007-05-12 11:08:06 +00003977static const char *hexdigits = "0123456789abcdef";
3978
3979PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003982 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003985#ifdef Py_UNICODE_WIDE
3986 const Py_ssize_t expandsize = 10;
3987#else
3988 const Py_ssize_t expandsize = 6;
3989#endif
3990
Thomas Wouters89f507f2006-12-13 04:49:30 +00003991 /* XXX(nnorwitz): rather than over-allocating, it would be
3992 better to choose a different scheme. Perhaps scan the
3993 first N-chars of the string and allocate based on that size.
3994 */
3995 /* Initial allocation is based on the longest-possible unichr
3996 escape.
3997
3998 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3999 unichr, so in this case it's the longest unichr escape. In
4000 narrow (UTF-16) builds this is five chars per source unichr
4001 since there are two unichrs in the surrogate pair, so in narrow
4002 (UTF-16) builds it's not the longest unichr escape.
4003
4004 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4005 so in the narrow (UTF-16) build case it's the longest unichr
4006 escape.
4007 */
4008
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004009 if (size == 0)
4010 return PyBytes_FromStringAndSize(NULL, 0);
4011
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004012 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004013 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004014
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004015 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004016 2
4017 + expandsize*size
4018 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 if (repr == NULL)
4020 return NULL;
4021
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004022 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 while (size-- > 0) {
4025 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004026
Walter Dörwald79e913e2007-05-12 11:08:06 +00004027 /* Escape backslashes */
4028 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 *p++ = '\\';
4030 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004031 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004032 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004033
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004034#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004035 /* Map 21-bit characters to '\U00xxxxxx' */
4036 else if (ch >= 0x10000) {
4037 *p++ = '\\';
4038 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004039 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4040 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4041 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4042 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4043 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4044 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4045 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4046 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004048 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004049#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4051 else if (ch >= 0xD800 && ch < 0xDC00) {
4052 Py_UNICODE ch2;
4053 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004054
Benjamin Peterson29060642009-01-31 22:14:21 +00004055 ch2 = *s++;
4056 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004057 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004058 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4059 *p++ = '\\';
4060 *p++ = 'U';
4061 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4062 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4063 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4064 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4065 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4066 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4067 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4068 *p++ = hexdigits[ucs & 0x0000000F];
4069 continue;
4070 }
4071 /* Fall through: isolated surrogates are copied as-is */
4072 s--;
4073 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004074 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004075#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004078 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 *p++ = '\\';
4080 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004081 *p++ = hexdigits[(ch >> 12) & 0x000F];
4082 *p++ = hexdigits[(ch >> 8) & 0x000F];
4083 *p++ = hexdigits[(ch >> 4) & 0x000F];
4084 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004086
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004087 /* Map special whitespace to '\t', \n', '\r' */
4088 else if (ch == '\t') {
4089 *p++ = '\\';
4090 *p++ = 't';
4091 }
4092 else if (ch == '\n') {
4093 *p++ = '\\';
4094 *p++ = 'n';
4095 }
4096 else if (ch == '\r') {
4097 *p++ = '\\';
4098 *p++ = 'r';
4099 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004100
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004101 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004102 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004104 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004105 *p++ = hexdigits[(ch >> 4) & 0x000F];
4106 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004107 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004108
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 /* Copy everything else as-is */
4110 else
4111 *p++ = (char) ch;
4112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004114 assert(p - PyBytes_AS_STRING(repr) > 0);
4115 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4116 return NULL;
4117 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118}
4119
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004120PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004122 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 if (!PyUnicode_Check(unicode)) {
4124 PyErr_BadArgument();
4125 return NULL;
4126 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004127 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4128 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004129 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130}
4131
4132/* --- Raw Unicode Escape Codec ------------------------------------------- */
4133
4134PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 Py_ssize_t size,
4136 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004139 Py_ssize_t startinpos;
4140 Py_ssize_t endinpos;
4141 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 const char *end;
4145 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 PyObject *errorHandler = NULL;
4147 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004148
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149 /* Escaped strings will always be longer than the resulting
4150 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151 length after conversion to the true value. (But decoding error
4152 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 v = _PyUnicode_New(size);
4154 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004157 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 end = s + size;
4160 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 unsigned char c;
4162 Py_UCS4 x;
4163 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004164 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 /* Non-escape characters are interpreted as Unicode ordinals */
4167 if (*s != '\\') {
4168 *p++ = (unsigned char)*s++;
4169 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004170 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004171 startinpos = s-starts;
4172
4173 /* \u-escapes are only interpreted iff the number of leading
4174 backslashes if odd */
4175 bs = s;
4176 for (;s < end;) {
4177 if (*s != '\\')
4178 break;
4179 *p++ = (unsigned char)*s++;
4180 }
4181 if (((s - bs) & 1) == 0 ||
4182 s >= end ||
4183 (*s != 'u' && *s != 'U')) {
4184 continue;
4185 }
4186 p--;
4187 count = *s=='u' ? 4 : 8;
4188 s++;
4189
4190 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4191 outpos = p-PyUnicode_AS_UNICODE(v);
4192 for (x = 0, i = 0; i < count; ++i, ++s) {
4193 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004194 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 endinpos = s-starts;
4196 if (unicode_decode_call_errorhandler(
4197 errors, &errorHandler,
4198 "rawunicodeescape", "truncated \\uXXXX",
4199 &starts, &end, &startinpos, &endinpos, &exc, &s,
4200 &v, &outpos, &p))
4201 goto onError;
4202 goto nextByte;
4203 }
4204 x = (x<<4) & ~0xF;
4205 if (c >= '0' && c <= '9')
4206 x += c - '0';
4207 else if (c >= 'a' && c <= 'f')
4208 x += 10 + c - 'a';
4209 else
4210 x += 10 + c - 'A';
4211 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004212 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 /* UCS-2 character */
4214 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004215 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 /* UCS-4 character. Either store directly, or as
4217 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004218#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004219 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004220#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 x -= 0x10000L;
4222 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4223 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004224#endif
4225 } else {
4226 endinpos = s-starts;
4227 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004228 if (unicode_decode_call_errorhandler(
4229 errors, &errorHandler,
4230 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 &starts, &end, &startinpos, &endinpos, &exc, &s,
4232 &v, &outpos, &p))
4233 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004234 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004235 nextByte:
4236 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004238 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 Py_XDECREF(errorHandler);
4241 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004243
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004246 Py_XDECREF(errorHandler);
4247 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248 return NULL;
4249}
4250
4251PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004254 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 char *p;
4256 char *q;
4257
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004258#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004259 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004260#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004261 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004262#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004263
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004264 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004266
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004267 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 if (repr == NULL)
4269 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004270 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004271 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004273 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274 while (size-- > 0) {
4275 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004276#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 /* Map 32-bit characters to '\Uxxxxxxxx' */
4278 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004279 *p++ = '\\';
4280 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004281 *p++ = hexdigits[(ch >> 28) & 0xf];
4282 *p++ = hexdigits[(ch >> 24) & 0xf];
4283 *p++ = hexdigits[(ch >> 20) & 0xf];
4284 *p++ = hexdigits[(ch >> 16) & 0xf];
4285 *p++ = hexdigits[(ch >> 12) & 0xf];
4286 *p++ = hexdigits[(ch >> 8) & 0xf];
4287 *p++ = hexdigits[(ch >> 4) & 0xf];
4288 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004289 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004290 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004291#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4293 if (ch >= 0xD800 && ch < 0xDC00) {
4294 Py_UNICODE ch2;
4295 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004296
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 ch2 = *s++;
4298 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004299 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4301 *p++ = '\\';
4302 *p++ = 'U';
4303 *p++ = hexdigits[(ucs >> 28) & 0xf];
4304 *p++ = hexdigits[(ucs >> 24) & 0xf];
4305 *p++ = hexdigits[(ucs >> 20) & 0xf];
4306 *p++ = hexdigits[(ucs >> 16) & 0xf];
4307 *p++ = hexdigits[(ucs >> 12) & 0xf];
4308 *p++ = hexdigits[(ucs >> 8) & 0xf];
4309 *p++ = hexdigits[(ucs >> 4) & 0xf];
4310 *p++ = hexdigits[ucs & 0xf];
4311 continue;
4312 }
4313 /* Fall through: isolated surrogates are copied as-is */
4314 s--;
4315 size++;
4316 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004317#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 /* Map 16-bit characters to '\uxxxx' */
4319 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 *p++ = '\\';
4321 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004322 *p++ = hexdigits[(ch >> 12) & 0xf];
4323 *p++ = hexdigits[(ch >> 8) & 0xf];
4324 *p++ = hexdigits[(ch >> 4) & 0xf];
4325 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 /* Copy everything else as-is */
4328 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 *p++ = (char) ch;
4330 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004331 size = p - q;
4332
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004333 assert(size > 0);
4334 if (_PyBytes_Resize(&repr, size) < 0)
4335 return NULL;
4336 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337}
4338
4339PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4340{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004341 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004343 PyErr_BadArgument();
4344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004346 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4347 PyUnicode_GET_SIZE(unicode));
4348
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004349 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350}
4351
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004352/* --- Unicode Internal Codec ------------------------------------------- */
4353
4354PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004355 Py_ssize_t size,
4356 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004357{
4358 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004359 Py_ssize_t startinpos;
4360 Py_ssize_t endinpos;
4361 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004362 PyUnicodeObject *v;
4363 Py_UNICODE *p;
4364 const char *end;
4365 const char *reason;
4366 PyObject *errorHandler = NULL;
4367 PyObject *exc = NULL;
4368
Neal Norwitzd43069c2006-01-08 01:12:10 +00004369#ifdef Py_UNICODE_WIDE
4370 Py_UNICODE unimax = PyUnicode_GetMax();
4371#endif
4372
Thomas Wouters89f507f2006-12-13 04:49:30 +00004373 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004374 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4375 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004377 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004379 p = PyUnicode_AS_UNICODE(v);
4380 end = s + size;
4381
4382 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004383 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004384 /* We have to sanity check the raw data, otherwise doom looms for
4385 some malformed UCS-4 data. */
4386 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004387#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004388 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004389#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004390 end-s < Py_UNICODE_SIZE
4391 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004393 startinpos = s - starts;
4394 if (end-s < Py_UNICODE_SIZE) {
4395 endinpos = end-starts;
4396 reason = "truncated input";
4397 }
4398 else {
4399 endinpos = s - starts + Py_UNICODE_SIZE;
4400 reason = "illegal code point (> 0x10FFFF)";
4401 }
4402 outpos = p - PyUnicode_AS_UNICODE(v);
4403 if (unicode_decode_call_errorhandler(
4404 errors, &errorHandler,
4405 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004406 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004407 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004408 goto onError;
4409 }
4410 }
4411 else {
4412 p++;
4413 s += Py_UNICODE_SIZE;
4414 }
4415 }
4416
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004417 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004418 goto onError;
4419 Py_XDECREF(errorHandler);
4420 Py_XDECREF(exc);
4421 return (PyObject *)v;
4422
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004424 Py_XDECREF(v);
4425 Py_XDECREF(errorHandler);
4426 Py_XDECREF(exc);
4427 return NULL;
4428}
4429
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430/* --- Latin-1 Codec ------------------------------------------------------ */
4431
4432PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 Py_ssize_t size,
4434 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435{
4436 PyUnicodeObject *v;
4437 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004438 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004439
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004441 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 Py_UNICODE r = *(unsigned char*)s;
4443 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004444 }
4445
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 v = _PyUnicode_New(size);
4447 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004452 e = s + size;
4453 /* Unrolling the copy makes it much faster by reducing the looping
4454 overhead. This is similar to what many memcpy() implementations do. */
4455 unrolled_end = e - 4;
4456 while (s < unrolled_end) {
4457 p[0] = (unsigned char) s[0];
4458 p[1] = (unsigned char) s[1];
4459 p[2] = (unsigned char) s[2];
4460 p[3] = (unsigned char) s[3];
4461 s += 4;
4462 p += 4;
4463 }
4464 while (s < e)
4465 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004467
Benjamin Peterson29060642009-01-31 22:14:21 +00004468 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 Py_XDECREF(v);
4470 return NULL;
4471}
4472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473/* create or adjust a UnicodeEncodeError */
4474static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 const char *encoding,
4476 const Py_UNICODE *unicode, Py_ssize_t size,
4477 Py_ssize_t startpos, Py_ssize_t endpos,
4478 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 *exceptionObject = PyUnicodeEncodeError_Create(
4482 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 }
4484 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4486 goto onError;
4487 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4488 goto onError;
4489 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4490 goto onError;
4491 return;
4492 onError:
4493 Py_DECREF(*exceptionObject);
4494 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 }
4496}
4497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498/* raises a UnicodeEncodeError */
4499static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 const char *encoding,
4501 const Py_UNICODE *unicode, Py_ssize_t size,
4502 Py_ssize_t startpos, Py_ssize_t endpos,
4503 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504{
4505 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509}
4510
4511/* error handling callback helper:
4512 build arguments, call the callback and check the arguments,
4513 put the result into newpos and return the replacement string, which
4514 has to be freed by the caller */
4515static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 PyObject **errorHandler,
4517 const char *encoding, const char *reason,
4518 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4519 Py_ssize_t startpos, Py_ssize_t endpos,
4520 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004522 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523
4524 PyObject *restuple;
4525 PyObject *resunicode;
4526
4527 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 }
4532
4533 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537
4538 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004543 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 Py_DECREF(restuple);
4545 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004547 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 &resunicode, newpos)) {
4549 Py_DECREF(restuple);
4550 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004552 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4553 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4554 Py_DECREF(restuple);
4555 return NULL;
4556 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004559 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4561 Py_DECREF(restuple);
4562 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004563 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 Py_INCREF(resunicode);
4565 Py_DECREF(restuple);
4566 return resunicode;
4567}
4568
4569static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 Py_ssize_t size,
4571 const char *errors,
4572 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573{
4574 /* output object */
4575 PyObject *res;
4576 /* pointers to the beginning and end+1 of input */
4577 const Py_UNICODE *startp = p;
4578 const Py_UNICODE *endp = p + size;
4579 /* pointer to the beginning of the unencodable characters */
4580 /* const Py_UNICODE *badp = NULL; */
4581 /* pointer into the output */
4582 char *str;
4583 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004584 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004585 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4586 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 PyObject *errorHandler = NULL;
4588 PyObject *exc = NULL;
4589 /* the following variable is used for caching string comparisons
4590 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4591 int known_errorHandler = -1;
4592
4593 /* allocate enough for a simple encoding without
4594 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004595 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004596 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004597 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004599 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004600 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 ressize = size;
4602
4603 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 /* can we encode this? */
4607 if (c<limit) {
4608 /* no overflow check, because we know that the space is enough */
4609 *str++ = (char)c;
4610 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004611 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 else {
4613 Py_ssize_t unicodepos = p-startp;
4614 Py_ssize_t requiredsize;
4615 PyObject *repunicode;
4616 Py_ssize_t repsize;
4617 Py_ssize_t newpos;
4618 Py_ssize_t respos;
4619 Py_UNICODE *uni2;
4620 /* startpos for collecting unencodable chars */
4621 const Py_UNICODE *collstart = p;
4622 const Py_UNICODE *collend = p;
4623 /* find all unecodable characters */
4624 while ((collend < endp) && ((*collend)>=limit))
4625 ++collend;
4626 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4627 if (known_errorHandler==-1) {
4628 if ((errors==NULL) || (!strcmp(errors, "strict")))
4629 known_errorHandler = 1;
4630 else if (!strcmp(errors, "replace"))
4631 known_errorHandler = 2;
4632 else if (!strcmp(errors, "ignore"))
4633 known_errorHandler = 3;
4634 else if (!strcmp(errors, "xmlcharrefreplace"))
4635 known_errorHandler = 4;
4636 else
4637 known_errorHandler = 0;
4638 }
4639 switch (known_errorHandler) {
4640 case 1: /* strict */
4641 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4642 goto onError;
4643 case 2: /* replace */
4644 while (collstart++<collend)
4645 *str++ = '?'; /* fall through */
4646 case 3: /* ignore */
4647 p = collend;
4648 break;
4649 case 4: /* xmlcharrefreplace */
4650 respos = str - PyBytes_AS_STRING(res);
4651 /* determine replacement size (temporarily (mis)uses p) */
4652 for (p = collstart, repsize = 0; p < collend; ++p) {
4653 if (*p<10)
4654 repsize += 2+1+1;
4655 else if (*p<100)
4656 repsize += 2+2+1;
4657 else if (*p<1000)
4658 repsize += 2+3+1;
4659 else if (*p<10000)
4660 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004661#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 else
4663 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004664#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 else if (*p<100000)
4666 repsize += 2+5+1;
4667 else if (*p<1000000)
4668 repsize += 2+6+1;
4669 else
4670 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004671#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 }
4673 requiredsize = respos+repsize+(endp-collend);
4674 if (requiredsize > ressize) {
4675 if (requiredsize<2*ressize)
4676 requiredsize = 2*ressize;
4677 if (_PyBytes_Resize(&res, requiredsize))
4678 goto onError;
4679 str = PyBytes_AS_STRING(res) + respos;
4680 ressize = requiredsize;
4681 }
4682 /* generate replacement (temporarily (mis)uses p) */
4683 for (p = collstart; p < collend; ++p) {
4684 str += sprintf(str, "&#%d;", (int)*p);
4685 }
4686 p = collend;
4687 break;
4688 default:
4689 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4690 encoding, reason, startp, size, &exc,
4691 collstart-startp, collend-startp, &newpos);
4692 if (repunicode == NULL)
4693 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004694 if (PyBytes_Check(repunicode)) {
4695 /* Directly copy bytes result to output. */
4696 repsize = PyBytes_Size(repunicode);
4697 if (repsize > 1) {
4698 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004699 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004700 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4701 Py_DECREF(repunicode);
4702 goto onError;
4703 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004704 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004705 ressize += repsize-1;
4706 }
4707 memcpy(str, PyBytes_AsString(repunicode), repsize);
4708 str += repsize;
4709 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004710 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004711 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004712 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004713 /* need more space? (at least enough for what we
4714 have+the replacement+the rest of the string, so
4715 we won't have to check space for encodable characters) */
4716 respos = str - PyBytes_AS_STRING(res);
4717 repsize = PyUnicode_GET_SIZE(repunicode);
4718 requiredsize = respos+repsize+(endp-collend);
4719 if (requiredsize > ressize) {
4720 if (requiredsize<2*ressize)
4721 requiredsize = 2*ressize;
4722 if (_PyBytes_Resize(&res, requiredsize)) {
4723 Py_DECREF(repunicode);
4724 goto onError;
4725 }
4726 str = PyBytes_AS_STRING(res) + respos;
4727 ressize = requiredsize;
4728 }
4729 /* check if there is anything unencodable in the replacement
4730 and copy it to the output */
4731 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4732 c = *uni2;
4733 if (c >= limit) {
4734 raise_encode_exception(&exc, encoding, startp, size,
4735 unicodepos, unicodepos+1, reason);
4736 Py_DECREF(repunicode);
4737 goto onError;
4738 }
4739 *str = (char)c;
4740 }
4741 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004742 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004744 }
4745 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004746 /* Resize if we allocated to much */
4747 size = str - PyBytes_AS_STRING(res);
4748 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004749 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004750 if (_PyBytes_Resize(&res, size) < 0)
4751 goto onError;
4752 }
4753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 Py_XDECREF(errorHandler);
4755 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004756 return res;
4757
4758 onError:
4759 Py_XDECREF(res);
4760 Py_XDECREF(errorHandler);
4761 Py_XDECREF(exc);
4762 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763}
4764
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 Py_ssize_t size,
4767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770}
4771
4772PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4773{
4774 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 PyErr_BadArgument();
4776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 }
4778 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 PyUnicode_GET_SIZE(unicode),
4780 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781}
4782
4783/* --- 7-bit ASCII Codec -------------------------------------------------- */
4784
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 Py_ssize_t size,
4787 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 PyUnicodeObject *v;
4791 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004792 Py_ssize_t startinpos;
4793 Py_ssize_t endinpos;
4794 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 const char *e;
4796 PyObject *errorHandler = NULL;
4797 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004798
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004800 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004801 Py_UNICODE r = *(unsigned char*)s;
4802 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004803 }
Tim Petersced69f82003-09-16 20:30:58 +00004804
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 v = _PyUnicode_New(size);
4806 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004807 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 e = s + size;
4812 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 register unsigned char c = (unsigned char)*s;
4814 if (c < 128) {
4815 *p++ = c;
4816 ++s;
4817 }
4818 else {
4819 startinpos = s-starts;
4820 endinpos = startinpos + 1;
4821 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4822 if (unicode_decode_call_errorhandler(
4823 errors, &errorHandler,
4824 "ascii", "ordinal not in range(128)",
4825 &starts, &e, &startinpos, &endinpos, &exc, &s,
4826 &v, &outpos, &p))
4827 goto onError;
4828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004830 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4832 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 Py_XDECREF(errorHandler);
4834 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004836
Benjamin Peterson29060642009-01-31 22:14:21 +00004837 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 Py_XDECREF(errorHandler);
4840 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 return NULL;
4842}
4843
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 Py_ssize_t size,
4846 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849}
4850
4851PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4852{
4853 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 PyErr_BadArgument();
4855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 }
4857 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 PyUnicode_GET_SIZE(unicode),
4859 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860}
4861
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004862#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004863
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004864/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004865
/* decode_mbcs() and encode_mbcs() take their input length as an int.
   When size_t is wider than int, the public MBCS entry points may be given
   more than INT_MAX items and then process the input in several passes
   (see their "retry" labels); NEED_RETRY enables that code. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif
4869
4870/* XXX This code is limited to "true" double-byte encodings, as
4871 a) it assumes an incomplete character consists of a single byte, and
4872 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004873 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004874
/* Return non-zero if the byte at s[offset] is a DBCS lead byte according
   to IsDBCSLeadByte() and is not itself the second half of a preceding
   two-byte character (checked by stepping back with CharPrev()). */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4885
4886/*
4887 * Decode MBCS string into unicode object. If 'final' is set, converts
4888 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4889 */
4890static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 const char *s, /* MBCS string */
4892 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004893 int final,
4894 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004895{
4896 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004897 Py_ssize_t n;
4898 DWORD usize;
4899 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004900
4901 assert(size >= 0);
4902
Victor Stinner554f3f02010-06-16 23:33:54 +00004903 /* check and handle 'errors' arg */
4904 if (errors==NULL || strcmp(errors, "strict")==0)
4905 flags = MB_ERR_INVALID_CHARS;
4906 else if (strcmp(errors, "ignore")==0)
4907 flags = 0;
4908 else {
4909 PyErr_Format(PyExc_ValueError,
4910 "mbcs encoding does not support errors='%s'",
4911 errors);
4912 return -1;
4913 }
4914
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004915 /* Skip trailing lead-byte unless 'final' is set */
4916 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004918
4919 /* First get the size of the result */
4920 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004921 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4922 if (usize==0)
4923 goto mbcs_decode_error;
4924 } else
4925 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004926
4927 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 /* Create unicode object */
4929 *v = _PyUnicode_New(usize);
4930 if (*v == NULL)
4931 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004932 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004933 }
4934 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 /* Extend unicode object */
4936 n = PyUnicode_GET_SIZE(*v);
4937 if (_PyUnicode_Resize(v, n + usize) < 0)
4938 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004939 }
4940
4941 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004942 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004944 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4945 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004947 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004948 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004949
4950mbcs_decode_error:
4951 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4952 we raise a UnicodeDecodeError - else it is a 'generic'
4953 windows error
4954 */
4955 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4956 /* Ideally, we should get reason from FormatMessage - this
4957 is the Windows 2000 English version of the message
4958 */
4959 PyObject *exc = NULL;
4960 const char *reason = "No mapping for the Unicode character exists "
4961 "in the target multi-byte code page.";
4962 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4963 if (exc != NULL) {
4964 PyCodec_StrictErrors(exc);
4965 Py_DECREF(exc);
4966 }
4967 } else {
4968 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4969 }
4970 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004971}
4972
4973PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 Py_ssize_t size,
4975 const char *errors,
4976 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004977{
4978 PyUnicodeObject *v = NULL;
4979 int done;
4980
4981 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004983
4984#ifdef NEED_RETRY
4985 retry:
4986 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004987 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004988 else
4989#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004990 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004991
4992 if (done < 0) {
4993 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004995 }
4996
4997 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004999
5000#ifdef NEED_RETRY
5001 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005002 s += done;
5003 size -= done;
5004 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005005 }
5006#endif
5007
5008 return (PyObject *)v;
5009}
5010
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005011PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 Py_ssize_t size,
5013 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005014{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005015 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5016}
5017
5018/*
5019 * Convert unicode into string object (MBCS).
5020 * Returns 0 if succeed, -1 otherwise.
5021 */
5022static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005024 int size, /* size of unicode */
5025 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005026{
Victor Stinner554f3f02010-06-16 23:33:54 +00005027 BOOL usedDefaultChar = FALSE;
5028 BOOL *pusedDefaultChar;
5029 int mbcssize;
5030 Py_ssize_t n;
5031 PyObject *exc = NULL;
5032 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005033
5034 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005035
Victor Stinner554f3f02010-06-16 23:33:54 +00005036 /* check and handle 'errors' arg */
5037 if (errors==NULL || strcmp(errors, "strict")==0) {
5038 flags = WC_NO_BEST_FIT_CHARS;
5039 pusedDefaultChar = &usedDefaultChar;
5040 } else if (strcmp(errors, "replace")==0) {
5041 flags = 0;
5042 pusedDefaultChar = NULL;
5043 } else {
5044 PyErr_Format(PyExc_ValueError,
5045 "mbcs encoding does not support errors='%s'",
5046 errors);
5047 return -1;
5048 }
5049
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005050 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005051 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005052 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5053 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 if (mbcssize == 0) {
5055 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5056 return -1;
5057 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005058 /* If we used a default char, then we failed! */
5059 if (pusedDefaultChar && *pusedDefaultChar)
5060 goto mbcs_encode_error;
5061 } else {
5062 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005063 }
5064
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005065 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 /* Create string object */
5067 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5068 if (*repr == NULL)
5069 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005070 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005071 }
5072 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 /* Extend string object */
5074 n = PyBytes_Size(*repr);
5075 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5076 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005077 }
5078
5079 /* Do the conversion */
5080 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005082 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5083 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5085 return -1;
5086 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005087 if (pusedDefaultChar && *pusedDefaultChar)
5088 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005089 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005090 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005091
5092mbcs_encode_error:
5093 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5094 Py_XDECREF(exc);
5095 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005096}
5097
5098PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 Py_ssize_t size,
5100 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005101{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005102 PyObject *repr = NULL;
5103 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005104
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005105#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005107 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005108 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005109 else
5110#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005111 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005112
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005113 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 Py_XDECREF(repr);
5115 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005116 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005117
5118#ifdef NEED_RETRY
5119 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 p += INT_MAX;
5121 size -= INT_MAX;
5122 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005123 }
5124#endif
5125
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005126 return repr;
5127}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005128
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005129PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5130{
5131 if (!PyUnicode_Check(unicode)) {
5132 PyErr_BadArgument();
5133 return NULL;
5134 }
5135 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 PyUnicode_GET_SIZE(unicode),
5137 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005138}
5139
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005140#undef NEED_RETRY
5141
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005142#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005143
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144/* --- Character Mapping Codec -------------------------------------------- */
5145
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 Py_ssize_t size,
5148 PyObject *mapping,
5149 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005151 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005152 Py_ssize_t startinpos;
5153 Py_ssize_t endinpos;
5154 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005155 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 PyUnicodeObject *v;
5157 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005158 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005159 PyObject *errorHandler = NULL;
5160 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005161 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005162 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005163
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 /* Default to Latin-1 */
5165 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167
5168 v = _PyUnicode_New(size);
5169 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005175 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 mapstring = PyUnicode_AS_UNICODE(mapping);
5177 maplen = PyUnicode_GET_SIZE(mapping);
5178 while (s < e) {
5179 unsigned char ch = *s;
5180 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 if (ch < maplen)
5183 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 if (x == 0xfffe) {
5186 /* undefined mapping */
5187 outpos = p-PyUnicode_AS_UNICODE(v);
5188 startinpos = s-starts;
5189 endinpos = startinpos+1;
5190 if (unicode_decode_call_errorhandler(
5191 errors, &errorHandler,
5192 "charmap", "character maps to <undefined>",
5193 &starts, &e, &startinpos, &endinpos, &exc, &s,
5194 &v, &outpos, &p)) {
5195 goto onError;
5196 }
5197 continue;
5198 }
5199 *p++ = x;
5200 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005201 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005202 }
5203 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 while (s < e) {
5205 unsigned char ch = *s;
5206 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005207
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5209 w = PyLong_FromLong((long)ch);
5210 if (w == NULL)
5211 goto onError;
5212 x = PyObject_GetItem(mapping, w);
5213 Py_DECREF(w);
5214 if (x == NULL) {
5215 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5216 /* No mapping found means: mapping is undefined. */
5217 PyErr_Clear();
5218 x = Py_None;
5219 Py_INCREF(x);
5220 } else
5221 goto onError;
5222 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005223
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 /* Apply mapping */
5225 if (PyLong_Check(x)) {
5226 long value = PyLong_AS_LONG(x);
5227 if (value < 0 || value > 65535) {
5228 PyErr_SetString(PyExc_TypeError,
5229 "character mapping must be in range(65536)");
5230 Py_DECREF(x);
5231 goto onError;
5232 }
5233 *p++ = (Py_UNICODE)value;
5234 }
5235 else if (x == Py_None) {
5236 /* undefined mapping */
5237 outpos = p-PyUnicode_AS_UNICODE(v);
5238 startinpos = s-starts;
5239 endinpos = startinpos+1;
5240 if (unicode_decode_call_errorhandler(
5241 errors, &errorHandler,
5242 "charmap", "character maps to <undefined>",
5243 &starts, &e, &startinpos, &endinpos, &exc, &s,
5244 &v, &outpos, &p)) {
5245 Py_DECREF(x);
5246 goto onError;
5247 }
5248 Py_DECREF(x);
5249 continue;
5250 }
5251 else if (PyUnicode_Check(x)) {
5252 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005253
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 if (targetsize == 1)
5255 /* 1-1 mapping */
5256 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005257
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 else if (targetsize > 1) {
5259 /* 1-n mapping */
5260 if (targetsize > extrachars) {
5261 /* resize first */
5262 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5263 Py_ssize_t needed = (targetsize - extrachars) + \
5264 (targetsize << 2);
5265 extrachars += needed;
5266 /* XXX overflow detection missing */
5267 if (_PyUnicode_Resize(&v,
5268 PyUnicode_GET_SIZE(v) + needed) < 0) {
5269 Py_DECREF(x);
5270 goto onError;
5271 }
5272 p = PyUnicode_AS_UNICODE(v) + oldpos;
5273 }
5274 Py_UNICODE_COPY(p,
5275 PyUnicode_AS_UNICODE(x),
5276 targetsize);
5277 p += targetsize;
5278 extrachars -= targetsize;
5279 }
5280 /* 1-0 mapping: skip the character */
5281 }
5282 else {
5283 /* wrong return value */
5284 PyErr_SetString(PyExc_TypeError,
5285 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005286 Py_DECREF(x);
5287 goto onError;
5288 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 Py_DECREF(x);
5290 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 }
5293 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5295 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 Py_XDECREF(errorHandler);
5297 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005299
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005301 Py_XDECREF(errorHandler);
5302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 Py_XDECREF(v);
5304 return NULL;
5305}
5306
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005307/* Charmap encoding: the lookup table */
5308
5309struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 PyObject_HEAD
5311 unsigned char level1[32];
5312 int count2, count3;
5313 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005314};
5315
5316static PyObject*
5317encoding_map_size(PyObject *obj, PyObject* args)
5318{
5319 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005320 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005322}
5323
5324static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005325 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 PyDoc_STR("Return the size (in bytes) of this object") },
5327 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005328};
5329
5330static void
5331encoding_map_dealloc(PyObject* o)
5332{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005333 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005334}
5335
5336static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005337 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 "EncodingMap", /*tp_name*/
5339 sizeof(struct encoding_map), /*tp_basicsize*/
5340 0, /*tp_itemsize*/
5341 /* methods */
5342 encoding_map_dealloc, /*tp_dealloc*/
5343 0, /*tp_print*/
5344 0, /*tp_getattr*/
5345 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005346 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 0, /*tp_repr*/
5348 0, /*tp_as_number*/
5349 0, /*tp_as_sequence*/
5350 0, /*tp_as_mapping*/
5351 0, /*tp_hash*/
5352 0, /*tp_call*/
5353 0, /*tp_str*/
5354 0, /*tp_getattro*/
5355 0, /*tp_setattro*/
5356 0, /*tp_as_buffer*/
5357 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5358 0, /*tp_doc*/
5359 0, /*tp_traverse*/
5360 0, /*tp_clear*/
5361 0, /*tp_richcompare*/
5362 0, /*tp_weaklistoffset*/
5363 0, /*tp_iter*/
5364 0, /*tp_iternext*/
5365 encoding_map_methods, /*tp_methods*/
5366 0, /*tp_members*/
5367 0, /*tp_getset*/
5368 0, /*tp_base*/
5369 0, /*tp_dict*/
5370 0, /*tp_descr_get*/
5371 0, /*tp_descr_set*/
5372 0, /*tp_dictoffset*/
5373 0, /*tp_init*/
5374 0, /*tp_alloc*/
5375 0, /*tp_new*/
5376 0, /*tp_free*/
5377 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005378};
5379
5380PyObject*
5381PyUnicode_BuildEncodingMap(PyObject* string)
5382{
5383 Py_UNICODE *decode;
5384 PyObject *result;
5385 struct encoding_map *mresult;
5386 int i;
5387 int need_dict = 0;
5388 unsigned char level1[32];
5389 unsigned char level2[512];
5390 unsigned char *mlevel1, *mlevel2, *mlevel3;
5391 int count2 = 0, count3 = 0;
5392
5393 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5394 PyErr_BadArgument();
5395 return NULL;
5396 }
5397 decode = PyUnicode_AS_UNICODE(string);
5398 memset(level1, 0xFF, sizeof level1);
5399 memset(level2, 0xFF, sizeof level2);
5400
5401 /* If there isn't a one-to-one mapping of NULL to \0,
5402 or if there are non-BMP characters, we need to use
5403 a mapping dictionary. */
5404 if (decode[0] != 0)
5405 need_dict = 1;
5406 for (i = 1; i < 256; i++) {
5407 int l1, l2;
5408 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005409#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005410 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005411#endif
5412 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005413 need_dict = 1;
5414 break;
5415 }
5416 if (decode[i] == 0xFFFE)
5417 /* unmapped character */
5418 continue;
5419 l1 = decode[i] >> 11;
5420 l2 = decode[i] >> 7;
5421 if (level1[l1] == 0xFF)
5422 level1[l1] = count2++;
5423 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005424 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005425 }
5426
5427 if (count2 >= 0xFF || count3 >= 0xFF)
5428 need_dict = 1;
5429
5430 if (need_dict) {
5431 PyObject *result = PyDict_New();
5432 PyObject *key, *value;
5433 if (!result)
5434 return NULL;
5435 for (i = 0; i < 256; i++) {
5436 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005437 key = PyLong_FromLong(decode[i]);
5438 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005439 if (!key || !value)
5440 goto failed1;
5441 if (PyDict_SetItem(result, key, value) == -1)
5442 goto failed1;
5443 Py_DECREF(key);
5444 Py_DECREF(value);
5445 }
5446 return result;
5447 failed1:
5448 Py_XDECREF(key);
5449 Py_XDECREF(value);
5450 Py_DECREF(result);
5451 return NULL;
5452 }
5453
5454 /* Create a three-level trie */
5455 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5456 16*count2 + 128*count3 - 1);
5457 if (!result)
5458 return PyErr_NoMemory();
5459 PyObject_Init(result, &EncodingMapType);
5460 mresult = (struct encoding_map*)result;
5461 mresult->count2 = count2;
5462 mresult->count3 = count3;
5463 mlevel1 = mresult->level1;
5464 mlevel2 = mresult->level23;
5465 mlevel3 = mresult->level23 + 16*count2;
5466 memcpy(mlevel1, level1, 32);
5467 memset(mlevel2, 0xFF, 16*count2);
5468 memset(mlevel3, 0, 128*count3);
5469 count3 = 0;
5470 for (i = 1; i < 256; i++) {
5471 int o1, o2, o3, i2, i3;
5472 if (decode[i] == 0xFFFE)
5473 /* unmapped character */
5474 continue;
5475 o1 = decode[i]>>11;
5476 o2 = (decode[i]>>7) & 0xF;
5477 i2 = 16*mlevel1[o1] + o2;
5478 if (mlevel2[i2] == 0xFF)
5479 mlevel2[i2] = count3++;
5480 o3 = decode[i] & 0x7F;
5481 i3 = 128*mlevel2[i2] + o3;
5482 mlevel3[i3] = i;
5483 }
5484 return result;
5485}
5486
5487static int
5488encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5489{
5490 struct encoding_map *map = (struct encoding_map*)mapping;
5491 int l1 = c>>11;
5492 int l2 = (c>>7) & 0xF;
5493 int l3 = c & 0x7F;
5494 int i;
5495
5496#ifdef Py_UNICODE_WIDE
5497 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005499 }
5500#endif
5501 if (c == 0)
5502 return 0;
5503 /* level 1*/
5504 i = map->level1[l1];
5505 if (i == 0xFF) {
5506 return -1;
5507 }
5508 /* level 2*/
5509 i = map->level23[16*i+l2];
5510 if (i == 0xFF) {
5511 return -1;
5512 }
5513 /* level 3 */
5514 i = map->level23[16*map->count2 + 128*i + l3];
5515 if (i == 0) {
5516 return -1;
5517 }
5518 return i;
5519}
5520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005521/* Lookup the character ch in the mapping. If the character
5522 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005523 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525{
Christian Heimes217cfd12007-12-02 14:31:20 +00005526 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005527 PyObject *x;
5528
5529 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531 x = PyObject_GetItem(mapping, w);
5532 Py_DECREF(w);
5533 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5535 /* No mapping found means: mapping is undefined. */
5536 PyErr_Clear();
5537 x = Py_None;
5538 Py_INCREF(x);
5539 return x;
5540 } else
5541 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005543 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005545 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 long value = PyLong_AS_LONG(x);
5547 if (value < 0 || value > 255) {
5548 PyErr_SetString(PyExc_TypeError,
5549 "character mapping must be in range(256)");
5550 Py_DECREF(x);
5551 return NULL;
5552 }
5553 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005555 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 /* wrong return value */
5559 PyErr_Format(PyExc_TypeError,
5560 "character mapping must return integer, bytes or None, not %.400s",
5561 x->ob_type->tp_name);
5562 Py_DECREF(x);
5563 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 }
5565}
5566
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005567static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005568charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005569{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005570 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5571 /* exponentially overallocate to minimize reallocations */
5572 if (requiredsize < 2*outsize)
5573 requiredsize = 2*outsize;
5574 if (_PyBytes_Resize(outobj, requiredsize))
5575 return -1;
5576 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005577}
5578
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for this character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005583 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005584 space is available. Return a new reference to the object that
5585 was put in the output buffer, or Py_None, if the mapping was undefined
5586 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005587 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005589charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005592 PyObject *rep;
5593 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005594 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005595
Christian Heimes90aa7642007-12-19 02:45:37 +00005596 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005597 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005599 if (res == -1)
5600 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 if (outsize<requiredsize)
5602 if (charmapencode_resize(outobj, outpos, requiredsize))
5603 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005604 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 outstart[(*outpos)++] = (char)res;
5606 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005607 }
5608
5609 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005610 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005612 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 Py_DECREF(rep);
5614 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005615 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 if (PyLong_Check(rep)) {
5617 Py_ssize_t requiredsize = *outpos+1;
5618 if (outsize<requiredsize)
5619 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5620 Py_DECREF(rep);
5621 return enc_EXCEPTION;
5622 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005623 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 else {
5627 const char *repchars = PyBytes_AS_STRING(rep);
5628 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5629 Py_ssize_t requiredsize = *outpos+repsize;
5630 if (outsize<requiredsize)
5631 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5632 Py_DECREF(rep);
5633 return enc_EXCEPTION;
5634 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005635 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 memcpy(outstart + *outpos, repchars, repsize);
5637 *outpos += repsize;
5638 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005640 Py_DECREF(rep);
5641 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642}
5643
5644/* handle an error in PyUnicode_EncodeCharmap
5645 Return 0 on success, -1 on error */
5646static
5647int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005648 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005649 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005650 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005651 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005652{
5653 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005654 Py_ssize_t repsize;
5655 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 Py_UNICODE *uni2;
5657 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005658 Py_ssize_t collstartpos = *inpos;
5659 Py_ssize_t collendpos = *inpos+1;
5660 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 char *encoding = "charmap";
5662 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005663 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665 /* find all unencodable characters */
5666 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005667 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005668 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 int res = encoding_map_lookup(p[collendpos], mapping);
5670 if (res != -1)
5671 break;
5672 ++collendpos;
5673 continue;
5674 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005675
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 rep = charmapencode_lookup(p[collendpos], mapping);
5677 if (rep==NULL)
5678 return -1;
5679 else if (rep!=Py_None) {
5680 Py_DECREF(rep);
5681 break;
5682 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005683 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 }
5686 /* cache callback name lookup
5687 * (if not done yet, i.e. it's the first error) */
5688 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 if ((errors==NULL) || (!strcmp(errors, "strict")))
5690 *known_errorHandler = 1;
5691 else if (!strcmp(errors, "replace"))
5692 *known_errorHandler = 2;
5693 else if (!strcmp(errors, "ignore"))
5694 *known_errorHandler = 3;
5695 else if (!strcmp(errors, "xmlcharrefreplace"))
5696 *known_errorHandler = 4;
5697 else
5698 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 }
5700 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005701 case 1: /* strict */
5702 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5703 return -1;
5704 case 2: /* replace */
5705 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 x = charmapencode_output('?', mapping, res, respos);
5707 if (x==enc_EXCEPTION) {
5708 return -1;
5709 }
5710 else if (x==enc_FAILED) {
5711 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5712 return -1;
5713 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005714 }
5715 /* fall through */
5716 case 3: /* ignore */
5717 *inpos = collendpos;
5718 break;
5719 case 4: /* xmlcharrefreplace */
5720 /* generate replacement (temporarily (mis)uses p) */
5721 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 char buffer[2+29+1+1];
5723 char *cp;
5724 sprintf(buffer, "&#%d;", (int)p[collpos]);
5725 for (cp = buffer; *cp; ++cp) {
5726 x = charmapencode_output(*cp, mapping, res, respos);
5727 if (x==enc_EXCEPTION)
5728 return -1;
5729 else if (x==enc_FAILED) {
5730 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5731 return -1;
5732 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005733 }
5734 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005735 *inpos = collendpos;
5736 break;
5737 default:
5738 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 encoding, reason, p, size, exceptionObject,
5740 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005741 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005743 if (PyBytes_Check(repunicode)) {
5744 /* Directly copy bytes result to output. */
5745 Py_ssize_t outsize = PyBytes_Size(*res);
5746 Py_ssize_t requiredsize;
5747 repsize = PyBytes_Size(repunicode);
5748 requiredsize = *respos + repsize;
5749 if (requiredsize > outsize)
5750 /* Make room for all additional bytes. */
5751 if (charmapencode_resize(res, respos, requiredsize)) {
5752 Py_DECREF(repunicode);
5753 return -1;
5754 }
5755 memcpy(PyBytes_AsString(*res) + *respos,
5756 PyBytes_AsString(repunicode), repsize);
5757 *respos += repsize;
5758 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005759 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005760 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005761 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005762 /* generate replacement */
5763 repsize = PyUnicode_GET_SIZE(repunicode);
5764 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 x = charmapencode_output(*uni2, mapping, res, respos);
5766 if (x==enc_EXCEPTION) {
5767 return -1;
5768 }
5769 else if (x==enc_FAILED) {
5770 Py_DECREF(repunicode);
5771 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5772 return -1;
5773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005774 }
5775 *inpos = newpos;
5776 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 }
5778 return 0;
5779}
5780
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 Py_ssize_t size,
5783 PyObject *mapping,
5784 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786 /* output object */
5787 PyObject *res = NULL;
5788 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005789 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005791 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 PyObject *errorHandler = NULL;
5793 PyObject *exc = NULL;
5794 /* the following variable is used for caching string comparisons
5795 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5796 * 3=ignore, 4=xmlcharrefreplace */
5797 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798
5799 /* Default to Latin-1 */
5800 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 /* allocate enough for a simple encoding without
5804 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005805 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 if (res == NULL)
5807 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005808 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 /* try to encode it */
5813 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5814 if (x==enc_EXCEPTION) /* error */
5815 goto onError;
5816 if (x==enc_FAILED) { /* unencodable character */
5817 if (charmap_encoding_error(p, size, &inpos, mapping,
5818 &exc,
5819 &known_errorHandler, &errorHandler, errors,
5820 &res, &respos)) {
5821 goto onError;
5822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005823 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 else
5825 /* done with this character => adjust input position */
5826 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005830 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 if (_PyBytes_Resize(&res, respos) < 0)
5832 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005833
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 Py_XDECREF(exc);
5835 Py_XDECREF(errorHandler);
5836 return res;
5837
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 Py_XDECREF(res);
5840 Py_XDECREF(exc);
5841 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 return NULL;
5843}
5844
5845PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847{
5848 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 PyErr_BadArgument();
5850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 }
5852 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 PyUnicode_GET_SIZE(unicode),
5854 mapping,
5855 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856}
5857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005858/* create or adjust a UnicodeTranslateError */
5859static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 const Py_UNICODE *unicode, Py_ssize_t size,
5861 Py_ssize_t startpos, Py_ssize_t endpos,
5862 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005865 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 }
5868 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5870 goto onError;
5871 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5872 goto onError;
5873 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5874 goto onError;
5875 return;
5876 onError:
5877 Py_DECREF(*exceptionObject);
5878 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 }
5880}
5881
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882/* raises a UnicodeTranslateError */
5883static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 const Py_UNICODE *unicode, Py_ssize_t size,
5885 Py_ssize_t startpos, Py_ssize_t endpos,
5886 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887{
5888 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005892}
5893
5894/* error handling callback helper:
5895 build arguments, call the callback and check the arguments,
5896 put the result into newpos and return the replacement string, which
5897 has to be freed by the caller */
5898static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 PyObject **errorHandler,
5900 const char *reason,
5901 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5902 Py_ssize_t startpos, Py_ssize_t endpos,
5903 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005905 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005907 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 PyObject *restuple;
5909 PyObject *resunicode;
5910
5911 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915 }
5916
5917 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005920 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005921
5922 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005927 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 Py_DECREF(restuple);
5929 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 }
5931 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 &resunicode, &i_newpos)) {
5933 Py_DECREF(restuple);
5934 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005936 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005938 else
5939 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005940 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5942 Py_DECREF(restuple);
5943 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005944 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005945 Py_INCREF(resunicode);
5946 Py_DECREF(restuple);
5947 return resunicode;
5948}
5949
5950/* Lookup the character ch in the mapping and put the result in result,
5951 which must be decrefed by the caller.
5952 Return 0 on success, -1 on error */
5953static
5954int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5955{
Christian Heimes217cfd12007-12-02 14:31:20 +00005956 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005957 PyObject *x;
5958
5959 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961 x = PyObject_GetItem(mapping, w);
5962 Py_DECREF(w);
5963 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5965 /* No mapping found means: use 1:1 mapping. */
5966 PyErr_Clear();
5967 *result = NULL;
5968 return 0;
5969 } else
5970 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005971 }
5972 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 *result = x;
5974 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005976 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 long value = PyLong_AS_LONG(x);
5978 long max = PyUnicode_GetMax();
5979 if (value < 0 || value > max) {
5980 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005981 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 Py_DECREF(x);
5983 return -1;
5984 }
5985 *result = x;
5986 return 0;
5987 }
5988 else if (PyUnicode_Check(x)) {
5989 *result = x;
5990 return 0;
5991 }
5992 else {
5993 /* wrong return value */
5994 PyErr_SetString(PyExc_TypeError,
5995 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005996 Py_DECREF(x);
5997 return -1;
5998 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005999}
6000/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 if not reallocate and adjust various state variables.
6002 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006003static
Walter Dörwald4894c302003-10-24 14:25:28 +00006004int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006006{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006007 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006008 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 /* remember old output position */
6010 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6011 /* exponentially overallocate to minimize reallocations */
6012 if (requiredsize < 2 * oldsize)
6013 requiredsize = 2 * oldsize;
6014 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6015 return -1;
6016 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017 }
6018 return 0;
6019}
6020/* lookup the character, put the result in the output string and adjust
6021 various state variables. Return a new reference to the object that
6022 was put in the output buffer in *result, or Py_None, if the mapping was
6023 undefined (in which case no character was written).
6024 The called must decref result.
6025 Return 0 on success, -1 on error. */
6026static
Walter Dörwald4894c302003-10-24 14:25:28 +00006027int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6029 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006030{
Walter Dörwald4894c302003-10-24 14:25:28 +00006031 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 /* not found => default to 1:1 mapping */
6035 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036 }
6037 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006039 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 /* no overflow check, because we know that the space is enough */
6041 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 }
6043 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6045 if (repsize==1) {
6046 /* no overflow check, because we know that the space is enough */
6047 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6048 }
6049 else if (repsize!=0) {
6050 /* more than one character */
6051 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6052 (insize - (curinp-startinp)) +
6053 repsize - 1;
6054 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6055 return -1;
6056 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6057 *outp += repsize;
6058 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 }
6060 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006062 return 0;
6063}
6064
6065PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 Py_ssize_t size,
6067 PyObject *mapping,
6068 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 /* output object */
6071 PyObject *res = NULL;
6072 /* pointers to the beginning and end+1 of input */
6073 const Py_UNICODE *startp = p;
6074 const Py_UNICODE *endp = p + size;
6075 /* pointer into the output */
6076 Py_UNICODE *str;
6077 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006078 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 char *reason = "character maps to <undefined>";
6080 PyObject *errorHandler = NULL;
6081 PyObject *exc = NULL;
6082 /* the following variable is used for caching string comparisons
6083 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6084 * 3=ignore, 4=xmlcharrefreplace */
6085 int known_errorHandler = -1;
6086
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 PyErr_BadArgument();
6089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006091
6092 /* allocate enough for a simple 1:1 translation without
6093 replacements, if we need more, we'll resize */
6094 res = PyUnicode_FromUnicode(NULL, size);
6095 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 /* try to encode it */
6103 PyObject *x = NULL;
6104 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6105 Py_XDECREF(x);
6106 goto onError;
6107 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006108 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 if (x!=Py_None) /* it worked => adjust input pointer */
6110 ++p;
6111 else { /* untranslatable character */
6112 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6113 Py_ssize_t repsize;
6114 Py_ssize_t newpos;
6115 Py_UNICODE *uni2;
6116 /* startpos for collecting untranslatable chars */
6117 const Py_UNICODE *collstart = p;
6118 const Py_UNICODE *collend = p+1;
6119 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 /* find all untranslatable characters */
6122 while (collend < endp) {
6123 if (charmaptranslate_lookup(*collend, mapping, &x))
6124 goto onError;
6125 Py_XDECREF(x);
6126 if (x!=Py_None)
6127 break;
6128 ++collend;
6129 }
6130 /* cache callback name lookup
6131 * (if not done yet, i.e. it's the first error) */
6132 if (known_errorHandler==-1) {
6133 if ((errors==NULL) || (!strcmp(errors, "strict")))
6134 known_errorHandler = 1;
6135 else if (!strcmp(errors, "replace"))
6136 known_errorHandler = 2;
6137 else if (!strcmp(errors, "ignore"))
6138 known_errorHandler = 3;
6139 else if (!strcmp(errors, "xmlcharrefreplace"))
6140 known_errorHandler = 4;
6141 else
6142 known_errorHandler = 0;
6143 }
6144 switch (known_errorHandler) {
6145 case 1: /* strict */
6146 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006147 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 case 2: /* replace */
6149 /* No need to check for space, this is a 1:1 replacement */
6150 for (coll = collstart; coll<collend; ++coll)
6151 *str++ = '?';
6152 /* fall through */
6153 case 3: /* ignore */
6154 p = collend;
6155 break;
6156 case 4: /* xmlcharrefreplace */
6157 /* generate replacement (temporarily (mis)uses p) */
6158 for (p = collstart; p < collend; ++p) {
6159 char buffer[2+29+1+1];
6160 char *cp;
6161 sprintf(buffer, "&#%d;", (int)*p);
6162 if (charmaptranslate_makespace(&res, &str,
6163 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6164 goto onError;
6165 for (cp = buffer; *cp; ++cp)
6166 *str++ = *cp;
6167 }
6168 p = collend;
6169 break;
6170 default:
6171 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6172 reason, startp, size, &exc,
6173 collstart-startp, collend-startp, &newpos);
6174 if (repunicode == NULL)
6175 goto onError;
6176 /* generate replacement */
6177 repsize = PyUnicode_GET_SIZE(repunicode);
6178 if (charmaptranslate_makespace(&res, &str,
6179 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6180 Py_DECREF(repunicode);
6181 goto onError;
6182 }
6183 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6184 *str++ = *uni2;
6185 p = startp + newpos;
6186 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006187 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006188 }
6189 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006190 /* Resize if we allocated to much */
6191 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006192 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 if (PyUnicode_Resize(&res, respos) < 0)
6194 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006195 }
6196 Py_XDECREF(exc);
6197 Py_XDECREF(errorHandler);
6198 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201 Py_XDECREF(res);
6202 Py_XDECREF(exc);
6203 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 return NULL;
6205}
6206
6207PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 PyObject *mapping,
6209 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210{
6211 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006212
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 str = PyUnicode_FromObject(str);
6214 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 PyUnicode_GET_SIZE(str),
6218 mapping,
6219 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 Py_DECREF(str);
6221 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006222
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 Py_XDECREF(str);
6225 return NULL;
6226}
Tim Petersced69f82003-09-16 20:30:58 +00006227
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006228PyObject *
6229PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6230 Py_ssize_t length)
6231{
6232 PyObject *result;
6233 Py_UNICODE *p; /* write pointer into result */
6234 Py_ssize_t i;
6235 /* Copy to a new string */
6236 result = (PyObject *)_PyUnicode_New(length);
6237 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6238 if (result == NULL)
6239 return result;
6240 p = PyUnicode_AS_UNICODE(result);
6241 /* Iterate over code points */
6242 for (i = 0; i < length; i++) {
6243 Py_UNICODE ch =s[i];
6244 if (ch > 127) {
6245 int decimal = Py_UNICODE_TODECIMAL(ch);
6246 if (decimal >= 0)
6247 p[i] = '0' + decimal;
6248 }
6249 }
6250 return result;
6251}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006252/* --- Decimal Encoder ---------------------------------------------------- */
6253
6254int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 Py_ssize_t length,
6256 char *output,
6257 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006258{
6259 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006260 PyObject *errorHandler = NULL;
6261 PyObject *exc = NULL;
6262 const char *encoding = "decimal";
6263 const char *reason = "invalid decimal Unicode string";
6264 /* the following variable is used for caching string comparisons
6265 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6266 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006267
6268 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 PyErr_BadArgument();
6270 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006271 }
6272
6273 p = s;
6274 end = s + length;
6275 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 register Py_UNICODE ch = *p;
6277 int decimal;
6278 PyObject *repunicode;
6279 Py_ssize_t repsize;
6280 Py_ssize_t newpos;
6281 Py_UNICODE *uni2;
6282 Py_UNICODE *collstart;
6283 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006284
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006286 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 ++p;
6288 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 decimal = Py_UNICODE_TODECIMAL(ch);
6291 if (decimal >= 0) {
6292 *output++ = '0' + decimal;
6293 ++p;
6294 continue;
6295 }
6296 if (0 < ch && ch < 256) {
6297 *output++ = (char)ch;
6298 ++p;
6299 continue;
6300 }
6301 /* All other characters are considered unencodable */
6302 collstart = p;
6303 collend = p+1;
6304 while (collend < end) {
6305 if ((0 < *collend && *collend < 256) ||
6306 !Py_UNICODE_ISSPACE(*collend) ||
6307 Py_UNICODE_TODECIMAL(*collend))
6308 break;
6309 }
6310 /* cache callback name lookup
6311 * (if not done yet, i.e. it's the first error) */
6312 if (known_errorHandler==-1) {
6313 if ((errors==NULL) || (!strcmp(errors, "strict")))
6314 known_errorHandler = 1;
6315 else if (!strcmp(errors, "replace"))
6316 known_errorHandler = 2;
6317 else if (!strcmp(errors, "ignore"))
6318 known_errorHandler = 3;
6319 else if (!strcmp(errors, "xmlcharrefreplace"))
6320 known_errorHandler = 4;
6321 else
6322 known_errorHandler = 0;
6323 }
6324 switch (known_errorHandler) {
6325 case 1: /* strict */
6326 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6327 goto onError;
6328 case 2: /* replace */
6329 for (p = collstart; p < collend; ++p)
6330 *output++ = '?';
6331 /* fall through */
6332 case 3: /* ignore */
6333 p = collend;
6334 break;
6335 case 4: /* xmlcharrefreplace */
6336 /* generate replacement (temporarily (mis)uses p) */
6337 for (p = collstart; p < collend; ++p)
6338 output += sprintf(output, "&#%d;", (int)*p);
6339 p = collend;
6340 break;
6341 default:
6342 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6343 encoding, reason, s, length, &exc,
6344 collstart-s, collend-s, &newpos);
6345 if (repunicode == NULL)
6346 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006347 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006348 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006349 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6350 Py_DECREF(repunicode);
6351 goto onError;
6352 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 /* generate replacement */
6354 repsize = PyUnicode_GET_SIZE(repunicode);
6355 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6356 Py_UNICODE ch = *uni2;
6357 if (Py_UNICODE_ISSPACE(ch))
6358 *output++ = ' ';
6359 else {
6360 decimal = Py_UNICODE_TODECIMAL(ch);
6361 if (decimal >= 0)
6362 *output++ = '0' + decimal;
6363 else if (0 < ch && ch < 256)
6364 *output++ = (char)ch;
6365 else {
6366 Py_DECREF(repunicode);
6367 raise_encode_exception(&exc, encoding,
6368 s, length, collstart-s, collend-s, reason);
6369 goto onError;
6370 }
6371 }
6372 }
6373 p = s + newpos;
6374 Py_DECREF(repunicode);
6375 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006376 }
6377 /* 0-terminate the output string */
6378 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379 Py_XDECREF(exc);
6380 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006381 return 0;
6382
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384 Py_XDECREF(exc);
6385 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006386 return -1;
6387}
6388
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389/* --- Helpers ------------------------------------------------------------ */
6390
Eric Smith8c663262007-08-25 02:26:07 +00006391#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006392#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006393
Thomas Wouters477c8d52006-05-27 19:21:47 +00006394#include "stringlib/count.h"
6395#include "stringlib/find.h"
6396#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006397#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006398
Eric Smith5807c412008-05-11 21:00:57 +00006399#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006400#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006401#include "stringlib/localeutil.h"
6402
/* helper macro to fixup start/end slice values, in place:
     - end greater than len is clamped to len;
     - a negative end or start has len added and is clamped at 0.
   start and end must be modifiable lvalues and are evaluated more than
   once.  The body is wrapped in do { } while (0) so the macro behaves
   as a single statement (safe in an unbraced if/else); it must be
   followed by a semicolon.
   NOTE(review): the #undef guards against an earlier definition of the
   same name from the stringlib headers included above, if there is
   one -- confirm. */
#undef ADJUST_INDICES
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006417
Martin v. Löwis18e16552006-02-15 17:27:45 +00006418Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006419 PyObject *substr,
6420 Py_ssize_t start,
6421 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006423 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006424 PyUnicodeObject* str_obj;
6425 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006426
Thomas Wouters477c8d52006-05-27 19:21:47 +00006427 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6428 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006430 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6431 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 Py_DECREF(str_obj);
6433 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 }
Tim Petersced69f82003-09-16 20:30:58 +00006435
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006436 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006437 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006438 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6439 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006440 );
6441
6442 Py_DECREF(sub_obj);
6443 Py_DECREF(str_obj);
6444
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 return result;
6446}
6447
Martin v. Löwis18e16552006-02-15 17:27:45 +00006448Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449 PyObject *sub,
6450 Py_ssize_t start,
6451 Py_ssize_t end,
6452 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006454 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006455
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006457 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006459 sub = PyUnicode_FromObject(sub);
6460 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 Py_DECREF(str);
6462 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 }
Tim Petersced69f82003-09-16 20:30:58 +00006464
Thomas Wouters477c8d52006-05-27 19:21:47 +00006465 if (direction > 0)
6466 result = stringlib_find_slice(
6467 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6468 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6469 start, end
6470 );
6471 else
6472 result = stringlib_rfind_slice(
6473 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6474 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6475 start, end
6476 );
6477
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006479 Py_DECREF(sub);
6480
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 return result;
6482}
6483
Tim Petersced69f82003-09-16 20:30:58 +00006484static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 PyUnicodeObject *substring,
6487 Py_ssize_t start,
6488 Py_ssize_t end,
6489 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 if (substring->length == 0)
6492 return 1;
6493
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006494 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 end -= substring->length;
6496 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
6499 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 if (Py_UNICODE_MATCH(self, end, substring))
6501 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 } else {
6503 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 }
6506
6507 return 0;
6508}
6509
Martin v. Löwis18e16552006-02-15 17:27:45 +00006510Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 PyObject *substr,
6512 Py_ssize_t start,
6513 Py_ssize_t end,
6514 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006516 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006517
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 str = PyUnicode_FromObject(str);
6519 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 substr = PyUnicode_FromObject(substr);
6522 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 Py_DECREF(str);
6524 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 }
Tim Petersced69f82003-09-16 20:30:58 +00006526
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 (PyUnicodeObject *)substr,
6529 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 Py_DECREF(str);
6531 Py_DECREF(substr);
6532 return result;
6533}
6534
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535/* Apply fixfct filter to the Unicode object self and return a
6536 reference to the modified object */
6537
Tim Petersced69f82003-09-16 20:30:58 +00006538static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541{
6542
6543 PyUnicodeObject *u;
6544
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006545 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006548
6549 Py_UNICODE_COPY(u->str, self->str, self->length);
6550
Tim Peters7a29bd52001-09-12 03:03:31 +00006551 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 /* fixfct should return TRUE if it modified the buffer. If
6553 FALSE, return a reference to the original buffer instead
6554 (to save space, not time) */
6555 Py_INCREF(self);
6556 Py_DECREF(u);
6557 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 }
6559 return (PyObject*) u;
6560}
6561
Tim Petersced69f82003-09-16 20:30:58 +00006562static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563int fixupper(PyUnicodeObject *self)
6564{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006565 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 Py_UNICODE *s = self->str;
6567 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006568
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006571
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 ch = Py_UNICODE_TOUPPER(*s);
6573 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 *s = ch;
6576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 s++;
6578 }
6579
6580 return status;
6581}
6582
Tim Petersced69f82003-09-16 20:30:58 +00006583static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584int fixlower(PyUnicodeObject *self)
6585{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006586 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 Py_UNICODE *s = self->str;
6588 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006589
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006592
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 ch = Py_UNICODE_TOLOWER(*s);
6594 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 *s = ch;
6597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 s++;
6599 }
6600
6601 return status;
6602}
6603
Tim Petersced69f82003-09-16 20:30:58 +00006604static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605int fixswapcase(PyUnicodeObject *self)
6606{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006607 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 Py_UNICODE *s = self->str;
6609 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006610
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 while (len-- > 0) {
6612 if (Py_UNICODE_ISUPPER(*s)) {
6613 *s = Py_UNICODE_TOLOWER(*s);
6614 status = 1;
6615 } else if (Py_UNICODE_ISLOWER(*s)) {
6616 *s = Py_UNICODE_TOUPPER(*s);
6617 status = 1;
6618 }
6619 s++;
6620 }
6621
6622 return status;
6623}
6624
Tim Petersced69f82003-09-16 20:30:58 +00006625static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626int fixcapitalize(PyUnicodeObject *self)
6627{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006628 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006629 Py_UNICODE *s = self->str;
6630 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006631
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006632 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006634 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 *s = Py_UNICODE_TOUPPER(*s);
6636 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006638 s++;
6639 while (--len > 0) {
6640 if (Py_UNICODE_ISUPPER(*s)) {
6641 *s = Py_UNICODE_TOLOWER(*s);
6642 status = 1;
6643 }
6644 s++;
6645 }
6646 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
6649static
6650int fixtitle(PyUnicodeObject *self)
6651{
6652 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6653 register Py_UNICODE *e;
6654 int previous_is_cased;
6655
6656 /* Shortcut for single character strings */
6657 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6659 if (*p != ch) {
6660 *p = ch;
6661 return 1;
6662 }
6663 else
6664 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 }
Tim Petersced69f82003-09-16 20:30:58 +00006666
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 e = p + PyUnicode_GET_SIZE(self);
6668 previous_is_cased = 0;
6669 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006671
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 if (previous_is_cased)
6673 *p = Py_UNICODE_TOLOWER(ch);
6674 else
6675 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006676
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 if (Py_UNICODE_ISLOWER(ch) ||
6678 Py_UNICODE_ISUPPER(ch) ||
6679 Py_UNICODE_ISTITLE(ch))
6680 previous_is_cased = 1;
6681 else
6682 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 }
6684 return 1;
6685}
6686
Tim Peters8ce9f162004-08-27 01:49:32 +00006687PyObject *
6688PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
Skip Montanaro6543b452004-09-16 03:28:13 +00006690 const Py_UNICODE blank = ' ';
6691 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006692 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006693 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006694 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6695 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006696 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6697 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006698 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006699 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700
Tim Peters05eba1f2004-08-27 21:32:02 +00006701 fseq = PySequence_Fast(seq, "");
6702 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006703 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006704 }
6705
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006706 /* NOTE: the following code can't call back into Python code,
6707 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006708 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006709
Tim Peters05eba1f2004-08-27 21:32:02 +00006710 seqlen = PySequence_Fast_GET_SIZE(fseq);
6711 /* If empty sequence, return u"". */
6712 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006713 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6714 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006715 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006716 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006717 /* If singleton sequence with an exact Unicode, return that. */
6718 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 item = items[0];
6720 if (PyUnicode_CheckExact(item)) {
6721 Py_INCREF(item);
6722 res = (PyUnicodeObject *)item;
6723 goto Done;
6724 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006725 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006726 else {
6727 /* Set up sep and seplen */
6728 if (separator == NULL) {
6729 sep = &blank;
6730 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006731 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006732 else {
6733 if (!PyUnicode_Check(separator)) {
6734 PyErr_Format(PyExc_TypeError,
6735 "separator: expected str instance,"
6736 " %.80s found",
6737 Py_TYPE(separator)->tp_name);
6738 goto onError;
6739 }
6740 sep = PyUnicode_AS_UNICODE(separator);
6741 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006742 }
6743 }
6744
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006745 /* There are at least two things to join, or else we have a subclass
6746 * of str in the sequence.
6747 * Do a pre-pass to figure out the total amount of space we'll
6748 * need (sz), and see whether all argument are strings.
6749 */
6750 sz = 0;
6751 for (i = 0; i < seqlen; i++) {
6752 const Py_ssize_t old_sz = sz;
6753 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 if (!PyUnicode_Check(item)) {
6755 PyErr_Format(PyExc_TypeError,
6756 "sequence item %zd: expected str instance,"
6757 " %.80s found",
6758 i, Py_TYPE(item)->tp_name);
6759 goto onError;
6760 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006761 sz += PyUnicode_GET_SIZE(item);
6762 if (i != 0)
6763 sz += seplen;
6764 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6765 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006767 goto onError;
6768 }
6769 }
Tim Petersced69f82003-09-16 20:30:58 +00006770
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006771 res = _PyUnicode_New(sz);
6772 if (res == NULL)
6773 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006774
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006775 /* Catenate everything. */
6776 res_p = PyUnicode_AS_UNICODE(res);
6777 for (i = 0; i < seqlen; ++i) {
6778 Py_ssize_t itemlen;
6779 item = items[i];
6780 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 /* Copy item, and maybe the separator. */
6782 if (i) {
6783 Py_UNICODE_COPY(res_p, sep, seplen);
6784 res_p += seplen;
6785 }
6786 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6787 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006788 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006789
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006791 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 return (PyObject *)res;
6793
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006795 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006796 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797 return NULL;
6798}
6799
Tim Petersced69f82003-09-16 20:30:58 +00006800static
6801PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 Py_ssize_t left,
6803 Py_ssize_t right,
6804 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805{
6806 PyUnicodeObject *u;
6807
6808 if (left < 0)
6809 left = 0;
6810 if (right < 0)
6811 right = 0;
6812
Tim Peters7a29bd52001-09-12 03:03:31 +00006813 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 Py_INCREF(self);
6815 return self;
6816 }
6817
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006818 if (left > PY_SSIZE_T_MAX - self->length ||
6819 right > PY_SSIZE_T_MAX - (left + self->length)) {
6820 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6821 return NULL;
6822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 u = _PyUnicode_New(left + self->length + right);
6824 if (u) {
6825 if (left)
6826 Py_UNICODE_FILL(u->str, fill, left);
6827 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6828 if (right)
6829 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6830 }
6831
6832 return u;
6833}
6834
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006835PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838
6839 string = PyUnicode_FromObject(string);
6840 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006843 list = stringlib_splitlines(
6844 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6845 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846
6847 Py_DECREF(string);
6848 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849}
6850
Tim Petersced69f82003-09-16 20:30:58 +00006851static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 PyUnicodeObject *substring,
6854 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006857 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006860 return stringlib_split_whitespace(
6861 (PyObject*) self, self->str, self->length, maxcount
6862 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006864 return stringlib_split(
6865 (PyObject*) self, self->str, self->length,
6866 substring->str, substring->length,
6867 maxcount
6868 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869}
6870
Tim Petersced69f82003-09-16 20:30:58 +00006871static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006872PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 PyUnicodeObject *substring,
6874 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006875{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006876 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006877 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006878
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006879 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006880 return stringlib_rsplit_whitespace(
6881 (PyObject*) self, self->str, self->length, maxcount
6882 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006883
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006884 return stringlib_rsplit(
6885 (PyObject*) self, self->str, self->length,
6886 substring->str, substring->length,
6887 maxcount
6888 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006889}
6890
6891static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 PyUnicodeObject *str1,
6894 PyUnicodeObject *str2,
6895 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896{
6897 PyUnicodeObject *u;
6898
6899 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006901 else if (maxcount == 0 || self->length == 0)
6902 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903
Thomas Wouters477c8d52006-05-27 19:21:47 +00006904 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006905 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006906 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006907 if (str1->length == 0)
6908 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006909 if (str1->length == 1) {
6910 /* replace characters */
6911 Py_UNICODE u1, u2;
6912 if (!findchar(self->str, self->length, str1->str[0]))
6913 goto nothing;
6914 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6915 if (!u)
6916 return NULL;
6917 Py_UNICODE_COPY(u->str, self->str, self->length);
6918 u1 = str1->str[0];
6919 u2 = str2->str[0];
6920 for (i = 0; i < u->length; i++)
6921 if (u->str[i] == u1) {
6922 if (--maxcount < 0)
6923 break;
6924 u->str[i] = u2;
6925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006927 i = stringlib_find(
6928 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006930 if (i < 0)
6931 goto nothing;
6932 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6933 if (!u)
6934 return NULL;
6935 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006936
6937 /* change everything in-place, starting with this one */
6938 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6939 i += str1->length;
6940
6941 while ( --maxcount > 0) {
6942 i = stringlib_find(self->str+i, self->length-i,
6943 str1->str, str1->length,
6944 i);
6945 if (i == -1)
6946 break;
6947 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6948 i += str1->length;
6949 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006952
6953 Py_ssize_t n, i, j, e;
6954 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 Py_UNICODE *p;
6956
6957 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006958 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6959 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006960 if (n == 0)
6961 goto nothing;
6962 /* new_size = self->length + n * (str2->length - str1->length)); */
6963 delta = (str2->length - str1->length);
6964 if (delta == 0) {
6965 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006967 product = n * (str2->length - str1->length);
6968 if ((product / (str2->length - str1->length)) != n) {
6969 PyErr_SetString(PyExc_OverflowError,
6970 "replace string is too long");
6971 return NULL;
6972 }
6973 new_size = self->length + product;
6974 if (new_size < 0) {
6975 PyErr_SetString(PyExc_OverflowError,
6976 "replace string is too long");
6977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 }
6979 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006980 u = _PyUnicode_New(new_size);
6981 if (!u)
6982 return NULL;
6983 i = 0;
6984 p = u->str;
6985 e = self->length - str1->length;
6986 if (str1->length > 0) {
6987 while (n-- > 0) {
6988 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006989 j = stringlib_find(self->str+i, self->length-i,
6990 str1->str, str1->length,
6991 i);
6992 if (j == -1)
6993 break;
6994 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006995 /* copy unchanged part [i:j] */
6996 Py_UNICODE_COPY(p, self->str+i, j-i);
6997 p += j - i;
6998 }
6999 /* copy substitution string */
7000 if (str2->length > 0) {
7001 Py_UNICODE_COPY(p, str2->str, str2->length);
7002 p += str2->length;
7003 }
7004 i = j + str1->length;
7005 }
7006 if (i < self->length)
7007 /* copy tail [i:] */
7008 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7009 } else {
7010 /* interleave */
7011 while (n > 0) {
7012 Py_UNICODE_COPY(p, str2->str, str2->length);
7013 p += str2->length;
7014 if (--n <= 0)
7015 break;
7016 *p++ = self->str[i++];
7017 }
7018 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007022
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007024 /* nothing to replace; return original string (when possible) */
7025 if (PyUnicode_CheckExact(self)) {
7026 Py_INCREF(self);
7027 return (PyObject *) self;
7028 }
7029 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030}
7031
7032/* --- Unicode Object Methods --------------------------------------------- */
7033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007034PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036\n\
7037Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007038characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039
7040static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007041unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 return fixup(self, fixtitle);
7044}
7045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007046PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048\n\
7049Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007050have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051
7052static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007053unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 return fixup(self, fixcapitalize);
7056}
7057
#if 0
/* Compiled out: this whole section is excluded by the enclosing #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, run every word through fixup() with
   fixcapitalize, and join the words again with the default separator.
   Returns NULL if splitting or any fixup() call fails; otherwise returns
   the result of PyUnicode_Join(). */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL)
            goto finish;        /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  finish:
    Py_DECREF(words);
    return result;
}
#endif
7095
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007096/* Argument converter. Coerces to a single unicode character */
7097
7098static int
7099convert_uc(PyObject *obj, void *addr)
7100{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007101 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7102 PyObject *uniobj;
7103 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007104
Benjamin Peterson14339b62009-01-31 16:36:08 +00007105 uniobj = PyUnicode_FromObject(obj);
7106 if (uniobj == NULL) {
7107 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007108 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007109 return 0;
7110 }
7111 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7112 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007114 Py_DECREF(uniobj);
7115 return 0;
7116 }
7117 unistr = PyUnicode_AS_UNICODE(uniobj);
7118 *fillcharloc = unistr[0];
7119 Py_DECREF(uniobj);
7120 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007121}
7122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007123PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007126Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007127done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128
7129static PyObject *
7130unicode_center(PyUnicodeObject *self, PyObject *args)
7131{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007132 Py_ssize_t marg, left;
7133 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007134 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135
Thomas Woutersde017742006-02-16 19:34:37 +00007136 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 return NULL;
7138
Tim Peters7a29bd52001-09-12 03:03:31 +00007139 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 Py_INCREF(self);
7141 return (PyObject*) self;
7142 }
7143
7144 marg = width - self->length;
7145 left = marg / 2 + (marg & width & 1);
7146
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007147 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148}
7149
Marc-André Lemburge5034372000-08-08 08:04:29 +00007150#if 0
7151
7152/* This code should go into some future Unicode collation support
7153 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007154 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007155
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007156/* speedy UTF-16 code point order comparison */
7157/* gleaned from: */
7158/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7159
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007160static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007161{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007162 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007163 0, 0, 0, 0, 0, 0, 0, 0,
7164 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007165 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007166};
7167
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168static int
7169unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7170{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007171 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007172
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 Py_UNICODE *s1 = str1->str;
7174 Py_UNICODE *s2 = str2->str;
7175
7176 len1 = str1->length;
7177 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007178
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007180 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007181
7182 c1 = *s1++;
7183 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007184
Benjamin Peterson29060642009-01-31 22:14:21 +00007185 if (c1 > (1<<11) * 26)
7186 c1 += utf16Fixup[c1>>11];
7187 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007188 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007189 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007190
7191 if (c1 != c2)
7192 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007193
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007194 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 }
7196
7197 return (len1 < len2) ? -1 : (len1 != len2);
7198}
7199
Marc-André Lemburge5034372000-08-08 08:04:29 +00007200#else
7201
7202static int
7203unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7204{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007206
7207 Py_UNICODE *s1 = str1->str;
7208 Py_UNICODE *s2 = str2->str;
7209
7210 len1 = str1->length;
7211 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007212
Marc-André Lemburge5034372000-08-08 08:04:29 +00007213 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007214 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007215
Fredrik Lundh45714e92001-06-26 16:39:36 +00007216 c1 = *s1++;
7217 c2 = *s2++;
7218
7219 if (c1 != c2)
7220 return (c1 < c2) ? -1 : 1;
7221
Marc-André Lemburge5034372000-08-08 08:04:29 +00007222 len1--; len2--;
7223 }
7224
7225 return (len1 < len2) ? -1 : (len1 != len2);
7226}
7227
7228#endif
7229
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007233 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7234 return unicode_compare((PyUnicodeObject *)left,
7235 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007236 PyErr_Format(PyExc_TypeError,
7237 "Can't compare %.100s and %.100s",
7238 left->ob_type->tp_name,
7239 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 return -1;
7241}
7242
Martin v. Löwis5b222132007-06-10 09:51:05 +00007243int
7244PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7245{
7246 int i;
7247 Py_UNICODE *id;
7248 assert(PyUnicode_Check(uni));
7249 id = PyUnicode_AS_UNICODE(uni);
7250 /* Compare Unicode string and source character set string */
7251 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007252 if (id[i] != str[i])
7253 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007254 /* This check keeps Python strings that end in '\0' from comparing equal
7255 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007256 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007258 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007260 return 0;
7261}
7262
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007263
Benjamin Peterson29060642009-01-31 22:14:21 +00007264#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007265 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007266
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007267PyObject *PyUnicode_RichCompare(PyObject *left,
7268 PyObject *right,
7269 int op)
7270{
7271 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007272
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007273 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7274 PyObject *v;
7275 if (((PyUnicodeObject *) left)->length !=
7276 ((PyUnicodeObject *) right)->length) {
7277 if (op == Py_EQ) {
7278 Py_INCREF(Py_False);
7279 return Py_False;
7280 }
7281 if (op == Py_NE) {
7282 Py_INCREF(Py_True);
7283 return Py_True;
7284 }
7285 }
7286 if (left == right)
7287 result = 0;
7288 else
7289 result = unicode_compare((PyUnicodeObject *)left,
7290 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007291
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007292 /* Convert the return value to a Boolean */
7293 switch (op) {
7294 case Py_EQ:
7295 v = TEST_COND(result == 0);
7296 break;
7297 case Py_NE:
7298 v = TEST_COND(result != 0);
7299 break;
7300 case Py_LE:
7301 v = TEST_COND(result <= 0);
7302 break;
7303 case Py_GE:
7304 v = TEST_COND(result >= 0);
7305 break;
7306 case Py_LT:
7307 v = TEST_COND(result == -1);
7308 break;
7309 case Py_GT:
7310 v = TEST_COND(result == 1);
7311 break;
7312 default:
7313 PyErr_BadArgument();
7314 return NULL;
7315 }
7316 Py_INCREF(v);
7317 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007318 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007319
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007320 Py_INCREF(Py_NotImplemented);
7321 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007322}
7323
Guido van Rossum403d68b2000-03-13 15:55:09 +00007324int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007326{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007327 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007328 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007329
7330 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007331 sub = PyUnicode_FromObject(element);
7332 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 PyErr_Format(PyExc_TypeError,
7334 "'in <string>' requires string as left operand, not %s",
7335 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007336 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007337 }
7338
Thomas Wouters477c8d52006-05-27 19:21:47 +00007339 str = PyUnicode_FromObject(container);
7340 if (!str) {
7341 Py_DECREF(sub);
7342 return -1;
7343 }
7344
7345 result = stringlib_contains_obj(str, sub);
7346
7347 Py_DECREF(str);
7348 Py_DECREF(sub);
7349
Guido van Rossum403d68b2000-03-13 15:55:09 +00007350 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007351}
7352
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353/* Concat to string or Unicode object giving a new Unicode object. */
7354
7355PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357{
7358 PyUnicodeObject *u = NULL, *v = NULL, *w;
7359
7360 /* Coerce the two arguments */
7361 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7362 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7365 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
7368 /* Shortcuts */
7369 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 Py_DECREF(v);
7371 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 }
7373 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 Py_DECREF(u);
7375 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 }
7377
7378 /* Concat the two Unicode strings */
7379 w = _PyUnicode_New(u->length + v->length);
7380 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 Py_UNICODE_COPY(w->str, u->str, u->length);
7383 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7384
7385 Py_DECREF(u);
7386 Py_DECREF(v);
7387 return (PyObject *)w;
7388
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 Py_XDECREF(u);
7391 Py_XDECREF(v);
7392 return NULL;
7393}
7394
Walter Dörwald1ab83302007-05-18 17:15:44 +00007395void
7396PyUnicode_Append(PyObject **pleft, PyObject *right)
7397{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007398 PyObject *new;
7399 if (*pleft == NULL)
7400 return;
7401 if (right == NULL || !PyUnicode_Check(*pleft)) {
7402 Py_DECREF(*pleft);
7403 *pleft = NULL;
7404 return;
7405 }
7406 new = PyUnicode_Concat(*pleft, right);
7407 Py_DECREF(*pleft);
7408 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007409}
7410
7411void
7412PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7413{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007414 PyUnicode_Append(pleft, right);
7415 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007416}
7417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007418PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007421Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007422string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007423interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424
7425static PyObject *
7426unicode_count(PyUnicodeObject *self, PyObject *args)
7427{
7428 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007429 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007430 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 PyObject *result;
7432
Guido van Rossumb8872e62000-05-09 14:14:27 +00007433 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435 return NULL;
7436
7437 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007438 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007441
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007442 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007443 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007444 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007445 substring->str, substring->length,
7446 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007447 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
7449 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007450
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 return result;
7452}
7453
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007454PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007455 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007457Encode S using the codec registered for encoding. Default encoding\n\
7458is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007459handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007460a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7461'xmlcharrefreplace' as well as any other name registered with\n\
7462codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463
7464static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007465unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007467 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 char *encoding = NULL;
7469 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007470
Benjamin Peterson308d6372009-09-18 21:42:35 +00007471 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7472 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007474 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007475}
7476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007477PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479\n\
7480Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007481If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
7483static PyObject*
7484unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7485{
7486 Py_UNICODE *e;
7487 Py_UNICODE *p;
7488 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007489 Py_UNICODE *qe;
7490 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 PyUnicodeObject *u;
7492 int tabsize = 8;
7493
7494 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496
Thomas Wouters7e474022000-07-16 12:04:32 +00007497 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007498 i = 0; /* chars up to and including most recent \n or \r */
7499 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7500 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 for (p = self->str; p < e; p++)
7502 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 if (tabsize > 0) {
7504 incr = tabsize - (j % tabsize); /* cannot overflow */
7505 if (j > PY_SSIZE_T_MAX - incr)
7506 goto overflow1;
7507 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007508 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 if (j > PY_SSIZE_T_MAX - 1)
7512 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 j++;
7514 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 if (i > PY_SSIZE_T_MAX - j)
7516 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007518 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519 }
7520 }
7521
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007522 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007524
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525 /* Second pass: create output string and fill it */
7526 u = _PyUnicode_New(i + j);
7527 if (!u)
7528 return NULL;
7529
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007530 j = 0; /* same as in first pass */
7531 q = u->str; /* next output char */
7532 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
7534 for (p = self->str; p < e; p++)
7535 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 if (tabsize > 0) {
7537 i = tabsize - (j % tabsize);
7538 j += i;
7539 while (i--) {
7540 if (q >= qe)
7541 goto overflow2;
7542 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007543 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007545 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 else {
7547 if (q >= qe)
7548 goto overflow2;
7549 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007550 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 if (*p == '\n' || *p == '\r')
7552 j = 0;
7553 }
7554
7555 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007556
7557 overflow2:
7558 Py_DECREF(u);
7559 overflow1:
7560 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562}
7563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007564PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566\n\
7567Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007568such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569arguments start and end are interpreted as in slice notation.\n\
7570\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
7573static PyObject *
7574unicode_find(PyUnicodeObject *self, PyObject *args)
7575{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007576 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007577 Py_ssize_t start;
7578 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007579 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580
Christian Heimes9cd17752007-11-18 19:35:23 +00007581 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583
Thomas Wouters477c8d52006-05-27 19:21:47 +00007584 result = stringlib_find_slice(
7585 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7586 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7587 start, end
7588 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589
7590 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007591
Christian Heimes217cfd12007-12-02 14:31:20 +00007592 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593}
7594
7595static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007596unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597{
7598 if (index < 0 || index >= self->length) {
7599 PyErr_SetString(PyExc_IndexError, "string index out of range");
7600 return NULL;
7601 }
7602
7603 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7604}
7605
Guido van Rossumc2504932007-09-18 19:42:40 +00007606/* Believe it or not, this produces the same value for ASCII strings
7607 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007608static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007609unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610{
Guido van Rossumc2504932007-09-18 19:42:40 +00007611 Py_ssize_t len;
7612 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007613 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007614
7615 if (self->hash != -1)
7616 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007617 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007618 p = self->str;
7619 x = *p << 7;
7620 while (--len >= 0)
7621 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007622 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007623 if (x == -1)
7624 x = -2;
7625 self->hash = x;
7626 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627}
7628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007629PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007632Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
7634static PyObject *
7635unicode_index(PyUnicodeObject *self, PyObject *args)
7636{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007637 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007638 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007639 Py_ssize_t start;
7640 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
Christian Heimes9cd17752007-11-18 19:35:23 +00007642 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644
Thomas Wouters477c8d52006-05-27 19:21:47 +00007645 result = stringlib_find_slice(
7646 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7647 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7648 start, end
7649 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
7651 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007652
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653 if (result < 0) {
7654 PyErr_SetString(PyExc_ValueError, "substring not found");
7655 return NULL;
7656 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007657
Christian Heimes217cfd12007-12-02 14:31:20 +00007658 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659}
7660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007661PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007664Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007665at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666
7667static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007668unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669{
7670 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7671 register const Py_UNICODE *e;
7672 int cased;
7673
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 /* Shortcut for single character strings */
7675 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007678 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007679 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007681
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 e = p + PyUnicode_GET_SIZE(self);
7683 cased = 0;
7684 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007686
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7688 return PyBool_FromLong(0);
7689 else if (!cased && Py_UNICODE_ISLOWER(ch))
7690 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007692 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693}
7694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007695PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007698Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007699at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700
7701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007702unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703{
7704 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7705 register const Py_UNICODE *e;
7706 int cased;
7707
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 /* Shortcut for single character strings */
7709 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007712 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007713 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007715
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 e = p + PyUnicode_GET_SIZE(self);
7717 cased = 0;
7718 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007720
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7722 return PyBool_FromLong(0);
7723 else if (!cased && Py_UNICODE_ISUPPER(ch))
7724 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007726 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727}
7728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007729PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007732Return True if S is a titlecased string and there is at least one\n\
7733character in S, i.e. upper- and titlecase characters may only\n\
7734follow uncased characters and lowercase characters only cased ones.\n\
7735Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
7737static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007738unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739{
7740 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7741 register const Py_UNICODE *e;
7742 int cased, previous_is_cased;
7743
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 /* Shortcut for single character strings */
7745 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7747 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007749 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007750 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007752
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 e = p + PyUnicode_GET_SIZE(self);
7754 cased = 0;
7755 previous_is_cased = 0;
7756 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007758
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7760 if (previous_is_cased)
7761 return PyBool_FromLong(0);
7762 previous_is_cased = 1;
7763 cased = 1;
7764 }
7765 else if (Py_UNICODE_ISLOWER(ch)) {
7766 if (!previous_is_cased)
7767 return PyBool_FromLong(0);
7768 previous_is_cased = 1;
7769 cased = 1;
7770 }
7771 else
7772 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007774 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775}
7776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007777PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007780Return True if all characters in S are whitespace\n\
7781and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782
7783static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007784unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785{
7786 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7787 register const Py_UNICODE *e;
7788
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 /* Shortcut for single character strings */
7790 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 Py_UNICODE_ISSPACE(*p))
7792 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007794 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007795 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007797
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 e = p + PyUnicode_GET_SIZE(self);
7799 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 if (!Py_UNICODE_ISSPACE(*p))
7801 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007803 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804}
7805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007806PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007808\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007809Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007810and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007811
7812static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007813unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007814{
7815 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7816 register const Py_UNICODE *e;
7817
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007818 /* Shortcut for single character strings */
7819 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 Py_UNICODE_ISALPHA(*p))
7821 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007822
7823 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007824 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007826
7827 e = p + PyUnicode_GET_SIZE(self);
7828 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 if (!Py_UNICODE_ISALPHA(*p))
7830 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007831 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007832 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007833}
7834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007835PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007837\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007838Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007839and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007840
7841static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007842unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007843{
7844 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7845 register const Py_UNICODE *e;
7846
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007847 /* Shortcut for single character strings */
7848 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 Py_UNICODE_ISALNUM(*p))
7850 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007851
7852 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007853 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007855
7856 e = p + PyUnicode_GET_SIZE(self);
7857 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 if (!Py_UNICODE_ISALNUM(*p))
7859 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007860 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007861 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007862}
7863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007864PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007867Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007868False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869
7870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007871unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872{
7873 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7874 register const Py_UNICODE *e;
7875
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876 /* Shortcut for single character strings */
7877 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 Py_UNICODE_ISDECIMAL(*p))
7879 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007881 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007882 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007884
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885 e = p + PyUnicode_GET_SIZE(self);
7886 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 if (!Py_UNICODE_ISDECIMAL(*p))
7888 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007890 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891}
7892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007893PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007896Return True if all characters in S are digits\n\
7897and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898
7899static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007900unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901{
7902 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7903 register const Py_UNICODE *e;
7904
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 /* Shortcut for single character strings */
7906 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 Py_UNICODE_ISDIGIT(*p))
7908 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007910 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007911 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007913
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 e = p + PyUnicode_GET_SIZE(self);
7915 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 if (!Py_UNICODE_ISDIGIT(*p))
7917 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007919 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920}
7921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007922PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007925Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007926False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927
7928static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007929unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930{
7931 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7932 register const Py_UNICODE *e;
7933
Guido van Rossumd57fd912000-03-10 22:53:23 +00007934 /* Shortcut for single character strings */
7935 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 Py_UNICODE_ISNUMERIC(*p))
7937 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007939 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007940 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007942
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943 e = p + PyUnicode_GET_SIZE(self);
7944 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 if (!Py_UNICODE_ISNUMERIC(*p))
7946 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007948 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949}
7950
Martin v. Löwis47383402007-08-15 07:32:56 +00007951int
7952PyUnicode_IsIdentifier(PyObject *self)
7953{
7954 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7955 register const Py_UNICODE *e;
7956
7957 /* Special case for empty strings */
7958 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007960
7961 /* PEP 3131 says that the first character must be in
7962 XID_Start and subsequent characters in XID_Continue,
7963 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007964 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007965 letters, digits, underscore). However, given the current
7966 definition of XID_Start and XID_Continue, it is sufficient
7967 to check just for these, except that _ must be allowed
7968 as starting an identifier. */
7969 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7970 return 0;
7971
7972 e = p + PyUnicode_GET_SIZE(self);
7973 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 if (!_PyUnicode_IsXidContinue(*p))
7975 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007976 }
7977 return 1;
7978}
7979
7980PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007982\n\
7983Return True if S is a valid identifier according\n\
7984to the language definition.");
7985
7986static PyObject*
7987unicode_isidentifier(PyObject *self)
7988{
7989 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7990}
7991
Georg Brandl559e5d72008-06-11 18:37:52 +00007992PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007994\n\
7995Return True if all characters in S are considered\n\
7996printable in repr() or S is empty, False otherwise.");
7997
7998static PyObject*
7999unicode_isprintable(PyObject *self)
8000{
8001 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8002 register const Py_UNICODE *e;
8003
8004 /* Shortcut for single character strings */
8005 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8006 Py_RETURN_TRUE;
8007 }
8008
8009 e = p + PyUnicode_GET_SIZE(self);
8010 for (; p < e; p++) {
8011 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8012 Py_RETURN_FALSE;
8013 }
8014 }
8015 Py_RETURN_TRUE;
8016}
8017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008018PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008019 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020\n\
8021Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008022iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023
8024static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008025unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008027 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028}
8029
Martin v. Löwis18e16552006-02-15 17:27:45 +00008030static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031unicode_length(PyUnicodeObject *self)
8032{
8033 return self->length;
8034}
8035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008036PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008039Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008040done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041
8042static PyObject *
8043unicode_ljust(PyUnicodeObject *self, PyObject *args)
8044{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008045 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008046 Py_UNICODE fillchar = ' ';
8047
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008048 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 return NULL;
8050
Tim Peters7a29bd52001-09-12 03:03:31 +00008051 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 Py_INCREF(self);
8053 return (PyObject*) self;
8054 }
8055
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008056 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057}
8058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008059PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008062Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063
8064static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008065unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 return fixup(self, fixlower);
8068}
8069
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selector values above. */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8078
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008079/* externally visible for str.strip(unicode) */
8080PyObject *
8081_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8082{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8084 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8085 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8086 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8087 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008088
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008090
Benjamin Peterson14339b62009-01-31 16:36:08 +00008091 i = 0;
8092 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8094 i++;
8095 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008096 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008097
Benjamin Peterson14339b62009-01-31 16:36:08 +00008098 j = len;
8099 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 do {
8101 j--;
8102 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8103 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008105
Benjamin Peterson14339b62009-01-31 16:36:08 +00008106 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 Py_INCREF(self);
8108 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008109 }
8110 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008112}
8113
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114
8115static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008116do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8119 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008120
Benjamin Peterson14339b62009-01-31 16:36:08 +00008121 i = 0;
8122 if (striptype != RIGHTSTRIP) {
8123 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8124 i++;
8125 }
8126 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008127
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128 j = len;
8129 if (striptype != LEFTSTRIP) {
8130 do {
8131 j--;
8132 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8133 j++;
8134 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008135
Benjamin Peterson14339b62009-01-31 16:36:08 +00008136 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8137 Py_INCREF(self);
8138 return (PyObject*)self;
8139 }
8140 else
8141 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142}
8143
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008144
8145static PyObject *
8146do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8147{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008149
Benjamin Peterson14339b62009-01-31 16:36:08 +00008150 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8151 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008152
Benjamin Peterson14339b62009-01-31 16:36:08 +00008153 if (sep != NULL && sep != Py_None) {
8154 if (PyUnicode_Check(sep))
8155 return _PyUnicode_XStrip(self, striptype, sep);
8156 else {
8157 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 "%s arg must be None or str",
8159 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 return NULL;
8161 }
8162 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008163
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008165}
8166
8167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008168PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008170\n\
8171Return a copy of the string S with leading and trailing\n\
8172whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008173If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008174
8175static PyObject *
8176unicode_strip(PyUnicodeObject *self, PyObject *args)
8177{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 if (PyTuple_GET_SIZE(args) == 0)
8179 return do_strip(self, BOTHSTRIP); /* Common case */
8180 else
8181 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008182}
8183
8184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008185PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008187\n\
8188Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008189If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008190
8191static PyObject *
8192unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8193{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008194 if (PyTuple_GET_SIZE(args) == 0)
8195 return do_strip(self, LEFTSTRIP); /* Common case */
8196 else
8197 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008198}
8199
8200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008201PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008203\n\
8204Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008205If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008206
8207static PyObject *
8208unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8209{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 if (PyTuple_GET_SIZE(args) == 0)
8211 return do_strip(self, RIGHTSTRIP); /* Common case */
8212 else
8213 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008214}
8215
8216
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008218unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219{
8220 PyUnicodeObject *u;
8221 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008222 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008223 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224
Georg Brandl222de0f2009-04-12 12:01:50 +00008225 if (len < 1) {
8226 Py_INCREF(unicode_empty);
8227 return (PyObject *)unicode_empty;
8228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229
Tim Peters7a29bd52001-09-12 03:03:31 +00008230 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 /* no repeat, return original string */
8232 Py_INCREF(str);
8233 return (PyObject*) str;
8234 }
Tim Peters8f422462000-09-09 06:13:41 +00008235
8236 /* ensure # of chars needed doesn't overflow int and # of bytes
8237 * needed doesn't overflow size_t
8238 */
8239 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008240 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008241 PyErr_SetString(PyExc_OverflowError,
8242 "repeated string is too long");
8243 return NULL;
8244 }
8245 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8246 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8247 PyErr_SetString(PyExc_OverflowError,
8248 "repeated string is too long");
8249 return NULL;
8250 }
8251 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 if (!u)
8253 return NULL;
8254
8255 p = u->str;
8256
Georg Brandl222de0f2009-04-12 12:01:50 +00008257 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008258 Py_UNICODE_FILL(p, str->str[0], len);
8259 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008260 Py_ssize_t done = str->length; /* number of characters copied this far */
8261 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008263 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008264 Py_UNICODE_COPY(p+done, p, n);
8265 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 }
8268
8269 return (PyObject*) u;
8270}
8271
8272PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 PyObject *subobj,
8274 PyObject *replobj,
8275 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276{
8277 PyObject *self;
8278 PyObject *str1;
8279 PyObject *str2;
8280 PyObject *result;
8281
8282 self = PyUnicode_FromObject(obj);
8283 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 str1 = PyUnicode_FromObject(subobj);
8286 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 Py_DECREF(self);
8288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
8290 str2 = PyUnicode_FromObject(replobj);
8291 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 Py_DECREF(self);
8293 Py_DECREF(str1);
8294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 }
Tim Petersced69f82003-09-16 20:30:58 +00008296 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 (PyUnicodeObject *)str1,
8298 (PyUnicodeObject *)str2,
8299 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 Py_DECREF(self);
8301 Py_DECREF(str1);
8302 Py_DECREF(str2);
8303 return result;
8304}
8305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008306PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008307 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308\n\
8309Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008310old replaced by new. If the optional argument count is\n\
8311given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312
8313static PyObject*
8314unicode_replace(PyUnicodeObject *self, PyObject *args)
8315{
8316 PyUnicodeObject *str1;
8317 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008318 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 PyObject *result;
8320
Martin v. Löwis18e16552006-02-15 17:27:45 +00008321 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 return NULL;
8323 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8324 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008327 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 Py_DECREF(str1);
8329 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331
8332 result = replace(self, str1, str2, maxcount);
8333
8334 Py_DECREF(str1);
8335 Py_DECREF(str2);
8336 return result;
8337}
8338
8339static
8340PyObject *unicode_repr(PyObject *unicode)
8341{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008342 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008343 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008344 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8345 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8346
8347 /* XXX(nnorwitz): rather than over-allocating, it would be
8348 better to choose a different scheme. Perhaps scan the
8349 first N-chars of the string and allocate based on that size.
8350 */
8351 /* Initial allocation is based on the longest-possible unichr
8352 escape.
8353
8354 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8355 unichr, so in this case it's the longest unichr escape. In
8356 narrow (UTF-16) builds this is five chars per source unichr
8357 since there are two unichrs in the surrogate pair, so in narrow
8358 (UTF-16) builds it's not the longest unichr escape.
8359
8360 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8361 so in the narrow (UTF-16) build case it's the longest unichr
8362 escape.
8363 */
8364
Walter Dörwald1ab83302007-05-18 17:15:44 +00008365 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008367#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008369#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008371#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008373 if (repr == NULL)
8374 return NULL;
8375
Walter Dörwald1ab83302007-05-18 17:15:44 +00008376 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008377
8378 /* Add quote */
8379 *p++ = (findchar(s, size, '\'') &&
8380 !findchar(s, size, '"')) ? '"' : '\'';
8381 while (size-- > 0) {
8382 Py_UNICODE ch = *s++;
8383
8384 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008385 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008386 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008387 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008388 continue;
8389 }
8390
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008392 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008393 *p++ = '\\';
8394 *p++ = 't';
8395 }
8396 else if (ch == '\n') {
8397 *p++ = '\\';
8398 *p++ = 'n';
8399 }
8400 else if (ch == '\r') {
8401 *p++ = '\\';
8402 *p++ = 'r';
8403 }
8404
8405 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008406 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008407 *p++ = '\\';
8408 *p++ = 'x';
8409 *p++ = hexdigits[(ch >> 4) & 0x000F];
8410 *p++ = hexdigits[ch & 0x000F];
8411 }
8412
Georg Brandl559e5d72008-06-11 18:37:52 +00008413 /* Copy ASCII characters as-is */
8414 else if (ch < 0x7F) {
8415 *p++ = ch;
8416 }
8417
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008419 else {
8420 Py_UCS4 ucs = ch;
8421
8422#ifndef Py_UNICODE_WIDE
8423 Py_UNICODE ch2 = 0;
8424 /* Get code point from surrogate pair */
8425 if (size > 0) {
8426 ch2 = *s;
8427 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008429 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008431 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008432 size--;
8433 }
8434 }
8435#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008436 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008437 (categories Z* and C* except ASCII space)
8438 */
8439 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8440 /* Map 8-bit characters to '\xhh' */
8441 if (ucs <= 0xff) {
8442 *p++ = '\\';
8443 *p++ = 'x';
8444 *p++ = hexdigits[(ch >> 4) & 0x000F];
8445 *p++ = hexdigits[ch & 0x000F];
8446 }
8447 /* Map 21-bit characters to '\U00xxxxxx' */
8448 else if (ucs >= 0x10000) {
8449 *p++ = '\\';
8450 *p++ = 'U';
8451 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8452 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8453 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8454 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8455 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8456 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8457 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8458 *p++ = hexdigits[ucs & 0x0000000F];
8459 }
8460 /* Map 16-bit characters to '\uxxxx' */
8461 else {
8462 *p++ = '\\';
8463 *p++ = 'u';
8464 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8465 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8466 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8467 *p++ = hexdigits[ucs & 0x000F];
8468 }
8469 }
8470 /* Copy characters as-is */
8471 else {
8472 *p++ = ch;
8473#ifndef Py_UNICODE_WIDE
8474 if (ucs >= 0x10000)
8475 *p++ = ch2;
8476#endif
8477 }
8478 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008479 }
8480 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008481 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008482
8483 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008484 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008485 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486}
8487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008488PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490\n\
8491Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008492such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493arguments start and end are interpreted as in slice notation.\n\
8494\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008495Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496
8497static PyObject *
8498unicode_rfind(PyUnicodeObject *self, PyObject *args)
8499{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008500 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008501 Py_ssize_t start;
8502 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008503 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504
Christian Heimes9cd17752007-11-18 19:35:23 +00008505 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507
Thomas Wouters477c8d52006-05-27 19:21:47 +00008508 result = stringlib_rfind_slice(
8509 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8510 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8511 start, end
8512 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513
8514 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008515
Christian Heimes217cfd12007-12-02 14:31:20 +00008516 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517}
8518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008519PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008522Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523
8524static PyObject *
8525unicode_rindex(PyUnicodeObject *self, PyObject *args)
8526{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008527 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008528 Py_ssize_t start;
8529 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008530 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531
Christian Heimes9cd17752007-11-18 19:35:23 +00008532 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534
Thomas Wouters477c8d52006-05-27 19:21:47 +00008535 result = stringlib_rfind_slice(
8536 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8537 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8538 start, end
8539 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540
8541 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008542
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 if (result < 0) {
8544 PyErr_SetString(PyExc_ValueError, "substring not found");
8545 return NULL;
8546 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008547 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548}
8549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008550PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008553Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008554done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555
8556static PyObject *
8557unicode_rjust(PyUnicodeObject *self, PyObject *args)
8558{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008559 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008560 Py_UNICODE fillchar = ' ';
8561
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008562 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 return NULL;
8564
Tim Peters7a29bd52001-09-12 03:03:31 +00008565 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 Py_INCREF(self);
8567 return (PyObject*) self;
8568 }
8569
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008570 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571}
8572
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 PyObject *sep,
8575 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576{
8577 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008578
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 s = PyUnicode_FromObject(s);
8580 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008581 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 if (sep != NULL) {
8583 sep = PyUnicode_FromObject(sep);
8584 if (sep == NULL) {
8585 Py_DECREF(s);
8586 return NULL;
8587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 }
8589
8590 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8591
8592 Py_DECREF(s);
8593 Py_XDECREF(sep);
8594 return result;
8595}
8596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008597PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599\n\
8600Return a list of the words in S, using sep as the\n\
8601delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008602splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008603whitespace string is a separator and empty strings are\n\
8604removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605
8606static PyObject*
8607unicode_split(PyUnicodeObject *self, PyObject *args)
8608{
8609 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008610 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
Martin v. Löwis18e16552006-02-15 17:27:45 +00008612 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 return NULL;
8614
8615 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621}
8622
Thomas Wouters477c8d52006-05-27 19:21:47 +00008623PyObject *
8624PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8625{
8626 PyObject* str_obj;
8627 PyObject* sep_obj;
8628 PyObject* out;
8629
8630 str_obj = PyUnicode_FromObject(str_in);
8631 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008633 sep_obj = PyUnicode_FromObject(sep_in);
8634 if (!sep_obj) {
8635 Py_DECREF(str_obj);
8636 return NULL;
8637 }
8638
8639 out = stringlib_partition(
8640 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8641 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8642 );
8643
8644 Py_DECREF(sep_obj);
8645 Py_DECREF(str_obj);
8646
8647 return out;
8648}
8649
8650
8651PyObject *
8652PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8653{
8654 PyObject* str_obj;
8655 PyObject* sep_obj;
8656 PyObject* out;
8657
8658 str_obj = PyUnicode_FromObject(str_in);
8659 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008661 sep_obj = PyUnicode_FromObject(sep_in);
8662 if (!sep_obj) {
8663 Py_DECREF(str_obj);
8664 return NULL;
8665 }
8666
8667 out = stringlib_rpartition(
8668 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8669 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8670 );
8671
8672 Py_DECREF(sep_obj);
8673 Py_DECREF(str_obj);
8674
8675 return out;
8676}
8677
8678PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008680\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008681Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008682the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008683found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008684
8685static PyObject*
8686unicode_partition(PyUnicodeObject *self, PyObject *separator)
8687{
8688 return PyUnicode_Partition((PyObject *)self, separator);
8689}
8690
8691PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008692 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008693\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008694Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008695the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008696separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008697
8698static PyObject*
8699unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8700{
8701 return PyUnicode_RPartition((PyObject *)self, separator);
8702}
8703
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008704PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 PyObject *sep,
8706 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008707{
8708 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008709
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008710 s = PyUnicode_FromObject(s);
8711 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008712 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 if (sep != NULL) {
8714 sep = PyUnicode_FromObject(sep);
8715 if (sep == NULL) {
8716 Py_DECREF(s);
8717 return NULL;
8718 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008719 }
8720
8721 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8722
8723 Py_DECREF(s);
8724 Py_XDECREF(sep);
8725 return result;
8726}
8727
8728PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008730\n\
8731Return a list of the words in S, using sep as the\n\
8732delimiter string, starting at the end of the string and\n\
8733working to the front. If maxsplit is given, at most maxsplit\n\
8734splits are done. If sep is not specified, any whitespace string\n\
8735is a separator.");
8736
8737static PyObject*
8738unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8739{
8740 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008741 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008742
Martin v. Löwis18e16552006-02-15 17:27:45 +00008743 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008744 return NULL;
8745
8746 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008748 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008750 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008752}
8753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008754PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756\n\
8757Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008758Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008759is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760
8761static PyObject*
8762unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8763{
Guido van Rossum86662912000-04-11 15:38:46 +00008764 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765
Guido van Rossum86662912000-04-11 15:38:46 +00008766 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 return NULL;
8768
Guido van Rossum86662912000-04-11 15:38:46 +00008769 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770}
8771
8772static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008773PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774{
Walter Dörwald346737f2007-05-31 10:44:43 +00008775 if (PyUnicode_CheckExact(self)) {
8776 Py_INCREF(self);
8777 return self;
8778 } else
8779 /* Subtype -- return genuine unicode string with the same value. */
8780 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8781 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782}
8783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008784PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786\n\
8787Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008788and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789
8790static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008791unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 return fixup(self, fixswapcase);
8794}
8795
Georg Brandlceee0772007-11-27 23:48:05 +00008796PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008798\n\
8799Return a translation table usable for str.translate().\n\
8800If there is only one argument, it must be a dictionary mapping Unicode\n\
8801ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008802Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008803If there are two arguments, they must be strings of equal length, and\n\
8804in the resulting dictionary, each character in x will be mapped to the\n\
8805character at the same position in y. If there is a third argument, it\n\
8806must be a string, whose characters will be mapped to None in the result.");
8807
8808static PyObject*
8809unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8810{
8811 PyObject *x, *y = NULL, *z = NULL;
8812 PyObject *new = NULL, *key, *value;
8813 Py_ssize_t i = 0;
8814 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008815
Georg Brandlceee0772007-11-27 23:48:05 +00008816 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8817 return NULL;
8818 new = PyDict_New();
8819 if (!new)
8820 return NULL;
8821 if (y != NULL) {
8822 /* x must be a string too, of equal length */
8823 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8824 if (!PyUnicode_Check(x)) {
8825 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8826 "be a string if there is a second argument");
8827 goto err;
8828 }
8829 if (PyUnicode_GET_SIZE(x) != ylen) {
8830 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8831 "arguments must have equal length");
8832 goto err;
8833 }
8834 /* create entries for translating chars in x to those in y */
8835 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008836 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8837 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008838 if (!key || !value)
8839 goto err;
8840 res = PyDict_SetItem(new, key, value);
8841 Py_DECREF(key);
8842 Py_DECREF(value);
8843 if (res < 0)
8844 goto err;
8845 }
8846 /* create entries for deleting chars in z */
8847 if (z != NULL) {
8848 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008849 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008850 if (!key)
8851 goto err;
8852 res = PyDict_SetItem(new, key, Py_None);
8853 Py_DECREF(key);
8854 if (res < 0)
8855 goto err;
8856 }
8857 }
8858 } else {
8859 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008860 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008861 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8862 "to maketrans it must be a dict");
8863 goto err;
8864 }
8865 /* copy entries into the new dict, converting string keys to int keys */
8866 while (PyDict_Next(x, &i, &key, &value)) {
8867 if (PyUnicode_Check(key)) {
8868 /* convert string keys to integer keys */
8869 PyObject *newkey;
8870 if (PyUnicode_GET_SIZE(key) != 1) {
8871 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8872 "table must be of length 1");
8873 goto err;
8874 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008875 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008876 if (!newkey)
8877 goto err;
8878 res = PyDict_SetItem(new, newkey, value);
8879 Py_DECREF(newkey);
8880 if (res < 0)
8881 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008882 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008883 /* just keep integer keys */
8884 if (PyDict_SetItem(new, key, value) < 0)
8885 goto err;
8886 } else {
8887 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8888 "be strings or integers");
8889 goto err;
8890 }
8891 }
8892 }
8893 return new;
8894 err:
8895 Py_DECREF(new);
8896 return NULL;
8897}
8898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008899PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901\n\
8902Return a copy of the string S, where all characters have been mapped\n\
8903through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008904Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008905Unmapped characters are left untouched. Characters mapped to None\n\
8906are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907
8908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008909unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910{
Georg Brandlceee0772007-11-27 23:48:05 +00008911 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912}
8913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008914PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008917Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918
8919static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008920unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922 return fixup(self, fixupper);
8923}
8924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008925PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008928Pad a numeric string S with zeros on the left, to fill a field\n\
8929of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930
8931static PyObject *
8932unicode_zfill(PyUnicodeObject *self, PyObject *args)
8933{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008934 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 PyUnicodeObject *u;
8936
Martin v. Löwis18e16552006-02-15 17:27:45 +00008937 Py_ssize_t width;
8938 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 return NULL;
8940
8941 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008942 if (PyUnicode_CheckExact(self)) {
8943 Py_INCREF(self);
8944 return (PyObject*) self;
8945 }
8946 else
8947 return PyUnicode_FromUnicode(
8948 PyUnicode_AS_UNICODE(self),
8949 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 }
8952
8953 fill = width - self->length;
8954
8955 u = pad(self, fill, 0, '0');
8956
Walter Dörwald068325e2002-04-15 13:36:47 +00008957 if (u == NULL)
8958 return NULL;
8959
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960 if (u->str[fill] == '+' || u->str[fill] == '-') {
8961 /* move sign to beginning of string */
8962 u->str[0] = u->str[fill];
8963 u->str[fill] = '0';
8964 }
8965
8966 return (PyObject*) u;
8967}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968
8969#if 0
8970static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008971unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972{
Christian Heimes2202f872008-02-06 14:31:34 +00008973 return PyLong_FromLong(numfree);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974}
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008975
8976static PyObject *
8977unicode__decimal2ascii(PyObject *self)
8978{
8979 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
8980 PyUnicode_GET_SIZE(self));
8981}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982#endif
8983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008984PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008987Return True if S starts with the specified prefix, False otherwise.\n\
8988With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008989With optional end, stop comparing S at that position.\n\
8990prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991
8992static PyObject *
8993unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008996 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008998 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008999 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009000 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009002 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9004 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009005 if (PyTuple_Check(subobj)) {
9006 Py_ssize_t i;
9007 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9008 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009010 if (substring == NULL)
9011 return NULL;
9012 result = tailmatch(self, substring, start, end, -1);
9013 Py_DECREF(substring);
9014 if (result) {
9015 Py_RETURN_TRUE;
9016 }
9017 }
9018 /* nothing matched */
9019 Py_RETURN_FALSE;
9020 }
9021 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009024 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009026 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027}
9028
9029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009030PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009033Return True if S ends with the specified suffix, False otherwise.\n\
9034With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009035With optional end, stop comparing S at that position.\n\
9036suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037
9038static PyObject *
9039unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009042 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009044 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009045 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009046 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009048 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9050 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009051 if (PyTuple_Check(subobj)) {
9052 Py_ssize_t i;
9053 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9054 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009055 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009056 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009058 result = tailmatch(self, substring, start, end, +1);
9059 Py_DECREF(substring);
9060 if (result) {
9061 Py_RETURN_TRUE;
9062 }
9063 }
9064 Py_RETURN_FALSE;
9065 }
9066 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009070 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009072 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073}
9074
Eric Smith8c663262007-08-25 02:26:07 +00009075#include "stringlib/string_format.h"
9076
9077PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009079\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009080Return a formatted version of S, using substitutions from args and kwargs.\n\
9081The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009082
Eric Smith27bbca62010-11-04 17:06:58 +00009083PyDoc_STRVAR(format_map__doc__,
9084 "S.format_map(mapping) -> str\n\
9085\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009086Return a formatted version of S, using substitutions from mapping.\n\
9087The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009088
Eric Smith4a7d76d2008-05-30 18:10:19 +00009089static PyObject *
9090unicode__format__(PyObject* self, PyObject* args)
9091{
9092 PyObject *format_spec;
9093
9094 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9095 return NULL;
9096
9097 return _PyUnicode_FormatAdvanced(self,
9098 PyUnicode_AS_UNICODE(format_spec),
9099 PyUnicode_GET_SIZE(format_spec));
9100}
9101
Eric Smith8c663262007-08-25 02:26:07 +00009102PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009104\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009105Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009106
9107static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009108unicode__sizeof__(PyUnicodeObject *v)
9109{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009110 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9111 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009112}
9113
9114PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009116
9117static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009118unicode_getnewargs(PyUnicodeObject *v)
9119{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009120 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009121}
9122
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123static PyMethodDef unicode_methods[] = {
9124
9125 /* Order is according to common usage: often used methods should
9126 appear first, since lookup is done sequentially. */
9127
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009128 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009129 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9130 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009131 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009132 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9133 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9134 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9135 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9136 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9137 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9138 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009139 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009140 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9141 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9142 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009143 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009144 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9145 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9146 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009147 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009148 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009149 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009150 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009151 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9152 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9153 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9154 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9155 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9156 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9157 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9158 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9159 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9160 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9161 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9162 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9163 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9164 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009165 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009166 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009167 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009168 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009169 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009170 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009171 {"maketrans", (PyCFunction) unicode_maketrans,
9172 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009173 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009174#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009175 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176#endif
9177
9178#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009179 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009180 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009181 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182#endif
9183
Benjamin Peterson14339b62009-01-31 16:36:08 +00009184 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 {NULL, NULL}
9186};
9187
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009188static PyObject *
9189unicode_mod(PyObject *v, PyObject *w)
9190{
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 if (!PyUnicode_Check(v)) {
9192 Py_INCREF(Py_NotImplemented);
9193 return Py_NotImplemented;
9194 }
9195 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009196}
9197
9198static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009199 0, /*nb_add*/
9200 0, /*nb_subtract*/
9201 0, /*nb_multiply*/
9202 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009203};
9204
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009206 (lenfunc) unicode_length, /* sq_length */
9207 PyUnicode_Concat, /* sq_concat */
9208 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9209 (ssizeargfunc) unicode_getitem, /* sq_item */
9210 0, /* sq_slice */
9211 0, /* sq_ass_item */
9212 0, /* sq_ass_slice */
9213 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214};
9215
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009216static PyObject*
9217unicode_subscript(PyUnicodeObject* self, PyObject* item)
9218{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009219 if (PyIndex_Check(item)) {
9220 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009221 if (i == -1 && PyErr_Occurred())
9222 return NULL;
9223 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009224 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009225 return unicode_getitem(self, i);
9226 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009227 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009228 Py_UNICODE* source_buf;
9229 Py_UNICODE* result_buf;
9230 PyObject* result;
9231
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009232 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009234 return NULL;
9235 }
9236
9237 if (slicelength <= 0) {
9238 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009239 } else if (start == 0 && step == 1 && slicelength == self->length &&
9240 PyUnicode_CheckExact(self)) {
9241 Py_INCREF(self);
9242 return (PyObject *)self;
9243 } else if (step == 1) {
9244 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009245 } else {
9246 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009247 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9248 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009249
Benjamin Peterson29060642009-01-31 22:14:21 +00009250 if (result_buf == NULL)
9251 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009252
9253 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9254 result_buf[i] = source_buf[cur];
9255 }
Tim Petersced69f82003-09-16 20:30:58 +00009256
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009257 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009258 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009259 return result;
9260 }
9261 } else {
9262 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9263 return NULL;
9264 }
9265}
9266
9267static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009268 (lenfunc)unicode_length, /* mp_length */
9269 (binaryfunc)unicode_subscript, /* mp_subscript */
9270 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009271};
9272
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274/* Helpers for PyUnicode_Format() */
9275
9276static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009277getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009279 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009281 (*p_argidx)++;
9282 if (arglen < 0)
9283 return args;
9284 else
9285 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286 }
9287 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 return NULL;
9290}
9291
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009292/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009294static PyObject *
9295formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009297 char *p;
9298 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009300
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301 x = PyFloat_AsDouble(v);
9302 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009303 return NULL;
9304
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009306 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009307
Eric Smith0923d1d2009-04-16 20:16:10 +00009308 p = PyOS_double_to_string(x, type, prec,
9309 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009310 if (p == NULL)
9311 return NULL;
9312 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009313 PyMem_Free(p);
9314 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315}
9316
Tim Peters38fd5b62000-09-21 05:43:11 +00009317static PyObject*
9318formatlong(PyObject *val, int flags, int prec, int type)
9319{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009320 char *buf;
9321 int len;
9322 PyObject *str; /* temporary string object. */
9323 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009324
Benjamin Peterson14339b62009-01-31 16:36:08 +00009325 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9326 if (!str)
9327 return NULL;
9328 result = PyUnicode_FromStringAndSize(buf, len);
9329 Py_DECREF(str);
9330 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009331}
9332
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333static int
9334formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009335 size_t buflen,
9336 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009338 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009339 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009340 if (PyUnicode_GET_SIZE(v) == 1) {
9341 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9342 buf[1] = '\0';
9343 return 1;
9344 }
9345#ifndef Py_UNICODE_WIDE
9346 if (PyUnicode_GET_SIZE(v) == 2) {
9347 /* Decode a valid surrogate pair */
9348 int c0 = PyUnicode_AS_UNICODE(v)[0];
9349 int c1 = PyUnicode_AS_UNICODE(v)[1];
9350 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9351 0xDC00 <= c1 && c1 <= 0xDFFF) {
9352 buf[0] = c0;
9353 buf[1] = c1;
9354 buf[2] = '\0';
9355 return 2;
9356 }
9357 }
9358#endif
9359 goto onError;
9360 }
9361 else {
9362 /* Integer input truncated to a character */
9363 long x;
9364 x = PyLong_AsLong(v);
9365 if (x == -1 && PyErr_Occurred())
9366 goto onError;
9367
9368 if (x < 0 || x > 0x10ffff) {
9369 PyErr_SetString(PyExc_OverflowError,
9370 "%c arg not in range(0x110000)");
9371 return -1;
9372 }
9373
9374#ifndef Py_UNICODE_WIDE
9375 if (x > 0xffff) {
9376 x -= 0x10000;
9377 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9378 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9379 return 2;
9380 }
9381#endif
9382 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009383 buf[1] = '\0';
9384 return 1;
9385 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009386
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009388 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009390 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391}
9392
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009393/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009394 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009395*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009396#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009397
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400{
9401 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009402 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403 int args_owned = 0;
9404 PyUnicodeObject *result = NULL;
9405 PyObject *dict = NULL;
9406 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009407
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009409 PyErr_BadInternalCall();
9410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411 }
9412 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009413 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415 fmt = PyUnicode_AS_UNICODE(uformat);
9416 fmtcnt = PyUnicode_GET_SIZE(uformat);
9417
9418 reslen = rescnt = fmtcnt + 100;
9419 result = _PyUnicode_New(reslen);
9420 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422 res = PyUnicode_AS_UNICODE(result);
9423
9424 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009425 arglen = PyTuple_Size(args);
9426 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427 }
9428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 arglen = -1;
9430 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009432 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009433 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009434 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435
9436 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 if (*fmt != '%') {
9438 if (--rescnt < 0) {
9439 rescnt = fmtcnt + 100;
9440 reslen += rescnt;
9441 if (_PyUnicode_Resize(&result, reslen) < 0)
9442 goto onError;
9443 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9444 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009445 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009446 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009447 }
9448 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009449 /* Got a format specifier */
9450 int flags = 0;
9451 Py_ssize_t width = -1;
9452 int prec = -1;
9453 Py_UNICODE c = '\0';
9454 Py_UNICODE fill;
9455 int isnumok;
9456 PyObject *v = NULL;
9457 PyObject *temp = NULL;
9458 Py_UNICODE *pbuf;
9459 Py_UNICODE sign;
9460 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009461 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462
Benjamin Peterson29060642009-01-31 22:14:21 +00009463 fmt++;
9464 if (*fmt == '(') {
9465 Py_UNICODE *keystart;
9466 Py_ssize_t keylen;
9467 PyObject *key;
9468 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009469
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 if (dict == NULL) {
9471 PyErr_SetString(PyExc_TypeError,
9472 "format requires a mapping");
9473 goto onError;
9474 }
9475 ++fmt;
9476 --fmtcnt;
9477 keystart = fmt;
9478 /* Skip over balanced parentheses */
9479 while (pcount > 0 && --fmtcnt >= 0) {
9480 if (*fmt == ')')
9481 --pcount;
9482 else if (*fmt == '(')
9483 ++pcount;
9484 fmt++;
9485 }
9486 keylen = fmt - keystart - 1;
9487 if (fmtcnt < 0 || pcount > 0) {
9488 PyErr_SetString(PyExc_ValueError,
9489 "incomplete format key");
9490 goto onError;
9491 }
9492#if 0
9493 /* keys are converted to strings using UTF-8 and
9494 then looked up since Python uses strings to hold
9495 variables names etc. in its namespaces and we
9496 wouldn't want to break common idioms. */
9497 key = PyUnicode_EncodeUTF8(keystart,
9498 keylen,
9499 NULL);
9500#else
9501 key = PyUnicode_FromUnicode(keystart, keylen);
9502#endif
9503 if (key == NULL)
9504 goto onError;
9505 if (args_owned) {
9506 Py_DECREF(args);
9507 args_owned = 0;
9508 }
9509 args = PyObject_GetItem(dict, key);
9510 Py_DECREF(key);
9511 if (args == NULL) {
9512 goto onError;
9513 }
9514 args_owned = 1;
9515 arglen = -1;
9516 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009517 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 while (--fmtcnt >= 0) {
9519 switch (c = *fmt++) {
9520 case '-': flags |= F_LJUST; continue;
9521 case '+': flags |= F_SIGN; continue;
9522 case ' ': flags |= F_BLANK; continue;
9523 case '#': flags |= F_ALT; continue;
9524 case '0': flags |= F_ZERO; continue;
9525 }
9526 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 if (c == '*') {
9529 v = getnextarg(args, arglen, &argidx);
9530 if (v == NULL)
9531 goto onError;
9532 if (!PyLong_Check(v)) {
9533 PyErr_SetString(PyExc_TypeError,
9534 "* wants int");
9535 goto onError;
9536 }
9537 width = PyLong_AsLong(v);
9538 if (width == -1 && PyErr_Occurred())
9539 goto onError;
9540 if (width < 0) {
9541 flags |= F_LJUST;
9542 width = -width;
9543 }
9544 if (--fmtcnt >= 0)
9545 c = *fmt++;
9546 }
9547 else if (c >= '0' && c <= '9') {
9548 width = c - '0';
9549 while (--fmtcnt >= 0) {
9550 c = *fmt++;
9551 if (c < '0' || c > '9')
9552 break;
9553 if ((width*10) / 10 != width) {
9554 PyErr_SetString(PyExc_ValueError,
9555 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009556 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009557 }
9558 width = width*10 + (c - '0');
9559 }
9560 }
9561 if (c == '.') {
9562 prec = 0;
9563 if (--fmtcnt >= 0)
9564 c = *fmt++;
9565 if (c == '*') {
9566 v = getnextarg(args, arglen, &argidx);
9567 if (v == NULL)
9568 goto onError;
9569 if (!PyLong_Check(v)) {
9570 PyErr_SetString(PyExc_TypeError,
9571 "* wants int");
9572 goto onError;
9573 }
9574 prec = PyLong_AsLong(v);
9575 if (prec == -1 && PyErr_Occurred())
9576 goto onError;
9577 if (prec < 0)
9578 prec = 0;
9579 if (--fmtcnt >= 0)
9580 c = *fmt++;
9581 }
9582 else if (c >= '0' && c <= '9') {
9583 prec = c - '0';
9584 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009585 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 if (c < '0' || c > '9')
9587 break;
9588 if ((prec*10) / 10 != prec) {
9589 PyErr_SetString(PyExc_ValueError,
9590 "prec too big");
9591 goto onError;
9592 }
9593 prec = prec*10 + (c - '0');
9594 }
9595 }
9596 } /* prec */
9597 if (fmtcnt >= 0) {
9598 if (c == 'h' || c == 'l' || c == 'L') {
9599 if (--fmtcnt >= 0)
9600 c = *fmt++;
9601 }
9602 }
9603 if (fmtcnt < 0) {
9604 PyErr_SetString(PyExc_ValueError,
9605 "incomplete format");
9606 goto onError;
9607 }
9608 if (c != '%') {
9609 v = getnextarg(args, arglen, &argidx);
9610 if (v == NULL)
9611 goto onError;
9612 }
9613 sign = 0;
9614 fill = ' ';
9615 switch (c) {
9616
9617 case '%':
9618 pbuf = formatbuf;
9619 /* presume that buffer length is at least 1 */
9620 pbuf[0] = '%';
9621 len = 1;
9622 break;
9623
9624 case 's':
9625 case 'r':
9626 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009627 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 temp = v;
9629 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009630 }
9631 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 if (c == 's')
9633 temp = PyObject_Str(v);
9634 else if (c == 'r')
9635 temp = PyObject_Repr(v);
9636 else
9637 temp = PyObject_ASCII(v);
9638 if (temp == NULL)
9639 goto onError;
9640 if (PyUnicode_Check(temp))
9641 /* nothing to do */;
9642 else {
9643 Py_DECREF(temp);
9644 PyErr_SetString(PyExc_TypeError,
9645 "%s argument has non-string str()");
9646 goto onError;
9647 }
9648 }
9649 pbuf = PyUnicode_AS_UNICODE(temp);
9650 len = PyUnicode_GET_SIZE(temp);
9651 if (prec >= 0 && len > prec)
9652 len = prec;
9653 break;
9654
9655 case 'i':
9656 case 'd':
9657 case 'u':
9658 case 'o':
9659 case 'x':
9660 case 'X':
9661 if (c == 'i')
9662 c = 'd';
9663 isnumok = 0;
9664 if (PyNumber_Check(v)) {
9665 PyObject *iobj=NULL;
9666
9667 if (PyLong_Check(v)) {
9668 iobj = v;
9669 Py_INCREF(iobj);
9670 }
9671 else {
9672 iobj = PyNumber_Long(v);
9673 }
9674 if (iobj!=NULL) {
9675 if (PyLong_Check(iobj)) {
9676 isnumok = 1;
9677 temp = formatlong(iobj, flags, prec, c);
9678 Py_DECREF(iobj);
9679 if (!temp)
9680 goto onError;
9681 pbuf = PyUnicode_AS_UNICODE(temp);
9682 len = PyUnicode_GET_SIZE(temp);
9683 sign = 1;
9684 }
9685 else {
9686 Py_DECREF(iobj);
9687 }
9688 }
9689 }
9690 if (!isnumok) {
9691 PyErr_Format(PyExc_TypeError,
9692 "%%%c format: a number is required, "
9693 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9694 goto onError;
9695 }
9696 if (flags & F_ZERO)
9697 fill = '0';
9698 break;
9699
9700 case 'e':
9701 case 'E':
9702 case 'f':
9703 case 'F':
9704 case 'g':
9705 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009706 temp = formatfloat(v, flags, prec, c);
9707 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009708 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009709 pbuf = PyUnicode_AS_UNICODE(temp);
9710 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 sign = 1;
9712 if (flags & F_ZERO)
9713 fill = '0';
9714 break;
9715
9716 case 'c':
9717 pbuf = formatbuf;
9718 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9719 if (len < 0)
9720 goto onError;
9721 break;
9722
9723 default:
9724 PyErr_Format(PyExc_ValueError,
9725 "unsupported format character '%c' (0x%x) "
9726 "at index %zd",
9727 (31<=c && c<=126) ? (char)c : '?',
9728 (int)c,
9729 (Py_ssize_t)(fmt - 1 -
9730 PyUnicode_AS_UNICODE(uformat)));
9731 goto onError;
9732 }
9733 if (sign) {
9734 if (*pbuf == '-' || *pbuf == '+') {
9735 sign = *pbuf++;
9736 len--;
9737 }
9738 else if (flags & F_SIGN)
9739 sign = '+';
9740 else if (flags & F_BLANK)
9741 sign = ' ';
9742 else
9743 sign = 0;
9744 }
9745 if (width < len)
9746 width = len;
9747 if (rescnt - (sign != 0) < width) {
9748 reslen -= rescnt;
9749 rescnt = width + fmtcnt + 100;
9750 reslen += rescnt;
9751 if (reslen < 0) {
9752 Py_XDECREF(temp);
9753 PyErr_NoMemory();
9754 goto onError;
9755 }
9756 if (_PyUnicode_Resize(&result, reslen) < 0) {
9757 Py_XDECREF(temp);
9758 goto onError;
9759 }
9760 res = PyUnicode_AS_UNICODE(result)
9761 + reslen - rescnt;
9762 }
9763 if (sign) {
9764 if (fill != ' ')
9765 *res++ = sign;
9766 rescnt--;
9767 if (width > len)
9768 width--;
9769 }
9770 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9771 assert(pbuf[0] == '0');
9772 assert(pbuf[1] == c);
9773 if (fill != ' ') {
9774 *res++ = *pbuf++;
9775 *res++ = *pbuf++;
9776 }
9777 rescnt -= 2;
9778 width -= 2;
9779 if (width < 0)
9780 width = 0;
9781 len -= 2;
9782 }
9783 if (width > len && !(flags & F_LJUST)) {
9784 do {
9785 --rescnt;
9786 *res++ = fill;
9787 } while (--width > len);
9788 }
9789 if (fill == ' ') {
9790 if (sign)
9791 *res++ = sign;
9792 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9793 assert(pbuf[0] == '0');
9794 assert(pbuf[1] == c);
9795 *res++ = *pbuf++;
9796 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009797 }
9798 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009799 Py_UNICODE_COPY(res, pbuf, len);
9800 res += len;
9801 rescnt -= len;
9802 while (--width >= len) {
9803 --rescnt;
9804 *res++ = ' ';
9805 }
9806 if (dict && (argidx < arglen) && c != '%') {
9807 PyErr_SetString(PyExc_TypeError,
9808 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009809 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009810 goto onError;
9811 }
9812 Py_XDECREF(temp);
9813 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814 } /* until end */
9815 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009816 PyErr_SetString(PyExc_TypeError,
9817 "not all arguments converted during string formatting");
9818 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819 }
9820
Thomas Woutersa96affe2006-03-12 00:29:36 +00009821 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009822 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009824 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009825 }
9826 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827 return (PyObject *)result;
9828
Benjamin Peterson29060642009-01-31 22:14:21 +00009829 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830 Py_XDECREF(result);
9831 Py_DECREF(uformat);
9832 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834 }
9835 return NULL;
9836}
9837
Jeremy Hylton938ace62002-07-17 16:30:39 +00009838static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009839unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9840
Tim Peters6d6c1a32001-08-02 04:15:00 +00009841static PyObject *
9842unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9843{
Benjamin Peterson29060642009-01-31 22:14:21 +00009844 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009845 static char *kwlist[] = {"object", "encoding", "errors", 0};
9846 char *encoding = NULL;
9847 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009848
Benjamin Peterson14339b62009-01-31 16:36:08 +00009849 if (type != &PyUnicode_Type)
9850 return unicode_subtype_new(type, args, kwds);
9851 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009852 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 return NULL;
9854 if (x == NULL)
9855 return (PyObject *)_PyUnicode_New(0);
9856 if (encoding == NULL && errors == NULL)
9857 return PyObject_Str(x);
9858 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009859 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009860}
9861
Guido van Rossume023fe02001-08-30 03:12:59 +00009862static PyObject *
9863unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9864{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009865 PyUnicodeObject *tmp, *pnew;
9866 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009867
Benjamin Peterson14339b62009-01-31 16:36:08 +00009868 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9869 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9870 if (tmp == NULL)
9871 return NULL;
9872 assert(PyUnicode_Check(tmp));
9873 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9874 if (pnew == NULL) {
9875 Py_DECREF(tmp);
9876 return NULL;
9877 }
9878 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9879 if (pnew->str == NULL) {
9880 _Py_ForgetReference((PyObject *)pnew);
9881 PyObject_Del(pnew);
9882 Py_DECREF(tmp);
9883 return PyErr_NoMemory();
9884 }
9885 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9886 pnew->length = n;
9887 pnew->hash = tmp->hash;
9888 Py_DECREF(tmp);
9889 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009890}
9891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009892PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009893 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009894\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009895Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009896encoding defaults to the current default string encoding.\n\
9897errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009898
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009899static PyObject *unicode_iter(PyObject *seq);
9900
/* Type object for "str".  The initializer is positional: each entry fills
   the type slot named in the comment next to it, wiring in the
   implementations defined in this file. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009902 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009903 "str", /* tp_name */
9904 sizeof(PyUnicodeObject), /* tp_basicsize */
9905 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009907 (destructor)unicode_dealloc, /* tp_dealloc */
9908 0, /* tp_print */
9909 0, /* tp_getattr */
9910 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009911 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009912 unicode_repr, /* tp_repr */
9913 &unicode_as_number, /* tp_as_number */
9914 &unicode_as_sequence, /* tp_as_sequence */
9915 &unicode_as_mapping, /* tp_as_mapping */
9916 (hashfunc) unicode_hash, /* tp_hash*/
9917 0, /* tp_call*/
9918 (reprfunc) unicode_str, /* tp_str */
9919 PyObject_GenericGetAttr, /* tp_getattro */
9920 0, /* tp_setattro */
9921 0, /* tp_as_buffer */
9922 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009923 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009924 unicode_doc, /* tp_doc */
9925 0, /* tp_traverse */
9926 0, /* tp_clear */
9927 PyUnicode_RichCompare, /* tp_richcompare */
9928 0, /* tp_weaklistoffset */
9929 unicode_iter, /* tp_iter */
9930 0, /* tp_iternext */
9931 unicode_methods, /* tp_methods */
9932 0, /* tp_members */
9933 0, /* tp_getset */
9934 &PyBaseObject_Type, /* tp_base */
9935 0, /* tp_dict */
9936 0, /* tp_descr_get */
9937 0, /* tp_descr_set */
9938 0, /* tp_dictoffset */
9939 0, /* tp_init */
9940 0, /* tp_alloc */
9941 unicode_new, /* tp_new */
9942 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943};
9944
9945/* Initialize the Unicode implementation */
9946
Thomas Wouters78890102000-07-22 19:25:51 +00009947void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009949 int i;
9950
Thomas Wouters477c8d52006-05-27 19:21:47 +00009951 /* XXX - move this array to unicodectype.c ? */
9952 Py_UNICODE linebreak[] = {
9953 0x000A, /* LINE FEED */
9954 0x000D, /* CARRIAGE RETURN */
9955 0x001C, /* FILE SEPARATOR */
9956 0x001D, /* GROUP SEPARATOR */
9957 0x001E, /* RECORD SEPARATOR */
9958 0x0085, /* NEXT LINE */
9959 0x2028, /* LINE SEPARATOR */
9960 0x2029, /* PARAGRAPH SEPARATOR */
9961 };
9962
Fred Drakee4315f52000-05-09 19:53:39 +00009963 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009964 free_list = NULL;
9965 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009967 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009968 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009969
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009970 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009971 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009972 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009973 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009974
9975 /* initialize the linebreak bloom filter */
9976 bloom_linebreak = make_bloom_mask(
9977 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9978 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009979
9980 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981}
9982
9983/* Finalize the Unicode implementation */
9984
Christian Heimesa156e092008-02-16 07:38:31 +00009985int
9986PyUnicode_ClearFreeList(void)
9987{
9988 int freelist_size = numfree;
9989 PyUnicodeObject *u;
9990
9991 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009992 PyUnicodeObject *v = u;
9993 u = *(PyUnicodeObject **)u;
9994 if (v->str)
9995 PyObject_DEL(v->str);
9996 Py_XDECREF(v->defenc);
9997 PyObject_Del(v);
9998 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009999 }
10000 free_list = NULL;
10001 assert(numfree == 0);
10002 return freelist_size;
10003}
10004
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005void
Thomas Wouters78890102000-07-22 19:25:51 +000010006_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010008 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010010 Py_XDECREF(unicode_empty);
10011 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010012
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010013 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 if (unicode_latin1[i]) {
10015 Py_DECREF(unicode_latin1[i]);
10016 unicode_latin1[i] = NULL;
10017 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010018 }
Christian Heimesa156e092008-02-16 07:38:31 +000010019 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010021
Walter Dörwald16807132007-05-25 13:52:07 +000010022void
10023PyUnicode_InternInPlace(PyObject **p)
10024{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010025 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10026 PyObject *t;
10027 if (s == NULL || !PyUnicode_Check(s))
10028 Py_FatalError(
10029 "PyUnicode_InternInPlace: unicode strings only please!");
10030 /* If it's a subclass, we don't really know what putting
10031 it in the interned dict might do. */
10032 if (!PyUnicode_CheckExact(s))
10033 return;
10034 if (PyUnicode_CHECK_INTERNED(s))
10035 return;
10036 if (interned == NULL) {
10037 interned = PyDict_New();
10038 if (interned == NULL) {
10039 PyErr_Clear(); /* Don't leave an exception */
10040 return;
10041 }
10042 }
10043 /* It might be that the GetItem call fails even
10044 though the key is present in the dictionary,
10045 namely when this happens during a stack overflow. */
10046 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010047 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010048 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010049
Benjamin Peterson29060642009-01-31 22:14:21 +000010050 if (t) {
10051 Py_INCREF(t);
10052 Py_DECREF(*p);
10053 *p = t;
10054 return;
10055 }
Walter Dörwald16807132007-05-25 13:52:07 +000010056
Benjamin Peterson14339b62009-01-31 16:36:08 +000010057 PyThreadState_GET()->recursion_critical = 1;
10058 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10059 PyErr_Clear();
10060 PyThreadState_GET()->recursion_critical = 0;
10061 return;
10062 }
10063 PyThreadState_GET()->recursion_critical = 0;
10064 /* The two references in interned are not counted by refcnt.
10065 The deallocator will take care of this */
10066 Py_REFCNT(s) -= 2;
10067 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010068}
10069
10070void
10071PyUnicode_InternImmortal(PyObject **p)
10072{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010073 PyUnicode_InternInPlace(p);
10074 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10075 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10076 Py_INCREF(*p);
10077 }
Walter Dörwald16807132007-05-25 13:52:07 +000010078}
10079
10080PyObject *
10081PyUnicode_InternFromString(const char *cp)
10082{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010083 PyObject *s = PyUnicode_FromString(cp);
10084 if (s == NULL)
10085 return NULL;
10086 PyUnicode_InternInPlace(&s);
10087 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010088}
10089
10090void _Py_ReleaseInternedUnicodeStrings(void)
10091{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010092 PyObject *keys;
10093 PyUnicodeObject *s;
10094 Py_ssize_t i, n;
10095 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010096
Benjamin Peterson14339b62009-01-31 16:36:08 +000010097 if (interned == NULL || !PyDict_Check(interned))
10098 return;
10099 keys = PyDict_Keys(interned);
10100 if (keys == NULL || !PyList_Check(keys)) {
10101 PyErr_Clear();
10102 return;
10103 }
Walter Dörwald16807132007-05-25 13:52:07 +000010104
Benjamin Peterson14339b62009-01-31 16:36:08 +000010105 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10106 detector, interned unicode strings are not forcibly deallocated;
10107 rather, we give them their stolen references back, and then clear
10108 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010109
Benjamin Peterson14339b62009-01-31 16:36:08 +000010110 n = PyList_GET_SIZE(keys);
10111 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010113 for (i = 0; i < n; i++) {
10114 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10115 switch (s->state) {
10116 case SSTATE_NOT_INTERNED:
10117 /* XXX Shouldn't happen */
10118 break;
10119 case SSTATE_INTERNED_IMMORTAL:
10120 Py_REFCNT(s) += 1;
10121 immortal_size += s->length;
10122 break;
10123 case SSTATE_INTERNED_MORTAL:
10124 Py_REFCNT(s) += 2;
10125 mortal_size += s->length;
10126 break;
10127 default:
10128 Py_FatalError("Inconsistent interned string state.");
10129 }
10130 s->state = SSTATE_NOT_INTERNED;
10131 }
10132 fprintf(stderr, "total size of all interned strings: "
10133 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10134 "mortal/immortal\n", mortal_size, immortal_size);
10135 Py_DECREF(keys);
10136 PyDict_Clear(interned);
10137 Py_DECREF(interned);
10138 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010139}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010140
10141
10142/********************* Unicode Iterator **************************/
10143
/* State of an iterator over a unicode object (see unicodeiter_next). */
10144typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010145 PyObject_HEAD
10146 Py_ssize_t it_index; /* Index into it_seq of the next item to return */
10147 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010148} unicodeiterobject;
10149
10150static void
10151unicodeiter_dealloc(unicodeiterobject *it)
10152{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010153 _PyObject_GC_UNTRACK(it);
10154 Py_XDECREF(it->it_seq);
10155 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010156}
10157
10158static int
10159unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10160{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010161 Py_VISIT(it->it_seq);
10162 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010163}
10164
10165static PyObject *
10166unicodeiter_next(unicodeiterobject *it)
10167{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010168 PyUnicodeObject *seq;
10169 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010170
Benjamin Peterson14339b62009-01-31 16:36:08 +000010171 assert(it != NULL);
10172 seq = it->it_seq;
10173 if (seq == NULL)
10174 return NULL;
10175 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010176
Benjamin Peterson14339b62009-01-31 16:36:08 +000010177 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10178 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010179 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010180 if (item != NULL)
10181 ++it->it_index;
10182 return item;
10183 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010184
Benjamin Peterson14339b62009-01-31 16:36:08 +000010185 Py_DECREF(seq);
10186 it->it_seq = NULL;
10187 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010188}
10189
10190static PyObject *
10191unicodeiter_len(unicodeiterobject *it)
10192{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010193 Py_ssize_t len = 0;
10194 if (it->it_seq)
10195 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10196 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010197}
10198
10199PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10200
/* Method table of the unicode iterator type; the only entry exposes
   unicodeiter_len() as __length_hint__. */
10201static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010202 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010204 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010205};
10206
/* Type object for "str_iterator".  The initializer is positional: each
   entry fills the type slot named in the comment next to it. */
10207PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010208 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10209 "str_iterator", /* tp_name */
10210 sizeof(unicodeiterobject), /* tp_basicsize */
10211 0, /* tp_itemsize */
10212 /* methods */
10213 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10214 0, /* tp_print */
10215 0, /* tp_getattr */
10216 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010217 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010218 0, /* tp_repr */
10219 0, /* tp_as_number */
10220 0, /* tp_as_sequence */
10221 0, /* tp_as_mapping */
10222 0, /* tp_hash */
10223 0, /* tp_call */
10224 0, /* tp_str */
10225 PyObject_GenericGetAttr, /* tp_getattro */
10226 0, /* tp_setattro */
10227 0, /* tp_as_buffer */
10228 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10229 0, /* tp_doc */
10230 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10231 0, /* tp_clear */
10232 0, /* tp_richcompare */
10233 0, /* tp_weaklistoffset */
10234 PyObject_SelfIter, /* tp_iter */
10235 (iternextfunc)unicodeiter_next, /* tp_iternext */
10236 unicodeiter_methods, /* tp_methods */
10237 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010238};
10239
10240static PyObject *
10241unicode_iter(PyObject *seq)
10242{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010243 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010244
Benjamin Peterson14339b62009-01-31 16:36:08 +000010245 if (!PyUnicode_Check(seq)) {
10246 PyErr_BadInternalCall();
10247 return NULL;
10248 }
10249 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10250 if (it == NULL)
10251 return NULL;
10252 it->it_index = 0;
10253 Py_INCREF(seq);
10254 it->it_seq = (PyUnicodeObject *)seq;
10255 _PyObject_GC_TRACK(it);
10256 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010257}
10258
Martin v. Löwis5b222132007-06-10 09:51:05 +000010259size_t
10260Py_UNICODE_strlen(const Py_UNICODE *u)
10261{
10262 int res = 0;
10263 while(*u++)
10264 res++;
10265 return res;
10266}
10267
10268Py_UNICODE*
10269Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10270{
10271 Py_UNICODE *u = s1;
10272 while ((*u++ = *s2++));
10273 return s1;
10274}
10275
10276Py_UNICODE*
10277Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10278{
10279 Py_UNICODE *u = s1;
10280 while ((*u++ = *s2++))
10281 if (n-- == 0)
10282 break;
10283 return s1;
10284}
10285
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010286Py_UNICODE*
10287Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10288{
10289 Py_UNICODE *u1 = s1;
10290 u1 += Py_UNICODE_strlen(u1);
10291 Py_UNICODE_strcpy(u1, s2);
10292 return s1;
10293}
10294
Martin v. Löwis5b222132007-06-10 09:51:05 +000010295int
10296Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10297{
10298 while (*s1 && *s2 && *s1 == *s2)
10299 s1++, s2++;
10300 if (*s1 && *s2)
10301 return (*s1 < *s2) ? -1 : +1;
10302 if (*s1)
10303 return 1;
10304 if (*s2)
10305 return -1;
10306 return 0;
10307}
10308
Victor Stinneref8d95c2010-08-16 22:03:11 +000010309int
10310Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10311{
10312 register Py_UNICODE u1, u2;
10313 for (; n != 0; n--) {
10314 u1 = *s1;
10315 u2 = *s2;
10316 if (u1 != u2)
10317 return (u1 < u2) ? -1 : +1;
10318 if (u1 == '\0')
10319 return 0;
10320 s1++;
10321 s2++;
10322 }
10323 return 0;
10324}
10325
Martin v. Löwis5b222132007-06-10 09:51:05 +000010326Py_UNICODE*
10327Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10328{
10329 const Py_UNICODE *p;
10330 for (p = s; *p; p++)
10331 if (*p == c)
10332 return (Py_UNICODE*)p;
10333 return NULL;
10334}
10335
Victor Stinner331ea922010-08-10 16:37:20 +000010336Py_UNICODE*
10337Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10338{
10339 const Py_UNICODE *p;
10340 p = s + Py_UNICODE_strlen(s);
10341 while (p != s) {
10342 p--;
10343 if (*p == c)
10344 return (Py_UNICODE*)p;
10345 }
10346 return NULL;
10347}
10348
Victor Stinner71133ff2010-09-01 23:43:53 +000010349Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010350PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010351{
10352 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10353 Py_UNICODE *copy;
10354 Py_ssize_t size;
10355
10356 /* Ensure we won't overflow the size. */
10357 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10358 PyErr_NoMemory();
10359 return NULL;
10360 }
10361 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10362 size *= sizeof(Py_UNICODE);
10363 copy = PyMem_Malloc(size);
10364 if (copy == NULL) {
10365 PyErr_NoMemory();
10366 return NULL;
10367 }
10368 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10369 return copy;
10370}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010371
Georg Brandl66c221e2010-10-14 07:04:07 +000010372/* A _string module, to export formatter_parser and formatter_field_name_split
10373 to the string.Formatter class implemented in Python. */
10374
10375static PyMethodDef _string_methods[] = {
10376 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10377 METH_O, PyDoc_STR("split the argument as a field name")},
10378 {"formatter_parser", (PyCFunction) formatter_parser,
10379 METH_O, PyDoc_STR("parse the argument as a format string")},
10380 {NULL, NULL}
10381};
10382
10383static struct PyModuleDef _string_module = {
10384 PyModuleDef_HEAD_INIT,
10385 "_string",
10386 PyDoc_STR("string helper module"),
10387 0,
10388 _string_methods,
10389 NULL,
10390 NULL,
10391 NULL,
10392 NULL
10393};
10394
10395PyMODINIT_FUNC
10396PyInit__string(void)
10397{
10398 return PyModule_Create(&_string_module);
10399}
10400
10401
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010402#ifdef __cplusplus
10403}
10404#endif