blob: 069be7b6c63e1def01dd96c3e404ce15be889516 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
Christian Heimes190d79e2008-01-30 11:58:22 +0000114/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: CHARACTER TABULATION, LINE FEED, LINE TABULATION,
       FORM FEED and CARRIAGE RETURN (0x09 - 0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE, GROUP, RECORD and UNIT SEPARATOR (0x1C - 0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no whitespace in the upper half of ASCII */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
144
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000145static PyObject *unicode_encode_call_errorhandler(const char *errors,
146 PyObject **errorHandler,const char *encoding, const char *reason,
147 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
148 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
149
Victor Stinner31be90b2010-04-22 19:38:16 +0000150static void raise_encode_exception(PyObject **exceptionObject,
151 const char *encoding,
152 const Py_UNICODE *unicode, Py_ssize_t size,
153 Py_ssize_t startpos, Py_ssize_t endpos,
154 const char *reason);
155
Christian Heimes190d79e2008-01-30 11:58:22 +0000156/* Same for linebreaks */
/* Lookup table for the ASCII range: entry [ch] is 1 when ch is a line
   break character, 0 otherwise.  The table is only ever read (see
   BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: LINE FEED, LINE TABULATION, FORM FEED and
       CARRIAGE RETURN (0x0A - 0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: FILE, GROUP and RECORD SEPARATOR (0x1C - 0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (as many as BLOOM_WIDTH allows) as
   the bit index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
Antoine Pitrouf068f942010-01-13 14:19:12 +0000205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215#define BLOOM_MASK unsigned long
216
217static BLOOM_MASK bloom_linebreak;
218
Antoine Pitrouf068f942010-01-13 14:19:12 +0000219#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
220#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221
Benjamin Peterson29060642009-01-31 22:14:21 +0000222#define BLOOM_LINEBREAK(ch) \
223 ((ch) < 128U ? ascii_linebreak[(ch)] : \
224 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* True if chr is a member of the character set `set` (of length
   setlen): the bloom mask is checked first as a cheap pre-filter, and
   only on a hit is the set searched with unicode_member().  The whole
   expansion is parenthesized so the macro can be negated or combined
   with other operators without changing its meaning.
   NOTE: chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld. 21 characters
715 allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
816 (void)va_arg(count, int);
817 /* fall through... */
818 case '%':
819 n++;
820 break;
821 case 'd': case 'u': case 'i': case 'x':
822 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000823#ifdef HAVE_LONG_LONG
824 if (longlongflag) {
825 if (width < MAX_LONG_LONG_CHARS)
826 width = MAX_LONG_LONG_CHARS;
827 }
828 else
829#endif
830 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
831 including sign. Decimal takes the most space. This
832 isn't enough for octal. If a width is specified we
833 need more (which we allocate later). */
834 if (width < MAX_LONG_CHARS)
835 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000836 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000837 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000838 if (abuffersize < width)
839 abuffersize = width;
840 break;
841 case 's':
842 {
843 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000844 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000845 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
846 if (!str)
847 goto fail;
848 n += PyUnicode_GET_SIZE(str);
849 /* Remember the str and switch to the next slot */
850 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000851 break;
852 }
853 case 'U':
854 {
855 PyObject *obj = va_arg(count, PyObject *);
856 assert(obj && PyUnicode_Check(obj));
857 n += PyUnicode_GET_SIZE(obj);
858 break;
859 }
860 case 'V':
861 {
862 PyObject *obj = va_arg(count, PyObject *);
863 const char *str = va_arg(count, const char *);
864 assert(obj || str);
865 assert(!obj || PyUnicode_Check(obj));
866 if (obj)
867 n += PyUnicode_GET_SIZE(obj);
868 else
869 n += strlen(str);
870 break;
871 }
872 case 'S':
873 {
874 PyObject *obj = va_arg(count, PyObject *);
875 PyObject *str;
876 assert(obj);
877 str = PyObject_Str(obj);
878 if (!str)
879 goto fail;
880 n += PyUnicode_GET_SIZE(str);
881 /* Remember the str and switch to the next slot */
882 *callresult++ = str;
883 break;
884 }
885 case 'R':
886 {
887 PyObject *obj = va_arg(count, PyObject *);
888 PyObject *repr;
889 assert(obj);
890 repr = PyObject_Repr(obj);
891 if (!repr)
892 goto fail;
893 n += PyUnicode_GET_SIZE(repr);
894 /* Remember the repr and switch to the next slot */
895 *callresult++ = repr;
896 break;
897 }
898 case 'A':
899 {
900 PyObject *obj = va_arg(count, PyObject *);
901 PyObject *ascii;
902 assert(obj);
903 ascii = PyObject_ASCII(obj);
904 if (!ascii)
905 goto fail;
906 n += PyUnicode_GET_SIZE(ascii);
907 /* Remember the repr and switch to the next slot */
908 *callresult++ = ascii;
909 break;
910 }
911 case 'p':
912 (void) va_arg(count, int);
913 /* maximum 64-bit pointer representation:
914 * 0xffffffffffffffff
915 * so 19 characters is enough.
916 * XXX I count 18 -- what's the extra for?
917 */
918 n += 19;
919 break;
920 default:
921 /* if we stumble upon an unknown
922 formatting code, copy the rest of
923 the format string to the output
924 string. (we cannot just skip the
925 code, since there's no way to know
926 what's in the argument list) */
927 n += strlen(p);
928 goto expand;
929 }
930 } else
931 n++;
932 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000933 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000934 if (abuffersize > ITEM_BUFFER_LEN) {
935 /* add 1 for sprintf's trailing null byte */
936 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000937 if (!abuffer) {
938 PyErr_NoMemory();
939 goto fail;
940 }
941 realbuffer = abuffer;
942 }
943 else
944 realbuffer = buffer;
945 /* step 4: fill the buffer */
946 /* Since we've analyzed how much space we need for the worst case,
947 we don't have to resize the string.
948 There can be no errors beyond this point. */
949 string = PyUnicode_FromUnicode(NULL, n);
950 if (!string)
951 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000952
Benjamin Peterson14339b62009-01-31 16:36:08 +0000953 s = PyUnicode_AS_UNICODE(string);
954 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000955
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 for (f = format; *f; f++) {
957 if (*f == '%') {
958 const char* p = f++;
959 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000960 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000961 int size_tflag = 0;
962 zeropad = (*f == '0');
963 /* parse the width.precision part */
964 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000965 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000966 width = (width*10) + *f++ - '0';
967 precision = 0;
968 if (*f == '.') {
969 f++;
David Malcolm96960882010-11-05 17:23:41 +0000970 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 precision = (precision*10) + *f++ - '0';
972 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000973 /* Handle %ld, %lu, %lld and %llu. */
974 if (*f == 'l') {
975 if (f[1] == 'd' || f[1] == 'u') {
976 longflag = 1;
977 ++f;
978 }
979#ifdef HAVE_LONG_LONG
980 else if (f[1] == 'l' &&
981 (f[2] == 'd' || f[2] == 'u')) {
982 longlongflag = 1;
983 f += 2;
984 }
985#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000986 }
987 /* handle the size_t flag. */
988 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
989 size_tflag = 1;
990 ++f;
991 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000992
Benjamin Peterson14339b62009-01-31 16:36:08 +0000993 switch (*f) {
994 case 'c':
995 *s++ = va_arg(vargs, int);
996 break;
997 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000998 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
999 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001000 if (longflag)
1001 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001002#ifdef HAVE_LONG_LONG
1003 else if (longlongflag)
1004 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1005#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001006 else if (size_tflag)
1007 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1008 else
1009 sprintf(realbuffer, fmt, va_arg(vargs, int));
1010 appendstring(realbuffer);
1011 break;
1012 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1014 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001015 if (longflag)
1016 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001017#ifdef HAVE_LONG_LONG
1018 else if (longlongflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs,
1020 unsigned PY_LONG_LONG));
1021#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001022 else if (size_tflag)
1023 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1024 else
1025 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1026 appendstring(realbuffer);
1027 break;
1028 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001029 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001030 sprintf(realbuffer, fmt, va_arg(vargs, int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 sprintf(realbuffer, fmt, va_arg(vargs, int));
1036 appendstring(realbuffer);
1037 break;
1038 case 's':
1039 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001040 /* unused, since we already have the result */
1041 (void) va_arg(vargs, char *);
1042 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1043 PyUnicode_GET_SIZE(*callresult));
1044 s += PyUnicode_GET_SIZE(*callresult);
1045 /* We're done with the unicode()/repr() => forget it */
1046 Py_DECREF(*callresult);
1047 /* switch to next unicode()/repr() result */
1048 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001049 break;
1050 }
1051 case 'U':
1052 {
1053 PyObject *obj = va_arg(vargs, PyObject *);
1054 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1055 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1056 s += size;
1057 break;
1058 }
1059 case 'V':
1060 {
1061 PyObject *obj = va_arg(vargs, PyObject *);
1062 const char *str = va_arg(vargs, const char *);
1063 if (obj) {
1064 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1065 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1066 s += size;
1067 } else {
1068 appendstring(str);
1069 }
1070 break;
1071 }
1072 case 'S':
1073 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001074 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001075 {
1076 Py_UNICODE *ucopy;
1077 Py_ssize_t usize;
1078 Py_ssize_t upos;
1079 /* unused, since we already have the result */
1080 (void) va_arg(vargs, PyObject *);
1081 ucopy = PyUnicode_AS_UNICODE(*callresult);
1082 usize = PyUnicode_GET_SIZE(*callresult);
1083 for (upos = 0; upos<usize;)
1084 *s++ = ucopy[upos++];
1085 /* We're done with the unicode()/repr() => forget it */
1086 Py_DECREF(*callresult);
1087 /* switch to next unicode()/repr() result */
1088 ++callresult;
1089 break;
1090 }
1091 case 'p':
1092 sprintf(buffer, "%p", va_arg(vargs, void*));
1093 /* %p is ill-defined: ensure leading 0x. */
1094 if (buffer[1] == 'X')
1095 buffer[1] = 'x';
1096 else if (buffer[1] != 'x') {
1097 memmove(buffer+2, buffer, strlen(buffer)+1);
1098 buffer[0] = '0';
1099 buffer[1] = 'x';
1100 }
1101 appendstring(buffer);
1102 break;
1103 case '%':
1104 *s++ = '%';
1105 break;
1106 default:
1107 appendstring(p);
1108 goto end;
1109 }
Victor Stinner1205f272010-09-11 00:54:47 +00001110 }
Victor Stinner1205f272010-09-11 00:54:47 +00001111 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001112 *s++ = *f;
1113 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001114
Benjamin Peterson29060642009-01-31 22:14:21 +00001115 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001116 if (callresults)
1117 PyObject_Free(callresults);
1118 if (abuffer)
1119 PyObject_Free(abuffer);
1120 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1121 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001122 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001123 if (callresults) {
1124 PyObject **callresult2 = callresults;
1125 while (callresult2 < callresult) {
1126 Py_DECREF(*callresult2);
1127 ++callresult2;
1128 }
1129 PyObject_Free(callresults);
1130 }
1131 if (abuffer)
1132 PyObject_Free(abuffer);
1133 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001134}
1135
1136#undef appendstring
1137
1138PyObject *
1139PyUnicode_FromFormat(const char *format, ...)
1140{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001141 PyObject* ret;
1142 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001143
1144#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001145 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001147 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 ret = PyUnicode_FromFormatV(format, vargs);
1150 va_end(vargs);
1151 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001152}
1153
Victor Stinner5593d8a2010-10-02 11:11:27 +00001154/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1155 convert a Unicode object to a wide character string.
1156
1157 - If w is NULL: return the number of wide characters (including the nul
1158 character) required to convert the unicode object. Ignore size argument.
1159
1160 - Otherwise: return the number of wide characters (excluding the nul
1161 character) written into w. Write at most size wide characters (including
1162 the nul character). */
1163static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001164unicode_aswidechar(PyUnicodeObject *unicode,
1165 wchar_t *w,
1166 Py_ssize_t size)
1167{
1168#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001169 Py_ssize_t res;
1170 if (w != NULL) {
1171 res = PyUnicode_GET_SIZE(unicode);
1172 if (size > res)
1173 size = res + 1;
1174 else
1175 res = size;
1176 memcpy(w, unicode->str, size * sizeof(wchar_t));
1177 return res;
1178 }
1179 else
1180 return PyUnicode_GET_SIZE(unicode) + 1;
1181#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1182 register const Py_UNICODE *u;
1183 const Py_UNICODE *uend;
1184 const wchar_t *worig, *wend;
1185 Py_ssize_t nchar;
1186
Victor Stinner137c34c2010-09-29 10:25:54 +00001187 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001188 uend = u + PyUnicode_GET_SIZE(unicode);
1189 if (w != NULL) {
1190 worig = w;
1191 wend = w + size;
1192 while (u != uend && w != wend) {
1193 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1194 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1195 {
1196 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1197 u += 2;
1198 }
1199 else {
1200 *w = *u;
1201 u++;
1202 }
1203 w++;
1204 }
1205 if (w != wend)
1206 *w = L'\0';
1207 return w - worig;
1208 }
1209 else {
1210 nchar = 1; /* nul character at the end */
1211 while (u != uend) {
1212 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1213 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1214 u += 2;
1215 else
1216 u++;
1217 nchar++;
1218 }
1219 }
1220 return nchar;
1221#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1222 register Py_UNICODE *u, *uend, ordinal;
1223 register Py_ssize_t i;
1224 wchar_t *worig, *wend;
1225 Py_ssize_t nchar;
1226
1227 u = PyUnicode_AS_UNICODE(unicode);
1228 uend = u + PyUnicode_GET_SIZE(u);
1229 if (w != NULL) {
1230 worig = w;
1231 wend = w + size;
1232 while (u != uend && w != wend) {
1233 ordinal = *u;
1234 if (ordinal > 0xffff) {
1235 ordinal -= 0x10000;
1236 *w++ = 0xD800 | (ordinal >> 10);
1237 *w++ = 0xDC00 | (ordinal & 0x3FF);
1238 }
1239 else
1240 *w++ = ordinal;
1241 u++;
1242 }
1243 if (w != wend)
1244 *w = 0;
1245 return w - worig;
1246 }
1247 else {
1248 nchar = 1; /* nul character */
1249 while (u != uend) {
1250 if (*u > 0xffff)
1251 nchar += 2;
1252 else
1253 nchar++;
1254 u++;
1255 }
1256 return nchar;
1257 }
1258#else
1259# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001260#endif
1261}
1262
1263Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001264PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001265 wchar_t *w,
1266 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267{
1268 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001269 PyErr_BadInternalCall();
1270 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001272 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273}
1274
Victor Stinner137c34c2010-09-29 10:25:54 +00001275wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001276PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001277 Py_ssize_t *size)
1278{
1279 wchar_t* buffer;
1280 Py_ssize_t buflen;
1281
1282 if (unicode == NULL) {
1283 PyErr_BadInternalCall();
1284 return NULL;
1285 }
1286
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001287 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001288 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001289 PyErr_NoMemory();
1290 return NULL;
1291 }
1292
Victor Stinner137c34c2010-09-29 10:25:54 +00001293 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1294 if (buffer == NULL) {
1295 PyErr_NoMemory();
1296 return NULL;
1297 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001298 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001299 if (size != NULL)
1300 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001301 return buffer;
1302}
1303
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304#endif
1305
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001306PyObject *PyUnicode_FromOrdinal(int ordinal)
1307{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001308 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001309
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001310 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001311 PyErr_SetString(PyExc_ValueError,
1312 "chr() arg not in range(0x110000)");
1313 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001314 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001315
1316#ifndef Py_UNICODE_WIDE
1317 if (ordinal > 0xffff) {
1318 ordinal -= 0x10000;
1319 s[0] = 0xD800 | (ordinal >> 10);
1320 s[1] = 0xDC00 | (ordinal & 0x3FF);
1321 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001322 }
1323#endif
1324
Hye-Shik Chang40574832004-04-06 07:24:51 +00001325 s[0] = (Py_UNICODE)ordinal;
1326 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001327}
1328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329PyObject *PyUnicode_FromObject(register PyObject *obj)
1330{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001331 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001332 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001333 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001334 Py_INCREF(obj);
1335 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001336 }
1337 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001338 /* For a Unicode subtype that's not a Unicode object,
1339 return a true Unicode object with the same data. */
1340 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1341 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001342 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001343 PyErr_Format(PyExc_TypeError,
1344 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001345 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001346 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001347}
1348
1349PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001350 const char *encoding,
1351 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001352{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001353 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001354 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001355
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001357 PyErr_BadInternalCall();
1358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001360
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001361 /* Decoding bytes objects is the most common case and should be fast */
1362 if (PyBytes_Check(obj)) {
1363 if (PyBytes_GET_SIZE(obj) == 0) {
1364 Py_INCREF(unicode_empty);
1365 v = (PyObject *) unicode_empty;
1366 }
1367 else {
1368 v = PyUnicode_Decode(
1369 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1370 encoding, errors);
1371 }
1372 return v;
1373 }
1374
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 PyErr_SetString(PyExc_TypeError,
1377 "decoding str is not supported");
1378 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001379 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001380
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001381 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1382 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1383 PyErr_Format(PyExc_TypeError,
1384 "coercing to str: need bytes, bytearray "
1385 "or buffer-like object, %.80s found",
1386 Py_TYPE(obj)->tp_name);
1387 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001388 }
Tim Petersced69f82003-09-16 20:30:58 +00001389
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001390 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001391 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001392 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 }
Tim Petersced69f82003-09-16 20:30:58 +00001394 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001395 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001396
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001397 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001398 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399}
1400
/* Copy encoding into lower, converting upper case letters to lower case and
   '_' to '-', so that spellings such as UTF_8 are recognised.

   lower has room for lower_len chars including the terminating nul.
   Return 1 on success; return 0 if encoding does not fit (it is longer than
   lower_len-1), in which case lower is left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the output, reserved for the nul character */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1433
1434PyObject *PyUnicode_Decode(const char *s,
1435 Py_ssize_t size,
1436 const char *encoding,
1437 const char *errors)
1438{
1439 PyObject *buffer = NULL, *unicode;
1440 Py_buffer info;
1441 char lower[11]; /* Enough for any encoding shortcut */
1442
1443 if (encoding == NULL)
1444 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001445
1446 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001447 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1448 if (strcmp(lower, "utf-8") == 0)
1449 return PyUnicode_DecodeUTF8(s, size, errors);
1450 else if ((strcmp(lower, "latin-1") == 0) ||
1451 (strcmp(lower, "iso-8859-1") == 0))
1452 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001453#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001454 else if (strcmp(lower, "mbcs") == 0)
1455 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001456#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001457 else if (strcmp(lower, "ascii") == 0)
1458 return PyUnicode_DecodeASCII(s, size, errors);
1459 else if (strcmp(lower, "utf-16") == 0)
1460 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1461 else if (strcmp(lower, "utf-32") == 0)
1462 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464
1465 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001466 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001467 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001468 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001469 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470 if (buffer == NULL)
1471 goto onError;
1472 unicode = PyCodec_Decode(buffer, encoding, errors);
1473 if (unicode == NULL)
1474 goto onError;
1475 if (!PyUnicode_Check(unicode)) {
1476 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001477 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001478 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479 Py_DECREF(unicode);
1480 goto onError;
1481 }
1482 Py_DECREF(buffer);
1483 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001484
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 Py_XDECREF(buffer);
1487 return NULL;
1488}
1489
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001490PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1491 const char *encoding,
1492 const char *errors)
1493{
1494 PyObject *v;
1495
1496 if (!PyUnicode_Check(unicode)) {
1497 PyErr_BadArgument();
1498 goto onError;
1499 }
1500
1501 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001502 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001503
1504 /* Decode via the codec registry */
1505 v = PyCodec_Decode(unicode, encoding, errors);
1506 if (v == NULL)
1507 goto onError;
1508 return v;
1509
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001511 return NULL;
1512}
1513
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001514PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1515 const char *encoding,
1516 const char *errors)
1517{
1518 PyObject *v;
1519
1520 if (!PyUnicode_Check(unicode)) {
1521 PyErr_BadArgument();
1522 goto onError;
1523 }
1524
1525 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001526 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001527
1528 /* Decode via the codec registry */
1529 v = PyCodec_Decode(unicode, encoding, errors);
1530 if (v == NULL)
1531 goto onError;
1532 if (!PyUnicode_Check(v)) {
1533 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001534 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001535 Py_TYPE(v)->tp_name);
1536 Py_DECREF(v);
1537 goto onError;
1538 }
1539 return v;
1540
Benjamin Peterson29060642009-01-31 22:14:21 +00001541 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001542 return NULL;
1543}
1544
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001546 Py_ssize_t size,
1547 const char *encoding,
1548 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549{
1550 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001551
Guido van Rossumd57fd912000-03-10 22:53:23 +00001552 unicode = PyUnicode_FromUnicode(s, size);
1553 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001554 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1556 Py_DECREF(unicode);
1557 return v;
1558}
1559
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001560PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1561 const char *encoding,
1562 const char *errors)
1563{
1564 PyObject *v;
1565
1566 if (!PyUnicode_Check(unicode)) {
1567 PyErr_BadArgument();
1568 goto onError;
1569 }
1570
1571 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001572 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001573
1574 /* Encode via the codec registry */
1575 v = PyCodec_Encode(unicode, encoding, errors);
1576 if (v == NULL)
1577 goto onError;
1578 return v;
1579
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001581 return NULL;
1582}
1583
Victor Stinnerad158722010-10-27 00:25:46 +00001584PyObject *
1585PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001586{
Victor Stinner313a1202010-06-11 23:56:51 +00001587#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001588 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1589 PyUnicode_GET_SIZE(unicode),
1590 NULL);
1591#elif defined(__APPLE__)
1592 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1593 PyUnicode_GET_SIZE(unicode),
1594 "surrogateescape");
1595#else
1596 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001597 return PyUnicode_AsEncodedString(unicode,
1598 Py_FileSystemDefaultEncoding,
1599 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001600 }
1601 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001602 /* locale encoding with surrogateescape */
1603 wchar_t *wchar;
1604 char *bytes;
1605 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001606 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001607
1608 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1609 if (wchar == NULL)
1610 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001611 bytes = _Py_wchar2char(wchar, &error_pos);
1612 if (bytes == NULL) {
1613 if (error_pos != (size_t)-1) {
1614 char *errmsg = strerror(errno);
1615 PyObject *exc = NULL;
1616 if (errmsg == NULL)
1617 errmsg = "Py_wchar2char() failed";
1618 raise_encode_exception(&exc,
1619 "filesystemencoding",
1620 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1621 error_pos, error_pos+1,
1622 errmsg);
1623 Py_XDECREF(exc);
1624 }
1625 else
1626 PyErr_NoMemory();
1627 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001628 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001629 }
1630 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001631
1632 bytes_obj = PyBytes_FromString(bytes);
1633 PyMem_Free(bytes);
1634 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001635 }
Victor Stinnerad158722010-10-27 00:25:46 +00001636#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001637}
1638
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1640 const char *encoding,
1641 const char *errors)
1642{
1643 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001644 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001645
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 if (!PyUnicode_Check(unicode)) {
1647 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 }
Fred Drakee4315f52000-05-09 19:53:39 +00001650
Tim Petersced69f82003-09-16 20:30:58 +00001651 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001652 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001653
1654 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001655 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1656 if (strcmp(lower, "utf-8") == 0)
1657 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1658 PyUnicode_GET_SIZE(unicode),
1659 errors);
1660 else if ((strcmp(lower, "latin-1") == 0) ||
1661 (strcmp(lower, "iso-8859-1") == 0))
1662 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1663 PyUnicode_GET_SIZE(unicode),
1664 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001665#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001666 else if (strcmp(lower, "mbcs") == 0)
1667 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1668 PyUnicode_GET_SIZE(unicode),
1669 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001670#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001671 else if (strcmp(lower, "ascii") == 0)
1672 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1673 PyUnicode_GET_SIZE(unicode),
1674 errors);
1675 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676
1677 /* Encode via the codec registry */
1678 v = PyCodec_Encode(unicode, encoding, errors);
1679 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001680 return NULL;
1681
1682 /* The normal path */
1683 if (PyBytes_Check(v))
1684 return v;
1685
1686 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001687 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001688 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001689 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001690
1691 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1692 "encoder %s returned bytearray instead of bytes",
1693 encoding);
1694 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001695 Py_DECREF(v);
1696 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001697 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001698
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001699 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1700 Py_DECREF(v);
1701 return b;
1702 }
1703
1704 PyErr_Format(PyExc_TypeError,
1705 "encoder did not return a bytes object (type=%.400s)",
1706 Py_TYPE(v)->tp_name);
1707 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001708 return NULL;
1709}
1710
1711PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1712 const char *encoding,
1713 const char *errors)
1714{
1715 PyObject *v;
1716
1717 if (!PyUnicode_Check(unicode)) {
1718 PyErr_BadArgument();
1719 goto onError;
1720 }
1721
1722 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001724
1725 /* Encode via the codec registry */
1726 v = PyCodec_Encode(unicode, encoding, errors);
1727 if (v == NULL)
1728 goto onError;
1729 if (!PyUnicode_Check(v)) {
1730 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001731 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001732 Py_TYPE(v)->tp_name);
1733 Py_DECREF(v);
1734 goto onError;
1735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001737
Benjamin Peterson29060642009-01-31 22:14:21 +00001738 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return NULL;
1740}
1741
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001742PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001743 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001744{
1745 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001746 if (v)
1747 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001748 if (errors != NULL)
1749 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001750 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001751 PyUnicode_GET_SIZE(unicode),
1752 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001753 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001754 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001755 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001756 return v;
1757}
1758
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001759PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001760PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001761 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001762 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1763}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001764
Christian Heimes5894ba72007-11-04 11:43:14 +00001765PyObject*
1766PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1767{
Victor Stinnerad158722010-10-27 00:25:46 +00001768#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1769 return PyUnicode_DecodeMBCS(s, size, NULL);
1770#elif defined(__APPLE__)
1771 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1772#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001773 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1774 can be undefined. If it is case, decode using UTF-8. The following assumes
1775 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1776 bootstrapping process where the codecs aren't ready yet.
1777 */
1778 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001779 return PyUnicode_Decode(s, size,
1780 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001781 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001782 }
1783 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001784 /* locale encoding with surrogateescape */
1785 wchar_t *wchar;
1786 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001787 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001788
1789 if (s[size] != '\0' || size != strlen(s)) {
1790 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1791 return NULL;
1792 }
1793
Victor Stinner168e1172010-10-16 23:16:16 +00001794 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001795 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001796 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001797
Victor Stinner168e1172010-10-16 23:16:16 +00001798 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001799 PyMem_Free(wchar);
1800 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001801 }
Victor Stinnerad158722010-10-27 00:25:46 +00001802#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001803}
1804
Martin v. Löwis011e8422009-05-05 04:43:17 +00001805
1806int
1807PyUnicode_FSConverter(PyObject* arg, void* addr)
1808{
1809 PyObject *output = NULL;
1810 Py_ssize_t size;
1811 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001812 if (arg == NULL) {
1813 Py_DECREF(*(PyObject**)addr);
1814 return 1;
1815 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001816 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001817 output = arg;
1818 Py_INCREF(output);
1819 }
1820 else {
1821 arg = PyUnicode_FromObject(arg);
1822 if (!arg)
1823 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001824 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001825 Py_DECREF(arg);
1826 if (!output)
1827 return 0;
1828 if (!PyBytes_Check(output)) {
1829 Py_DECREF(output);
1830 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1831 return 0;
1832 }
1833 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001834 size = PyBytes_GET_SIZE(output);
1835 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001836 if (size != strlen(data)) {
1837 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1838 Py_DECREF(output);
1839 return 0;
1840 }
1841 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001842 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001843}
1844
1845
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001846int
1847PyUnicode_FSDecoder(PyObject* arg, void* addr)
1848{
1849 PyObject *output = NULL;
1850 Py_ssize_t size;
1851 void *data;
1852 if (arg == NULL) {
1853 Py_DECREF(*(PyObject**)addr);
1854 return 1;
1855 }
1856 if (PyUnicode_Check(arg)) {
1857 output = arg;
1858 Py_INCREF(output);
1859 }
1860 else {
1861 arg = PyBytes_FromObject(arg);
1862 if (!arg)
1863 return 0;
1864 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1865 PyBytes_GET_SIZE(arg));
1866 Py_DECREF(arg);
1867 if (!output)
1868 return 0;
1869 if (!PyUnicode_Check(output)) {
1870 Py_DECREF(output);
1871 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1872 return 0;
1873 }
1874 }
1875 size = PyUnicode_GET_SIZE(output);
1876 data = PyUnicode_AS_UNICODE(output);
1877 if (size != Py_UNICODE_strlen(data)) {
1878 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1879 Py_DECREF(output);
1880 return 0;
1881 }
1882 *(PyObject**)addr = output;
1883 return Py_CLEANUP_SUPPORTED;
1884}
1885
1886
Martin v. Löwis5b222132007-06-10 09:51:05 +00001887char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001888_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001889{
Christian Heimesf3863112007-11-22 07:46:41 +00001890 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001891 if (!PyUnicode_Check(unicode)) {
1892 PyErr_BadArgument();
1893 return NULL;
1894 }
Christian Heimesf3863112007-11-22 07:46:41 +00001895 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1896 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001897 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001898 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001899 *psize = PyBytes_GET_SIZE(bytes);
1900 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001901}
1902
1903char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001904_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001905{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001906 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001907}
1908
Guido van Rossumd57fd912000-03-10 22:53:23 +00001909Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1910{
1911 if (!PyUnicode_Check(unicode)) {
1912 PyErr_BadArgument();
1913 goto onError;
1914 }
1915 return PyUnicode_AS_UNICODE(unicode);
1916
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001918 return NULL;
1919}
1920
Martin v. Löwis18e16552006-02-15 17:27:45 +00001921Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922{
1923 if (!PyUnicode_Check(unicode)) {
1924 PyErr_BadArgument();
1925 goto onError;
1926 }
1927 return PyUnicode_GET_SIZE(unicode);
1928
Benjamin Peterson29060642009-01-31 22:14:21 +00001929 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 return -1;
1931}
1932
/* Return the name of the default encoding.  The value is the fixed string
   "utf-8" with static storage duration; callers must not free it. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1937
Victor Stinner554f3f02010-06-16 23:33:54 +00001938/* create or adjust a UnicodeDecodeError */
1939static void
1940make_decode_exception(PyObject **exceptionObject,
1941 const char *encoding,
1942 const char *input, Py_ssize_t length,
1943 Py_ssize_t startpos, Py_ssize_t endpos,
1944 const char *reason)
1945{
1946 if (*exceptionObject == NULL) {
1947 *exceptionObject = PyUnicodeDecodeError_Create(
1948 encoding, input, length, startpos, endpos, reason);
1949 }
1950 else {
1951 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1952 goto onError;
1953 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1954 goto onError;
1955 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1956 goto onError;
1957 }
1958 return;
1959
1960onError:
1961 Py_DECREF(*exceptionObject);
1962 *exceptionObject = NULL;
1963}
1964
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965/* error handling callback helper:
1966 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001967 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001968 and adjust various state variables.
1969 return 0 on success, -1 on error
1970*/
1971
1972static
1973int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001974 const char *encoding, const char *reason,
1975 const char **input, const char **inend, Py_ssize_t *startinpos,
1976 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1977 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001979 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001980
1981 PyObject *restuple = NULL;
1982 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001983 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001984 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001985 Py_ssize_t requiredsize;
1986 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001987 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001988 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001989 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001990 int res = -1;
1991
1992 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001993 *errorHandler = PyCodec_LookupError(errors);
1994 if (*errorHandler == NULL)
1995 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001996 }
1997
Victor Stinner554f3f02010-06-16 23:33:54 +00001998 make_decode_exception(exceptionObject,
1999 encoding,
2000 *input, *inend - *input,
2001 *startinpos, *endinpos,
2002 reason);
2003 if (*exceptionObject == NULL)
2004 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002005
2006 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2007 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002008 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002009 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002010 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002011 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002012 }
2013 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002014 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002015
2016 /* Copy back the bytes variables, which might have been modified by the
2017 callback */
2018 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2019 if (!inputobj)
2020 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002021 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002022 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002023 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002024 *input = PyBytes_AS_STRING(inputobj);
2025 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002026 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002027 /* we can DECREF safely, as the exception has another reference,
2028 so the object won't go away. */
2029 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002030
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002031 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002032 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002033 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002034 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2035 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002036 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037
2038 /* need more space? (at least enough for what we
2039 have+the replacement+the rest of the string (starting
2040 at the new input position), so we won't have to check space
2041 when there are no errors in the rest of the string) */
2042 repptr = PyUnicode_AS_UNICODE(repunicode);
2043 repsize = PyUnicode_GET_SIZE(repunicode);
2044 requiredsize = *outpos + repsize + insize-newpos;
2045 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002046 if (requiredsize<2*outsize)
2047 requiredsize = 2*outsize;
2048 if (_PyUnicode_Resize(output, requiredsize) < 0)
2049 goto onError;
2050 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002051 }
2052 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002053 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054 Py_UNICODE_COPY(*outptr, repptr, repsize);
2055 *outptr += repsize;
2056 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002057
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058 /* we made it! */
2059 res = 0;
2060
Benjamin Peterson29060642009-01-31 22:14:21 +00002061 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 Py_XDECREF(restuple);
2063 return res;
2064}
2065
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002066/* --- UTF-7 Codec -------------------------------------------------------- */
2067
Antoine Pitrou244651a2009-05-04 18:56:13 +00002068/* See RFC2152 for details. We encode conservatively and decode liberally. */
2069
/* Base-64 helper macros for the UTF-7 codec.  Each macro may evaluate its
   argument more than once, so pass expressions without side effects. */

/* Nonzero when ch is in the base-64 alphabet: A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(ch)                       \
    (((ch) >= 'A' && (ch) <= 'Z') ||        \
     ((ch) >= 'a' && (ch) <= 'z') ||        \
     ((ch) >= '0' && (ch) <= '9') ||        \
     (ch) == '+' || (ch) == '/')

/* Value (0..63) of the base-64 character ch.  Only meaningful when
   IS_BASE64(ch) holds: anything that is not a letter, digit or '+' is
   mapped to 63, the value of '/'. */

#define FROM_BASE64(ch)                                 \
    (((ch) >= 'A' && (ch) <= 'Z') ? (ch) - 'A' :        \
     ((ch) >= 'a' && (ch) <= 'z') ? (ch) - 'a' + 26 :   \
     ((ch) >= '0' && (ch) <= '9') ? (ch) - '0' + 52 :   \
     (ch) == '+' ? 62 : 63)

/* Base-64 character for the bottom 6 bits of bits; higher bits are
   masked off. */

#define TO_BASE64(bits)                                 \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(bits) & 0x3f])

/* DECODE_DIRECT: nonzero when a byte met in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(ch)                   \
    ((ch) <= 127 && (ch) != '+')
2100
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by character code and covers 0..127 only.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; values outside 1..127 are never direct and
 *             never index the table.
 * directO  -- nonzero to encode Set O characters as themselves.
 * directWS -- nonzero to encode whitespace as itself.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (e.g. a conditional expression passed as a flag) are
 * treated as a single operand of &&.  c may be evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002146
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002147PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002148 Py_ssize_t size,
2149 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002150{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002151 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2152}
2153
Antoine Pitrou244651a2009-05-04 18:56:13 +00002154/* The decoder. The only state we preserve is our read position,
2155 * i.e. how many characters we have consumed. So if we end in the
2156 * middle of a shift sequence we have to back off the read position
2157 * and the output to the beginning of the sequence, otherwise we lose
2158 * all the shift state (seen bits, number of bits seen, high
2159 * surrogate). */
2160
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002161PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002162 Py_ssize_t size,
2163 const char *errors,
2164 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002165{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002166 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002167 Py_ssize_t startinpos;
2168 Py_ssize_t endinpos;
2169 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002170 const char *e;
2171 PyUnicodeObject *unicode;
2172 Py_UNICODE *p;
2173 const char *errmsg = "";
2174 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002175 Py_UNICODE *shiftOutStart;
2176 unsigned int base64bits = 0;
2177 unsigned long base64buffer = 0;
2178 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002179 PyObject *errorHandler = NULL;
2180 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002181
2182 unicode = _PyUnicode_New(size);
2183 if (!unicode)
2184 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002185 if (size == 0) {
2186 if (consumed)
2187 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002188 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002189 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002190
2191 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002192 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002193 e = s + size;
2194
2195 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002196 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002197 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002198 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002199
Antoine Pitrou244651a2009-05-04 18:56:13 +00002200 if (inShift) { /* in a base-64 section */
2201 if (IS_BASE64(ch)) { /* consume a base-64 character */
2202 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2203 base64bits += 6;
2204 s++;
2205 if (base64bits >= 16) {
2206 /* we have enough bits for a UTF-16 value */
2207 Py_UNICODE outCh = (Py_UNICODE)
2208 (base64buffer >> (base64bits-16));
2209 base64bits -= 16;
2210 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2211 if (surrogate) {
2212 /* expecting a second surrogate */
2213 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2214#ifdef Py_UNICODE_WIDE
2215 *p++ = (((surrogate & 0x3FF)<<10)
2216 | (outCh & 0x3FF)) + 0x10000;
2217#else
2218 *p++ = surrogate;
2219 *p++ = outCh;
2220#endif
2221 surrogate = 0;
2222 }
2223 else {
2224 surrogate = 0;
2225 errmsg = "second surrogate missing";
2226 goto utf7Error;
2227 }
2228 }
2229 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2230 /* first surrogate */
2231 surrogate = outCh;
2232 }
2233 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2234 errmsg = "unexpected second surrogate";
2235 goto utf7Error;
2236 }
2237 else {
2238 *p++ = outCh;
2239 }
2240 }
2241 }
2242 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002243 inShift = 0;
2244 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002245 if (surrogate) {
2246 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002247 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002248 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002249 if (base64bits > 0) { /* left-over bits */
2250 if (base64bits >= 6) {
2251 /* We've seen at least one base-64 character */
2252 errmsg = "partial character in shift sequence";
2253 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002254 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002255 else {
2256 /* Some bits remain; they should be zero */
2257 if (base64buffer != 0) {
2258 errmsg = "non-zero padding bits in shift sequence";
2259 goto utf7Error;
2260 }
2261 }
2262 }
2263 if (ch != '-') {
2264 /* '-' is absorbed; other terminating
2265 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002266 *p++ = ch;
2267 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268 }
2269 }
2270 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002271 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002272 s++; /* consume '+' */
2273 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002274 s++;
2275 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002276 }
2277 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002278 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002279 shiftOutStart = p;
2280 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002281 }
2282 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002283 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002284 *p++ = ch;
2285 s++;
2286 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002287 else {
2288 startinpos = s-starts;
2289 s++;
2290 errmsg = "unexpected special character";
2291 goto utf7Error;
2292 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002293 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002294utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002295 outpos = p-PyUnicode_AS_UNICODE(unicode);
2296 endinpos = s-starts;
2297 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 errors, &errorHandler,
2299 "utf7", errmsg,
2300 &starts, &e, &startinpos, &endinpos, &exc, &s,
2301 &unicode, &outpos, &p))
2302 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002303 }
2304
Antoine Pitrou244651a2009-05-04 18:56:13 +00002305 /* end of string */
2306
2307 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2308 /* if we're in an inconsistent state, that's an error */
2309 if (surrogate ||
2310 (base64bits >= 6) ||
2311 (base64bits > 0 && base64buffer != 0)) {
2312 outpos = p-PyUnicode_AS_UNICODE(unicode);
2313 endinpos = size;
2314 if (unicode_decode_call_errorhandler(
2315 errors, &errorHandler,
2316 "utf7", "unterminated shift sequence",
2317 &starts, &e, &startinpos, &endinpos, &exc, &s,
2318 &unicode, &outpos, &p))
2319 goto onError;
2320 if (s < e)
2321 goto restart;
2322 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002323 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002324
2325 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002326 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 if (inShift) {
2328 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002329 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002330 }
2331 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002332 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002333 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002334 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002335
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002336 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002337 goto onError;
2338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002339 Py_XDECREF(errorHandler);
2340 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002341 return (PyObject *)unicode;
2342
Benjamin Peterson29060642009-01-31 22:14:21 +00002343 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002344 Py_XDECREF(errorHandler);
2345 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002346 Py_DECREF(unicode);
2347 return NULL;
2348}
2349
2350
2351PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002352 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002353 int base64SetO,
2354 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002355 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002357 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002358 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002359 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002360 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002361 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002362 unsigned int base64bits = 0;
2363 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002364 char * out;
2365 char * start;
2366
2367 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002368 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002369
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002370 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002371 return PyErr_NoMemory();
2372
Antoine Pitrou244651a2009-05-04 18:56:13 +00002373 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002374 if (v == NULL)
2375 return NULL;
2376
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002377 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002378 for (;i < size; ++i) {
2379 Py_UNICODE ch = s[i];
2380
Antoine Pitrou244651a2009-05-04 18:56:13 +00002381 if (inShift) {
2382 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2383 /* shifting out */
2384 if (base64bits) { /* output remaining bits */
2385 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2386 base64buffer = 0;
2387 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002388 }
2389 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002390 /* Characters not in the BASE64 set implicitly unshift the sequence
2391 so no '-' is required, except if the character is itself a '-' */
2392 if (IS_BASE64(ch) || ch == '-') {
2393 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002394 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002395 *out++ = (char) ch;
2396 }
2397 else {
2398 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002399 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002400 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002401 else { /* not in a shift sequence */
2402 if (ch == '+') {
2403 *out++ = '+';
2404 *out++ = '-';
2405 }
2406 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2407 *out++ = (char) ch;
2408 }
2409 else {
2410 *out++ = '+';
2411 inShift = 1;
2412 goto encode_char;
2413 }
2414 }
2415 continue;
2416encode_char:
2417#ifdef Py_UNICODE_WIDE
2418 if (ch >= 0x10000) {
2419 /* code first surrogate */
2420 base64bits += 16;
2421 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2422 while (base64bits >= 6) {
2423 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2424 base64bits -= 6;
2425 }
2426 /* prepare second surrogate */
2427 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2428 }
2429#endif
2430 base64bits += 16;
2431 base64buffer = (base64buffer << 16) | ch;
2432 while (base64bits >= 6) {
2433 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2434 base64bits -= 6;
2435 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002436 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002437 if (base64bits)
2438 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2439 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002440 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002441 if (_PyBytes_Resize(&v, out - start) < 0)
2442 return NULL;
2443 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002444}
2445
Antoine Pitrou244651a2009-05-04 18:56:13 +00002446#undef IS_BASE64
2447#undef FROM_BASE64
2448#undef TO_BASE64
2449#undef DECODE_DIRECT
2450#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002451
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452/* --- UTF-8 Codec -------------------------------------------------------- */
2453
/* Map a UTF-8 lead (prefix) byte to the total length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details.
   The table is read-only, hence `const`. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2475
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002477 Py_ssize_t size,
2478 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479{
Walter Dörwald69652032004-09-07 20:24:22 +00002480 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2481}
2482
Antoine Pitrouab868312009-01-10 15:40:25 +00002483/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2484#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2485
2486/* Mask to quickly check whether a C 'long' contains a
2487 non-ASCII, UTF8-encoded char. */
2488#if (SIZEOF_LONG == 8)
2489# define ASCII_CHAR_MASK 0x8080808080808080L
2490#elif (SIZEOF_LONG == 4)
2491# define ASCII_CHAR_MASK 0x80808080L
2492#else
2493# error C 'long' size should be either 4 or 8!
2494#endif
2495
Walter Dörwald69652032004-09-07 20:24:22 +00002496PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002497 Py_ssize_t size,
2498 const char *errors,
2499 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002500{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002501 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002503 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002504 Py_ssize_t startinpos;
2505 Py_ssize_t endinpos;
2506 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002507 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 PyUnicodeObject *unicode;
2509 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002510 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511 PyObject *errorHandler = NULL;
2512 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513
2514 /* Note: size will always be longer than the resulting Unicode
2515 character count */
2516 unicode = _PyUnicode_New(size);
2517 if (!unicode)
2518 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002519 if (size == 0) {
2520 if (consumed)
2521 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524
2525 /* Unpack UTF-8 encoded data */
2526 p = unicode->str;
2527 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002528 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529
2530 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002531 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532
2533 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002534 /* Fast path for runs of ASCII characters. Given that common UTF-8
2535 input will consist of an overwhelming majority of ASCII
2536 characters, we try to optimize for this case by checking
2537 as many characters as a C 'long' can contain.
2538 First, check if we can do an aligned read, as most CPUs have
2539 a penalty for unaligned reads.
2540 */
2541 if (!((size_t) s & LONG_PTR_MASK)) {
2542 /* Help register allocation */
2543 register const char *_s = s;
2544 register Py_UNICODE *_p = p;
2545 while (_s < aligned_end) {
2546 /* Read a whole long at a time (either 4 or 8 bytes),
2547 and do a fast unrolled copy if it only contains ASCII
2548 characters. */
2549 unsigned long data = *(unsigned long *) _s;
2550 if (data & ASCII_CHAR_MASK)
2551 break;
2552 _p[0] = (unsigned char) _s[0];
2553 _p[1] = (unsigned char) _s[1];
2554 _p[2] = (unsigned char) _s[2];
2555 _p[3] = (unsigned char) _s[3];
2556#if (SIZEOF_LONG == 8)
2557 _p[4] = (unsigned char) _s[4];
2558 _p[5] = (unsigned char) _s[5];
2559 _p[6] = (unsigned char) _s[6];
2560 _p[7] = (unsigned char) _s[7];
2561#endif
2562 _s += SIZEOF_LONG;
2563 _p += SIZEOF_LONG;
2564 }
2565 s = _s;
2566 p = _p;
2567 if (s == e)
2568 break;
2569 ch = (unsigned char)*s;
2570 }
2571 }
2572
2573 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002574 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 s++;
2576 continue;
2577 }
2578
2579 n = utf8_code_length[ch];
2580
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002581 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002582 if (consumed)
2583 break;
2584 else {
2585 errmsg = "unexpected end of data";
2586 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002587 endinpos = startinpos+1;
2588 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2589 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002590 goto utf8Error;
2591 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593
2594 switch (n) {
2595
2596 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002597 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002598 startinpos = s-starts;
2599 endinpos = startinpos+1;
2600 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601
2602 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002603 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002604 startinpos = s-starts;
2605 endinpos = startinpos+1;
2606 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607
2608 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002609 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002610 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002611 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002612 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002613 goto utf8Error;
2614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002616 assert ((ch > 0x007F) && (ch <= 0x07FF));
2617 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 break;
2619
2620 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002621 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2622 will result in surrogates in range d800-dfff. Surrogates are
2623 not valid UTF-8 so they are rejected.
2624 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2625 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002626 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002627 (s[2] & 0xc0) != 0x80 ||
2628 ((unsigned char)s[0] == 0xE0 &&
2629 (unsigned char)s[1] < 0xA0) ||
2630 ((unsigned char)s[0] == 0xED &&
2631 (unsigned char)s[1] > 0x9F)) {
2632 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002633 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002634 endinpos = startinpos + 1;
2635
2636 /* if s[1] first two bits are 1 and 0, then the invalid
2637 continuation byte is s[2], so increment endinpos by 1,
2638 if not, s[1] is invalid and endinpos doesn't need to
2639 be incremented. */
2640 if ((s[1] & 0xC0) == 0x80)
2641 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002642 goto utf8Error;
2643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002644 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002645 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2646 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002647 break;
2648
2649 case 4:
2650 if ((s[1] & 0xc0) != 0x80 ||
2651 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002652 (s[3] & 0xc0) != 0x80 ||
2653 ((unsigned char)s[0] == 0xF0 &&
2654 (unsigned char)s[1] < 0x90) ||
2655 ((unsigned char)s[0] == 0xF4 &&
2656 (unsigned char)s[1] > 0x8F)) {
2657 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002658 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002659 endinpos = startinpos + 1;
2660 if ((s[1] & 0xC0) == 0x80) {
2661 endinpos++;
2662 if ((s[2] & 0xC0) == 0x80)
2663 endinpos++;
2664 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002665 goto utf8Error;
2666 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002667 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002668 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2669 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2670
Fredrik Lundh8f455852001-06-27 18:59:43 +00002671#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002672 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002673#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002674 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002675
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002676 /* translate from 10000..10FFFF to 0..FFFF */
2677 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002678
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002679 /* high surrogate = top 10 bits added to D800 */
2680 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002681
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002682 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002683 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002684#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 }
2687 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002688 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002689
Benjamin Peterson29060642009-01-31 22:14:21 +00002690 utf8Error:
2691 outpos = p-PyUnicode_AS_UNICODE(unicode);
2692 if (unicode_decode_call_errorhandler(
2693 errors, &errorHandler,
2694 "utf8", errmsg,
2695 &starts, &e, &startinpos, &endinpos, &exc, &s,
2696 &unicode, &outpos, &p))
2697 goto onError;
2698 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 }
Walter Dörwald69652032004-09-07 20:24:22 +00002700 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002701 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702
2703 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002704 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 goto onError;
2706
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002707 Py_XDECREF(errorHandler);
2708 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 return (PyObject *)unicode;
2710
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 Py_XDECREF(errorHandler);
2713 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 Py_DECREF(unicode);
2715 return NULL;
2716}
2717
Antoine Pitrouab868312009-01-10 15:40:25 +00002718#undef ASCII_CHAR_MASK
2719
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t array.  Every byte that cannot be decoded is
   emitted as the single code unit 0xDC00 + byte and decoding resumes at
   the following byte.

   Returns NULL if the buffer size would overflow (MemoryError set) or if
   the allocation itself fails (no exception is set on that path here).
   NOTE(review): the caller presumably releases the result with
   PyMem_Free() -- confirm at the call site. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size + 1 wchar_t slots are always enough: the output never has
       more code units than the input has bytes, plus the terminator.
       The guard is equivalent to "size + 1 > PY_SSIZE_T_MAX / sizeof(wchar_t)"
       but never evaluates size + 1, which could itself overflow. */
    if ((size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* compare lengths instead of forming the pointer s + n, which
           could lie beyond one-past-the-end of the input */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the output buffer is wchar_t, so cast to wchar_t like the
               other cases do */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the raw lead byte here: every jump to this label
           happens before ch is overwritten with a decoded value */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002834
Tim Peters602f7402002-04-27 18:03:26 +00002835/* Allocation strategy: if the string is short, convert into a stack buffer
2836 and allocate exactly as much space needed at the end. Else allocate the
2837 maximum possible needed (4 result bytes per Unicode character), and return
2838 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002839*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002840PyObject *
2841PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002842 Py_ssize_t size,
2843 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844{
Tim Peters602f7402002-04-27 18:03:26 +00002845#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002846
Guido van Rossum98297ee2007-11-06 21:34:58 +00002847 Py_ssize_t i; /* index into s of next input byte */
2848 PyObject *result; /* result string object */
2849 char *p; /* next free byte in output buffer */
2850 Py_ssize_t nallocated; /* number of result bytes allocated */
2851 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002852 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002853 PyObject *errorHandler = NULL;
2854 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002855
Tim Peters602f7402002-04-27 18:03:26 +00002856 assert(s != NULL);
2857 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858
Tim Peters602f7402002-04-27 18:03:26 +00002859 if (size <= MAX_SHORT_UNICHARS) {
2860 /* Write into the stack buffer; nallocated can't overflow.
2861 * At the end, we'll allocate exactly as much heap space as it
2862 * turns out we need.
2863 */
2864 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002865 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002866 p = stackbuf;
2867 }
2868 else {
2869 /* Overallocate on the heap, and give the excess back at the end. */
2870 nallocated = size * 4;
2871 if (nallocated / 4 != size) /* overflow! */
2872 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002873 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002874 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002875 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002876 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002877 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002878
Tim Peters602f7402002-04-27 18:03:26 +00002879 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002880 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002881
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002882 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002883 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002885
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002887 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002888 *p++ = (char)(0xc0 | (ch >> 6));
2889 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002890 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002891#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002892 /* Special case: check for high and low surrogate */
2893 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2894 Py_UCS4 ch2 = s[i];
2895 /* Combine the two surrogates to form a UCS4 value */
2896 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2897 i++;
2898
2899 /* Encode UCS4 Unicode ordinals */
2900 *p++ = (char)(0xf0 | (ch >> 18));
2901 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002902 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2903 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002904 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002905#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002906 Py_ssize_t newpos;
2907 PyObject *rep;
2908 Py_ssize_t repsize, k;
2909 rep = unicode_encode_call_errorhandler
2910 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2911 s, size, &exc, i-1, i, &newpos);
2912 if (!rep)
2913 goto error;
2914
2915 if (PyBytes_Check(rep))
2916 repsize = PyBytes_GET_SIZE(rep);
2917 else
2918 repsize = PyUnicode_GET_SIZE(rep);
2919
2920 if (repsize > 4) {
2921 Py_ssize_t offset;
2922
2923 if (result == NULL)
2924 offset = p - stackbuf;
2925 else
2926 offset = p - PyBytes_AS_STRING(result);
2927
2928 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2929 /* integer overflow */
2930 PyErr_NoMemory();
2931 goto error;
2932 }
2933 nallocated += repsize - 4;
2934 if (result != NULL) {
2935 if (_PyBytes_Resize(&result, nallocated) < 0)
2936 goto error;
2937 } else {
2938 result = PyBytes_FromStringAndSize(NULL, nallocated);
2939 if (result == NULL)
2940 goto error;
2941 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2942 }
2943 p = PyBytes_AS_STRING(result) + offset;
2944 }
2945
2946 if (PyBytes_Check(rep)) {
2947 char *prep = PyBytes_AS_STRING(rep);
2948 for(k = repsize; k > 0; k--)
2949 *p++ = *prep++;
2950 } else /* rep is unicode */ {
2951 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2952 Py_UNICODE c;
2953
2954 for(k=0; k<repsize; k++) {
2955 c = prep[k];
2956 if (0x80 <= c) {
2957 raise_encode_exception(&exc, "utf-8", s, size,
2958 i-1, i, "surrogates not allowed");
2959 goto error;
2960 }
2961 *p++ = (char)prep[k];
2962 }
2963 }
2964 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002965#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002966 }
Victor Stinner445a6232010-04-22 20:01:57 +00002967#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002968 } else if (ch < 0x10000) {
2969 *p++ = (char)(0xe0 | (ch >> 12));
2970 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2971 *p++ = (char)(0x80 | (ch & 0x3f));
2972 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002973 /* Encode UCS4 Unicode ordinals */
2974 *p++ = (char)(0xf0 | (ch >> 18));
2975 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2976 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2977 *p++ = (char)(0x80 | (ch & 0x3f));
2978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002980
Guido van Rossum98297ee2007-11-06 21:34:58 +00002981 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002982 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002983 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002984 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002985 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002986 }
2987 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002988 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002989 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002990 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002991 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002992 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002993 Py_XDECREF(errorHandler);
2994 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002995 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002996 error:
2997 Py_XDECREF(errorHandler);
2998 Py_XDECREF(exc);
2999 Py_XDECREF(result);
3000 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003001
Tim Peters602f7402002-04-27 18:03:26 +00003002#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003}
3004
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3006{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 if (!PyUnicode_Check(unicode)) {
3008 PyErr_BadArgument();
3009 return NULL;
3010 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003011 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003012 PyUnicode_GET_SIZE(unicode),
3013 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014}
3015
Walter Dörwald41980ca2007-08-16 21:55:45 +00003016/* --- UTF-32 Codec ------------------------------------------------------- */
3017
3018PyObject *
3019PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003020 Py_ssize_t size,
3021 const char *errors,
3022 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003023{
3024 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3025}
3026
3027PyObject *
3028PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 Py_ssize_t size,
3030 const char *errors,
3031 int *byteorder,
3032 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003033{
3034 const char *starts = s;
3035 Py_ssize_t startinpos;
3036 Py_ssize_t endinpos;
3037 Py_ssize_t outpos;
3038 PyUnicodeObject *unicode;
3039 Py_UNICODE *p;
3040#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003041 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003042 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003043#else
3044 const int pairs = 0;
3045#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003046 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003047 int bo = 0; /* assume native ordering by default */
3048 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003049 /* Offsets from q for retrieving bytes in the right order. */
3050#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3051 int iorder[] = {0, 1, 2, 3};
3052#else
3053 int iorder[] = {3, 2, 1, 0};
3054#endif
3055 PyObject *errorHandler = NULL;
3056 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003057
Walter Dörwald41980ca2007-08-16 21:55:45 +00003058 q = (unsigned char *)s;
3059 e = q + size;
3060
3061 if (byteorder)
3062 bo = *byteorder;
3063
3064 /* Check for BOM marks (U+FEFF) in the input and adjust current
3065 byte order setting accordingly. In native mode, the leading BOM
3066 mark is skipped, in all other modes, it is copied to the output
3067 stream as-is (giving a ZWNBSP character). */
3068 if (bo == 0) {
3069 if (size >= 4) {
3070 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003072#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 if (bom == 0x0000FEFF) {
3074 q += 4;
3075 bo = -1;
3076 }
3077 else if (bom == 0xFFFE0000) {
3078 q += 4;
3079 bo = 1;
3080 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003081#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 if (bom == 0x0000FEFF) {
3083 q += 4;
3084 bo = 1;
3085 }
3086 else if (bom == 0xFFFE0000) {
3087 q += 4;
3088 bo = -1;
3089 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003090#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003092 }
3093
3094 if (bo == -1) {
3095 /* force LE */
3096 iorder[0] = 0;
3097 iorder[1] = 1;
3098 iorder[2] = 2;
3099 iorder[3] = 3;
3100 }
3101 else if (bo == 1) {
3102 /* force BE */
3103 iorder[0] = 3;
3104 iorder[1] = 2;
3105 iorder[2] = 1;
3106 iorder[3] = 0;
3107 }
3108
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003109 /* On narrow builds we split characters outside the BMP into two
3110 codepoints => count how much extra space we need. */
3111#ifndef Py_UNICODE_WIDE
3112 for (qq = q; qq < e; qq += 4)
3113 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3114 pairs++;
3115#endif
3116
3117 /* This might be one to much, because of a BOM */
3118 unicode = _PyUnicode_New((size+3)/4+pairs);
3119 if (!unicode)
3120 return NULL;
3121 if (size == 0)
3122 return (PyObject *)unicode;
3123
3124 /* Unpack UTF-32 encoded data */
3125 p = unicode->str;
3126
Walter Dörwald41980ca2007-08-16 21:55:45 +00003127 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003128 Py_UCS4 ch;
3129 /* remaining bytes at the end? (size should be divisible by 4) */
3130 if (e-q<4) {
3131 if (consumed)
3132 break;
3133 errmsg = "truncated data";
3134 startinpos = ((const char *)q)-starts;
3135 endinpos = ((const char *)e)-starts;
3136 goto utf32Error;
3137 /* The remaining input chars are ignored if the callback
3138 chooses to skip the input */
3139 }
3140 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3141 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003142
Benjamin Peterson29060642009-01-31 22:14:21 +00003143 if (ch >= 0x110000)
3144 {
3145 errmsg = "codepoint not in range(0x110000)";
3146 startinpos = ((const char *)q)-starts;
3147 endinpos = startinpos+4;
3148 goto utf32Error;
3149 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003150#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003151 if (ch >= 0x10000)
3152 {
3153 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3154 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3155 }
3156 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003157#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003158 *p++ = ch;
3159 q += 4;
3160 continue;
3161 utf32Error:
3162 outpos = p-PyUnicode_AS_UNICODE(unicode);
3163 if (unicode_decode_call_errorhandler(
3164 errors, &errorHandler,
3165 "utf32", errmsg,
3166 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3167 &unicode, &outpos, &p))
3168 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003169 }
3170
3171 if (byteorder)
3172 *byteorder = bo;
3173
3174 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003175 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003176
3177 /* Adjust length */
3178 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3179 goto onError;
3180
3181 Py_XDECREF(errorHandler);
3182 Py_XDECREF(exc);
3183 return (PyObject *)unicode;
3184
Benjamin Peterson29060642009-01-31 22:14:21 +00003185 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003186 Py_DECREF(unicode);
3187 Py_XDECREF(errorHandler);
3188 Py_XDECREF(exc);
3189 return NULL;
3190}
3191
3192PyObject *
3193PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003194 Py_ssize_t size,
3195 const char *errors,
3196 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003197{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003198 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003199 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003200 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003201#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003202 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003203#else
3204 const int pairs = 0;
3205#endif
3206 /* Offsets from p for storing byte pairs in the right order. */
3207#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3208 int iorder[] = {0, 1, 2, 3};
3209#else
3210 int iorder[] = {3, 2, 1, 0};
3211#endif
3212
Benjamin Peterson29060642009-01-31 22:14:21 +00003213#define STORECHAR(CH) \
3214 do { \
3215 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3216 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3217 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3218 p[iorder[0]] = (CH) & 0xff; \
3219 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003220 } while(0)
3221
3222 /* In narrow builds we can output surrogate pairs as one codepoint,
3223 so we need less space. */
3224#ifndef Py_UNICODE_WIDE
3225 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3227 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3228 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003229#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003230 nsize = (size - pairs + (byteorder == 0));
3231 bytesize = nsize * 4;
3232 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003234 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003235 if (v == NULL)
3236 return NULL;
3237
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003238 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003239 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003241 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003242 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003243
3244 if (byteorder == -1) {
3245 /* force LE */
3246 iorder[0] = 0;
3247 iorder[1] = 1;
3248 iorder[2] = 2;
3249 iorder[3] = 3;
3250 }
3251 else if (byteorder == 1) {
3252 /* force BE */
3253 iorder[0] = 3;
3254 iorder[1] = 2;
3255 iorder[2] = 1;
3256 iorder[3] = 0;
3257 }
3258
3259 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003261#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3263 Py_UCS4 ch2 = *s;
3264 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3265 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3266 s++;
3267 size--;
3268 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003269 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003270#endif
3271 STORECHAR(ch);
3272 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003273
3274 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003275 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003276#undef STORECHAR
3277}
3278
3279PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3280{
3281 if (!PyUnicode_Check(unicode)) {
3282 PyErr_BadArgument();
3283 return NULL;
3284 }
3285 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003286 PyUnicode_GET_SIZE(unicode),
3287 NULL,
3288 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003289}
3290
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291/* --- UTF-16 Codec ------------------------------------------------------- */
3292
Tim Peters772747b2001-08-09 22:21:55 +00003293PyObject *
3294PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 Py_ssize_t size,
3296 const char *errors,
3297 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298{
Walter Dörwald69652032004-09-07 20:24:22 +00003299 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3300}
3301
Antoine Pitrouab868312009-01-10 15:40:25 +00003302/* Two masks for fast checking of whether a C 'long' may contain
3303 UTF16-encoded surrogate characters. This is an efficient heuristic,
3304 assuming that non-surrogate characters with a code point >= 0x8000 are
3305 rare in most input.
3306 FAST_CHAR_MASK is used when the input is in native byte ordering,
3307 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003308*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003309#if (SIZEOF_LONG == 8)
3310# define FAST_CHAR_MASK 0x8000800080008000L
3311# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3312#elif (SIZEOF_LONG == 4)
3313# define FAST_CHAR_MASK 0x80008000L
3314# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3315#else
3316# error C 'long' size should be either 4 or 8!
3317#endif
3318
Walter Dörwald69652032004-09-07 20:24:22 +00003319PyObject *
3320PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003321 Py_ssize_t size,
3322 const char *errors,
3323 int *byteorder,
3324 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003325{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003327 Py_ssize_t startinpos;
3328 Py_ssize_t endinpos;
3329 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 PyUnicodeObject *unicode;
3331 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003332 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003333 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003334 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003335 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003336 /* Offsets from q for retrieving byte pairs in the right order. */
3337#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3338 int ihi = 1, ilo = 0;
3339#else
3340 int ihi = 0, ilo = 1;
3341#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 PyObject *errorHandler = NULL;
3343 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344
3345 /* Note: size will always be longer than the resulting Unicode
3346 character count */
3347 unicode = _PyUnicode_New(size);
3348 if (!unicode)
3349 return NULL;
3350 if (size == 0)
3351 return (PyObject *)unicode;
3352
3353 /* Unpack UTF-16 encoded data */
3354 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003355 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003356 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357
3358 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003359 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003361 /* Check for BOM marks (U+FEFF) in the input and adjust current
3362 byte order setting accordingly. In native mode, the leading BOM
3363 mark is skipped, in all other modes, it is copied to the output
3364 stream as-is (giving a ZWNBSP character). */
3365 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003366 if (size >= 2) {
3367 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003368#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 if (bom == 0xFEFF) {
3370 q += 2;
3371 bo = -1;
3372 }
3373 else if (bom == 0xFFFE) {
3374 q += 2;
3375 bo = 1;
3376 }
Tim Petersced69f82003-09-16 20:30:58 +00003377#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 if (bom == 0xFEFF) {
3379 q += 2;
3380 bo = 1;
3381 }
3382 else if (bom == 0xFFFE) {
3383 q += 2;
3384 bo = -1;
3385 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003386#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003387 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389
Tim Peters772747b2001-08-09 22:21:55 +00003390 if (bo == -1) {
3391 /* force LE */
3392 ihi = 1;
3393 ilo = 0;
3394 }
3395 else if (bo == 1) {
3396 /* force BE */
3397 ihi = 0;
3398 ilo = 1;
3399 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003400#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3401 native_ordering = ilo < ihi;
3402#else
3403 native_ordering = ilo > ihi;
3404#endif
Tim Peters772747b2001-08-09 22:21:55 +00003405
Antoine Pitrouab868312009-01-10 15:40:25 +00003406 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003407 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003409 /* First check for possible aligned read of a C 'long'. Unaligned
3410 reads are more expensive, better to defer to another iteration. */
3411 if (!((size_t) q & LONG_PTR_MASK)) {
3412 /* Fast path for runs of non-surrogate chars. */
3413 register const unsigned char *_q = q;
3414 Py_UNICODE *_p = p;
3415 if (native_ordering) {
3416 /* Native ordering is simple: as long as the input cannot
3417 possibly contain a surrogate char, do an unrolled copy
3418 of several 16-bit code points to the target object.
3419 The non-surrogate check is done on several input bytes
3420 at a time (as many as a C 'long' can contain). */
3421 while (_q < aligned_end) {
3422 unsigned long data = * (unsigned long *) _q;
3423 if (data & FAST_CHAR_MASK)
3424 break;
3425 _p[0] = ((unsigned short *) _q)[0];
3426 _p[1] = ((unsigned short *) _q)[1];
3427#if (SIZEOF_LONG == 8)
3428 _p[2] = ((unsigned short *) _q)[2];
3429 _p[3] = ((unsigned short *) _q)[3];
3430#endif
3431 _q += SIZEOF_LONG;
3432 _p += SIZEOF_LONG / 2;
3433 }
3434 }
3435 else {
3436 /* Byteswapped ordering is similar, but we must decompose
3437 the copy bytewise, and take care of zero'ing out the
3438 upper bytes if the target object is in 32-bit units
3439 (that is, in UCS-4 builds). */
3440 while (_q < aligned_end) {
3441 unsigned long data = * (unsigned long *) _q;
3442 if (data & SWAPPED_FAST_CHAR_MASK)
3443 break;
3444 /* Zero upper bytes in UCS-4 builds */
3445#if (Py_UNICODE_SIZE > 2)
3446 _p[0] = 0;
3447 _p[1] = 0;
3448#if (SIZEOF_LONG == 8)
3449 _p[2] = 0;
3450 _p[3] = 0;
3451#endif
3452#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003453 /* Issue #4916; UCS-4 builds on big endian machines must
3454 fill the two last bytes of each 4-byte unit. */
3455#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3456# define OFF 2
3457#else
3458# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003459#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003460 ((unsigned char *) _p)[OFF + 1] = _q[0];
3461 ((unsigned char *) _p)[OFF + 0] = _q[1];
3462 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3463 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3464#if (SIZEOF_LONG == 8)
3465 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3466 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3467 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3468 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3469#endif
3470#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003471 _q += SIZEOF_LONG;
3472 _p += SIZEOF_LONG / 2;
3473 }
3474 }
3475 p = _p;
3476 q = _q;
3477 if (q >= e)
3478 break;
3479 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003480 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481
Benjamin Peterson14339b62009-01-31 16:36:08 +00003482 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003483
3484 if (ch < 0xD800 || ch > 0xDFFF) {
3485 *p++ = ch;
3486 continue;
3487 }
3488
3489 /* UTF-16 code pair: */
3490 if (q > e) {
3491 errmsg = "unexpected end of data";
3492 startinpos = (((const char *)q) - 2) - starts;
3493 endinpos = ((const char *)e) + 1 - starts;
3494 goto utf16Error;
3495 }
3496 if (0xD800 <= ch && ch <= 0xDBFF) {
3497 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3498 q += 2;
3499 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003500#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003501 *p++ = ch;
3502 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003503#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003505#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003506 continue;
3507 }
3508 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003509 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003510 startinpos = (((const char *)q)-4)-starts;
3511 endinpos = startinpos+2;
3512 goto utf16Error;
3513 }
3514
Benjamin Peterson14339b62009-01-31 16:36:08 +00003515 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003516 errmsg = "illegal encoding";
3517 startinpos = (((const char *)q)-2)-starts;
3518 endinpos = startinpos+2;
3519 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003520
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 utf16Error:
3522 outpos = p - PyUnicode_AS_UNICODE(unicode);
3523 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003524 errors,
3525 &errorHandler,
3526 "utf16", errmsg,
3527 &starts,
3528 (const char **)&e,
3529 &startinpos,
3530 &endinpos,
3531 &exc,
3532 (const char **)&q,
3533 &unicode,
3534 &outpos,
3535 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003536 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003538 /* remaining byte at the end? (size should be even) */
3539 if (e == q) {
3540 if (!consumed) {
3541 errmsg = "truncated data";
3542 startinpos = ((const char *)q) - starts;
3543 endinpos = ((const char *)e) + 1 - starts;
3544 outpos = p - PyUnicode_AS_UNICODE(unicode);
3545 if (unicode_decode_call_errorhandler(
3546 errors,
3547 &errorHandler,
3548 "utf16", errmsg,
3549 &starts,
3550 (const char **)&e,
3551 &startinpos,
3552 &endinpos,
3553 &exc,
3554 (const char **)&q,
3555 &unicode,
3556 &outpos,
3557 &p))
3558 goto onError;
3559 /* The remaining input chars are ignored if the callback
3560 chooses to skip the input */
3561 }
3562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563
3564 if (byteorder)
3565 *byteorder = bo;
3566
Walter Dörwald69652032004-09-07 20:24:22 +00003567 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003568 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003569
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003571 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 goto onError;
3573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 Py_XDECREF(errorHandler);
3575 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576 return (PyObject *)unicode;
3577
Benjamin Peterson29060642009-01-31 22:14:21 +00003578 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 Py_XDECREF(errorHandler);
3581 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582 return NULL;
3583}
3584
Antoine Pitrouab868312009-01-10 15:40:25 +00003585#undef FAST_CHAR_MASK
3586#undef SWAPPED_FAST_CHAR_MASK
3587
Tim Peters772747b2001-08-09 22:21:55 +00003588PyObject *
3589PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 Py_ssize_t size,
3591 const char *errors,
3592 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003594 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003595 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003596 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003597#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003598 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003599#else
3600 const int pairs = 0;
3601#endif
Tim Peters772747b2001-08-09 22:21:55 +00003602 /* Offsets from p for storing byte pairs in the right order. */
3603#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3604 int ihi = 1, ilo = 0;
3605#else
3606 int ihi = 0, ilo = 1;
3607#endif
3608
Benjamin Peterson29060642009-01-31 22:14:21 +00003609#define STORECHAR(CH) \
3610 do { \
3611 p[ihi] = ((CH) >> 8) & 0xff; \
3612 p[ilo] = (CH) & 0xff; \
3613 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003614 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003616#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003617 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003618 if (s[i] >= 0x10000)
3619 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003620#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003621 /* 2 * (size + pairs + (byteorder == 0)) */
3622 if (size > PY_SSIZE_T_MAX ||
3623 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003624 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003625 nsize = size + pairs + (byteorder == 0);
3626 bytesize = nsize * 2;
3627 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003629 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 if (v == NULL)
3631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003633 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003636 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003637 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003638
3639 if (byteorder == -1) {
3640 /* force LE */
3641 ihi = 1;
3642 ilo = 0;
3643 }
3644 else if (byteorder == 1) {
3645 /* force BE */
3646 ihi = 0;
3647 ilo = 1;
3648 }
3649
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003650 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003651 Py_UNICODE ch = *s++;
3652 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003653#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003654 if (ch >= 0x10000) {
3655 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3656 ch = 0xD800 | ((ch-0x10000) >> 10);
3657 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003658#endif
Tim Peters772747b2001-08-09 22:21:55 +00003659 STORECHAR(ch);
3660 if (ch2)
3661 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003662 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003663
3664 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003665 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003666#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667}
3668
3669PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3670{
3671 if (!PyUnicode_Check(unicode)) {
3672 PyErr_BadArgument();
3673 return NULL;
3674 }
3675 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 PyUnicode_GET_SIZE(unicode),
3677 NULL,
3678 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679}
3680
3681/* --- Unicode Escape Codec ----------------------------------------------- */
3682
Fredrik Lundh06d12682001-01-24 07:59:11 +00003683static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003684
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 Py_ssize_t size,
3687 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003690 Py_ssize_t startinpos;
3691 Py_ssize_t endinpos;
3692 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003693 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003697 char* message;
3698 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 PyObject *errorHandler = NULL;
3700 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003701
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 /* Escaped strings will always be longer than the resulting
3703 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003704 length after conversion to the true value.
3705 (but if the error callback returns a long replacement string
3706 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 v = _PyUnicode_New(size);
3708 if (v == NULL)
3709 goto onError;
3710 if (size == 0)
3711 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003715
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 while (s < end) {
3717 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003718 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720
3721 /* Non-escape characters are interpreted as Unicode ordinals */
3722 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003723 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 continue;
3725 }
3726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 /* \ - Escapes */
3729 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003730 c = *s++;
3731 if (s > end)
3732 c = '\0'; /* Invalid after \ */
3733 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734
Benjamin Peterson29060642009-01-31 22:14:21 +00003735 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736 case '\n': break;
3737 case '\\': *p++ = '\\'; break;
3738 case '\'': *p++ = '\''; break;
3739 case '\"': *p++ = '\"'; break;
3740 case 'b': *p++ = '\b'; break;
3741 case 'f': *p++ = '\014'; break; /* FF */
3742 case 't': *p++ = '\t'; break;
3743 case 'n': *p++ = '\n'; break;
3744 case 'r': *p++ = '\r'; break;
3745 case 'v': *p++ = '\013'; break; /* VT */
3746 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3747
Benjamin Peterson29060642009-01-31 22:14:21 +00003748 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749 case '0': case '1': case '2': case '3':
3750 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003751 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003752 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003753 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003754 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003755 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003757 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 break;
3759
Benjamin Peterson29060642009-01-31 22:14:21 +00003760 /* hex escapes */
3761 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003763 digits = 2;
3764 message = "truncated \\xXX escape";
3765 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766
Benjamin Peterson29060642009-01-31 22:14:21 +00003767 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003769 digits = 4;
3770 message = "truncated \\uXXXX escape";
3771 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772
Benjamin Peterson29060642009-01-31 22:14:21 +00003773 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003774 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003775 digits = 8;
3776 message = "truncated \\UXXXXXXXX escape";
3777 hexescape:
3778 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 outpos = p-PyUnicode_AS_UNICODE(v);
3780 if (s+digits>end) {
3781 endinpos = size;
3782 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003783 errors, &errorHandler,
3784 "unicodeescape", "end of string in escape sequence",
3785 &starts, &end, &startinpos, &endinpos, &exc, &s,
3786 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 goto onError;
3788 goto nextByte;
3789 }
3790 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003791 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003792 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 endinpos = (s+i+1)-starts;
3794 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 errors, &errorHandler,
3796 "unicodeescape", message,
3797 &starts, &end, &startinpos, &endinpos, &exc, &s,
3798 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003799 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003800 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003801 }
3802 chr = (chr<<4) & ~0xF;
3803 if (c >= '0' && c <= '9')
3804 chr += c - '0';
3805 else if (c >= 'a' && c <= 'f')
3806 chr += 10 + c - 'a';
3807 else
3808 chr += 10 + c - 'A';
3809 }
3810 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003811 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 /* _decoding_error will have already written into the
3813 target buffer. */
3814 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003815 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003816 /* when we get here, chr is a 32-bit unicode character */
3817 if (chr <= 0xffff)
3818 /* UCS-2 character */
3819 *p++ = (Py_UNICODE) chr;
3820 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003821 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003822 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003823#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003824 *p++ = chr;
3825#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003826 chr -= 0x10000L;
3827 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003828 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003829#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003830 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831 endinpos = s-starts;
3832 outpos = p-PyUnicode_AS_UNICODE(v);
3833 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003834 errors, &errorHandler,
3835 "unicodeescape", "illegal Unicode character",
3836 &starts, &end, &startinpos, &endinpos, &exc, &s,
3837 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003838 goto onError;
3839 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003840 break;
3841
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003843 case 'N':
3844 message = "malformed \\N character escape";
3845 if (ucnhash_CAPI == NULL) {
3846 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003847 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003848 if (ucnhash_CAPI == NULL)
3849 goto ucnhashError;
3850 }
3851 if (*s == '{') {
3852 const char *start = s+1;
3853 /* look for the closing brace */
3854 while (*s != '}' && s < end)
3855 s++;
3856 if (s > start && s < end && *s == '}') {
3857 /* found a name. look it up in the unicode database */
3858 message = "unknown Unicode character name";
3859 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003860 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003861 goto store;
3862 }
3863 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 endinpos = s-starts;
3865 outpos = p-PyUnicode_AS_UNICODE(v);
3866 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 errors, &errorHandler,
3868 "unicodeescape", message,
3869 &starts, &end, &startinpos, &endinpos, &exc, &s,
3870 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003871 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003872 break;
3873
3874 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003875 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 message = "\\ at end of string";
3877 s--;
3878 endinpos = s-starts;
3879 outpos = p-PyUnicode_AS_UNICODE(v);
3880 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003881 errors, &errorHandler,
3882 "unicodeescape", message,
3883 &starts, &end, &startinpos, &endinpos, &exc, &s,
3884 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003885 goto onError;
3886 }
3887 else {
3888 *p++ = '\\';
3889 *p++ = (unsigned char)s[-1];
3890 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003891 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003894 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003896 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003898 Py_XDECREF(errorHandler);
3899 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003901
Benjamin Peterson29060642009-01-31 22:14:21 +00003902 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003903 PyErr_SetString(
3904 PyExc_UnicodeError,
3905 "\\N escapes not supported (can't load unicodedata module)"
3906 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003907 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003908 Py_XDECREF(errorHandler);
3909 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003910 return NULL;
3911
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 Py_XDECREF(errorHandler);
3915 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 return NULL;
3917}
3918
/* Return a Unicode-Escape encoded version of the Unicode data.

   The result is never enclosed in quotes: the encoder below takes no
   "quotes" argument.

*/
3925
Thomas Wouters477c8d52006-05-27 19:21:47 +00003926Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003927 Py_ssize_t size,
3928 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003929{
3930 /* like wcschr, but doesn't stop at NULL characters */
3931
3932 while (size-- > 0) {
3933 if (*s == ch)
3934 return s;
3935 s++;
3936 }
3937
3938 return NULL;
3939}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003940
/* Lower-case hexadecimal digits, indexed by nibble value (0..15). */
static const char *hexdigits = "0123456789abcdef";
3942
3943PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003944 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003946 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003949#ifdef Py_UNICODE_WIDE
3950 const Py_ssize_t expandsize = 10;
3951#else
3952 const Py_ssize_t expandsize = 6;
3953#endif
3954
Thomas Wouters89f507f2006-12-13 04:49:30 +00003955 /* XXX(nnorwitz): rather than over-allocating, it would be
3956 better to choose a different scheme. Perhaps scan the
3957 first N-chars of the string and allocate based on that size.
3958 */
3959 /* Initial allocation is based on the longest-possible unichr
3960 escape.
3961
3962 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3963 unichr, so in this case it's the longest unichr escape. In
3964 narrow (UTF-16) builds this is five chars per source unichr
3965 since there are two unichrs in the surrogate pair, so in narrow
3966 (UTF-16) builds it's not the longest unichr escape.
3967
3968 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3969 so in the narrow (UTF-16) build case it's the longest unichr
3970 escape.
3971 */
3972
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003973 if (size == 0)
3974 return PyBytes_FromStringAndSize(NULL, 0);
3975
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003976 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003978
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003979 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 2
3981 + expandsize*size
3982 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 if (repr == NULL)
3984 return NULL;
3985
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003986 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 while (size-- > 0) {
3989 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003990
Walter Dörwald79e913e2007-05-12 11:08:06 +00003991 /* Escape backslashes */
3992 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 *p++ = '\\';
3994 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003995 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003996 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003997
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003998#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003999 /* Map 21-bit characters to '\U00xxxxxx' */
4000 else if (ch >= 0x10000) {
4001 *p++ = '\\';
4002 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004003 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4004 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4005 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4006 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4007 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4008 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4009 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4010 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004011 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004012 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004013#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004014 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4015 else if (ch >= 0xD800 && ch < 0xDC00) {
4016 Py_UNICODE ch2;
4017 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004018
Benjamin Peterson29060642009-01-31 22:14:21 +00004019 ch2 = *s++;
4020 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004021 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004022 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4023 *p++ = '\\';
4024 *p++ = 'U';
4025 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4026 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4027 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4028 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4029 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4030 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4031 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4032 *p++ = hexdigits[ucs & 0x0000000F];
4033 continue;
4034 }
4035 /* Fall through: isolated surrogates are copied as-is */
4036 s--;
4037 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004038 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004039#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004040
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004042 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043 *p++ = '\\';
4044 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004045 *p++ = hexdigits[(ch >> 12) & 0x000F];
4046 *p++ = hexdigits[(ch >> 8) & 0x000F];
4047 *p++ = hexdigits[(ch >> 4) & 0x000F];
4048 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004050
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004051 /* Map special whitespace to '\t', \n', '\r' */
4052 else if (ch == '\t') {
4053 *p++ = '\\';
4054 *p++ = 't';
4055 }
4056 else if (ch == '\n') {
4057 *p++ = '\\';
4058 *p++ = 'n';
4059 }
4060 else if (ch == '\r') {
4061 *p++ = '\\';
4062 *p++ = 'r';
4063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004064
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004065 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004066 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004068 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004069 *p++ = hexdigits[(ch >> 4) & 0x000F];
4070 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004071 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004072
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 /* Copy everything else as-is */
4074 else
4075 *p++ = (char) ch;
4076 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004078 assert(p - PyBytes_AS_STRING(repr) > 0);
4079 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4080 return NULL;
4081 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082}
4083
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004084PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004086 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 if (!PyUnicode_Check(unicode)) {
4088 PyErr_BadArgument();
4089 return NULL;
4090 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004091 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4092 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004093 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094}
4095
4096/* --- Raw Unicode Escape Codec ------------------------------------------- */
4097
4098PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004099 Py_ssize_t size,
4100 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004103 Py_ssize_t startinpos;
4104 Py_ssize_t endinpos;
4105 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 const char *end;
4109 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 PyObject *errorHandler = NULL;
4111 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004112
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 /* Escaped strings will always be longer than the resulting
4114 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 length after conversion to the true value. (But decoding error
4116 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 v = _PyUnicode_New(size);
4118 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 end = s + size;
4124 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 unsigned char c;
4126 Py_UCS4 x;
4127 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004128 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 /* Non-escape characters are interpreted as Unicode ordinals */
4131 if (*s != '\\') {
4132 *p++ = (unsigned char)*s++;
4133 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 startinpos = s-starts;
4136
4137 /* \u-escapes are only interpreted iff the number of leading
4138 backslashes if odd */
4139 bs = s;
4140 for (;s < end;) {
4141 if (*s != '\\')
4142 break;
4143 *p++ = (unsigned char)*s++;
4144 }
4145 if (((s - bs) & 1) == 0 ||
4146 s >= end ||
4147 (*s != 'u' && *s != 'U')) {
4148 continue;
4149 }
4150 p--;
4151 count = *s=='u' ? 4 : 8;
4152 s++;
4153
4154 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4155 outpos = p-PyUnicode_AS_UNICODE(v);
4156 for (x = 0, i = 0; i < count; ++i, ++s) {
4157 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004158 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004159 endinpos = s-starts;
4160 if (unicode_decode_call_errorhandler(
4161 errors, &errorHandler,
4162 "rawunicodeescape", "truncated \\uXXXX",
4163 &starts, &end, &startinpos, &endinpos, &exc, &s,
4164 &v, &outpos, &p))
4165 goto onError;
4166 goto nextByte;
4167 }
4168 x = (x<<4) & ~0xF;
4169 if (c >= '0' && c <= '9')
4170 x += c - '0';
4171 else if (c >= 'a' && c <= 'f')
4172 x += 10 + c - 'a';
4173 else
4174 x += 10 + c - 'A';
4175 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004176 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004177 /* UCS-2 character */
4178 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004179 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 /* UCS-4 character. Either store directly, or as
4181 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004182#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004184#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 x -= 0x10000L;
4186 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4187 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004188#endif
4189 } else {
4190 endinpos = s-starts;
4191 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004192 if (unicode_decode_call_errorhandler(
4193 errors, &errorHandler,
4194 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 &starts, &end, &startinpos, &endinpos, &exc, &s,
4196 &v, &outpos, &p))
4197 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 nextByte:
4200 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004202 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 Py_XDECREF(errorHandler);
4205 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004207
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210 Py_XDECREF(errorHandler);
4211 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 return NULL;
4213}
4214
4215PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004218 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219 char *p;
4220 char *q;
4221
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004222#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004223 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004224#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004225 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004226#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004227
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004228 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004230
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004231 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232 if (repr == NULL)
4233 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004234 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004235 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004237 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 while (size-- > 0) {
4239 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004240#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004241 /* Map 32-bit characters to '\Uxxxxxxxx' */
4242 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004243 *p++ = '\\';
4244 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004245 *p++ = hexdigits[(ch >> 28) & 0xf];
4246 *p++ = hexdigits[(ch >> 24) & 0xf];
4247 *p++ = hexdigits[(ch >> 20) & 0xf];
4248 *p++ = hexdigits[(ch >> 16) & 0xf];
4249 *p++ = hexdigits[(ch >> 12) & 0xf];
4250 *p++ = hexdigits[(ch >> 8) & 0xf];
4251 *p++ = hexdigits[(ch >> 4) & 0xf];
4252 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004253 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004254 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004255#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4257 if (ch >= 0xD800 && ch < 0xDC00) {
4258 Py_UNICODE ch2;
4259 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004260
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 ch2 = *s++;
4262 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004263 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004264 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4265 *p++ = '\\';
4266 *p++ = 'U';
4267 *p++ = hexdigits[(ucs >> 28) & 0xf];
4268 *p++ = hexdigits[(ucs >> 24) & 0xf];
4269 *p++ = hexdigits[(ucs >> 20) & 0xf];
4270 *p++ = hexdigits[(ucs >> 16) & 0xf];
4271 *p++ = hexdigits[(ucs >> 12) & 0xf];
4272 *p++ = hexdigits[(ucs >> 8) & 0xf];
4273 *p++ = hexdigits[(ucs >> 4) & 0xf];
4274 *p++ = hexdigits[ucs & 0xf];
4275 continue;
4276 }
4277 /* Fall through: isolated surrogates are copied as-is */
4278 s--;
4279 size++;
4280 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004281#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 /* Map 16-bit characters to '\uxxxx' */
4283 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 *p++ = '\\';
4285 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004286 *p++ = hexdigits[(ch >> 12) & 0xf];
4287 *p++ = hexdigits[(ch >> 8) & 0xf];
4288 *p++ = hexdigits[(ch >> 4) & 0xf];
4289 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 /* Copy everything else as-is */
4292 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 *p++ = (char) ch;
4294 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004295 size = p - q;
4296
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004297 assert(size > 0);
4298 if (_PyBytes_Resize(&repr, size) < 0)
4299 return NULL;
4300 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301}
4302
4303PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4304{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004305 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004307 PyErr_BadArgument();
4308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004310 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4311 PyUnicode_GET_SIZE(unicode));
4312
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004313 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314}
4315
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004316/* --- Unicode Internal Codec ------------------------------------------- */
4317
4318PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004319 Py_ssize_t size,
4320 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004321{
4322 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004323 Py_ssize_t startinpos;
4324 Py_ssize_t endinpos;
4325 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004326 PyUnicodeObject *v;
4327 Py_UNICODE *p;
4328 const char *end;
4329 const char *reason;
4330 PyObject *errorHandler = NULL;
4331 PyObject *exc = NULL;
4332
Neal Norwitzd43069c2006-01-08 01:12:10 +00004333#ifdef Py_UNICODE_WIDE
4334 Py_UNICODE unimax = PyUnicode_GetMax();
4335#endif
4336
Thomas Wouters89f507f2006-12-13 04:49:30 +00004337 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004338 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4339 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004341 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004343 p = PyUnicode_AS_UNICODE(v);
4344 end = s + size;
4345
4346 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004347 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004348 /* We have to sanity check the raw data, otherwise doom looms for
4349 some malformed UCS-4 data. */
4350 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004351#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004352 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004353#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004354 end-s < Py_UNICODE_SIZE
4355 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004357 startinpos = s - starts;
4358 if (end-s < Py_UNICODE_SIZE) {
4359 endinpos = end-starts;
4360 reason = "truncated input";
4361 }
4362 else {
4363 endinpos = s - starts + Py_UNICODE_SIZE;
4364 reason = "illegal code point (> 0x10FFFF)";
4365 }
4366 outpos = p - PyUnicode_AS_UNICODE(v);
4367 if (unicode_decode_call_errorhandler(
4368 errors, &errorHandler,
4369 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004370 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004371 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004372 goto onError;
4373 }
4374 }
4375 else {
4376 p++;
4377 s += Py_UNICODE_SIZE;
4378 }
4379 }
4380
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004381 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004382 goto onError;
4383 Py_XDECREF(errorHandler);
4384 Py_XDECREF(exc);
4385 return (PyObject *)v;
4386
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004388 Py_XDECREF(v);
4389 Py_XDECREF(errorHandler);
4390 Py_XDECREF(exc);
4391 return NULL;
4392}
4393
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394/* --- Latin-1 Codec ------------------------------------------------------ */
4395
4396PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 Py_ssize_t size,
4398 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399{
4400 PyUnicodeObject *v;
4401 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004402 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004403
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004405 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 Py_UNICODE r = *(unsigned char*)s;
4407 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004408 }
4409
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 v = _PyUnicode_New(size);
4411 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004416 e = s + size;
4417 /* Unrolling the copy makes it much faster by reducing the looping
4418 overhead. This is similar to what many memcpy() implementations do. */
4419 unrolled_end = e - 4;
4420 while (s < unrolled_end) {
4421 p[0] = (unsigned char) s[0];
4422 p[1] = (unsigned char) s[1];
4423 p[2] = (unsigned char) s[2];
4424 p[3] = (unsigned char) s[3];
4425 s += 4;
4426 p += 4;
4427 }
4428 while (s < e)
4429 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004431
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004433 Py_XDECREF(v);
4434 return NULL;
4435}
4436
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437/* create or adjust a UnicodeEncodeError */
4438static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 const char *encoding,
4440 const Py_UNICODE *unicode, Py_ssize_t size,
4441 Py_ssize_t startpos, Py_ssize_t endpos,
4442 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 *exceptionObject = PyUnicodeEncodeError_Create(
4446 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 }
4448 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4450 goto onError;
4451 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4452 goto onError;
4453 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4454 goto onError;
4455 return;
4456 onError:
4457 Py_DECREF(*exceptionObject);
4458 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 }
4460}
4461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004462/* raises a UnicodeEncodeError */
4463static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 const char *encoding,
4465 const Py_UNICODE *unicode, Py_ssize_t size,
4466 Py_ssize_t startpos, Py_ssize_t endpos,
4467 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468{
4469 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004470 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473}
4474
4475/* error handling callback helper:
4476 build arguments, call the callback and check the arguments,
4477 put the result into newpos and return the replacement string, which
4478 has to be freed by the caller */
4479static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 PyObject **errorHandler,
4481 const char *encoding, const char *reason,
4482 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4483 Py_ssize_t startpos, Py_ssize_t endpos,
4484 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004486 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487
4488 PyObject *restuple;
4489 PyObject *resunicode;
4490
4491 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 }
4496
4497 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501
4502 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004505 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004507 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 Py_DECREF(restuple);
4509 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004511 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 &resunicode, newpos)) {
4513 Py_DECREF(restuple);
4514 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004516 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4517 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4518 Py_DECREF(restuple);
4519 return NULL;
4520 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004523 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4525 Py_DECREF(restuple);
4526 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004527 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 Py_INCREF(resunicode);
4529 Py_DECREF(restuple);
4530 return resunicode;
4531}
4532
4533static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 Py_ssize_t size,
4535 const char *errors,
4536 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537{
4538 /* output object */
4539 PyObject *res;
4540 /* pointers to the beginning and end+1 of input */
4541 const Py_UNICODE *startp = p;
4542 const Py_UNICODE *endp = p + size;
4543 /* pointer to the beginning of the unencodable characters */
4544 /* const Py_UNICODE *badp = NULL; */
4545 /* pointer into the output */
4546 char *str;
4547 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004548 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004549 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4550 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 PyObject *errorHandler = NULL;
4552 PyObject *exc = NULL;
4553 /* the following variable is used for caching string comparisons
4554 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4555 int known_errorHandler = -1;
4556
4557 /* allocate enough for a simple encoding without
4558 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004559 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004560 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004561 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004563 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004564 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 ressize = size;
4566
4567 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004568 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 /* can we encode this? */
4571 if (c<limit) {
4572 /* no overflow check, because we know that the space is enough */
4573 *str++ = (char)c;
4574 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 else {
4577 Py_ssize_t unicodepos = p-startp;
4578 Py_ssize_t requiredsize;
4579 PyObject *repunicode;
4580 Py_ssize_t repsize;
4581 Py_ssize_t newpos;
4582 Py_ssize_t respos;
4583 Py_UNICODE *uni2;
4584 /* startpos for collecting unencodable chars */
4585 const Py_UNICODE *collstart = p;
4586 const Py_UNICODE *collend = p;
4587 /* find all unecodable characters */
4588 while ((collend < endp) && ((*collend)>=limit))
4589 ++collend;
4590 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4591 if (known_errorHandler==-1) {
4592 if ((errors==NULL) || (!strcmp(errors, "strict")))
4593 known_errorHandler = 1;
4594 else if (!strcmp(errors, "replace"))
4595 known_errorHandler = 2;
4596 else if (!strcmp(errors, "ignore"))
4597 known_errorHandler = 3;
4598 else if (!strcmp(errors, "xmlcharrefreplace"))
4599 known_errorHandler = 4;
4600 else
4601 known_errorHandler = 0;
4602 }
4603 switch (known_errorHandler) {
4604 case 1: /* strict */
4605 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4606 goto onError;
4607 case 2: /* replace */
4608 while (collstart++<collend)
4609 *str++ = '?'; /* fall through */
4610 case 3: /* ignore */
4611 p = collend;
4612 break;
4613 case 4: /* xmlcharrefreplace */
4614 respos = str - PyBytes_AS_STRING(res);
4615 /* determine replacement size (temporarily (mis)uses p) */
4616 for (p = collstart, repsize = 0; p < collend; ++p) {
4617 if (*p<10)
4618 repsize += 2+1+1;
4619 else if (*p<100)
4620 repsize += 2+2+1;
4621 else if (*p<1000)
4622 repsize += 2+3+1;
4623 else if (*p<10000)
4624 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004625#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004626 else
4627 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004628#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 else if (*p<100000)
4630 repsize += 2+5+1;
4631 else if (*p<1000000)
4632 repsize += 2+6+1;
4633 else
4634 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004635#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 }
4637 requiredsize = respos+repsize+(endp-collend);
4638 if (requiredsize > ressize) {
4639 if (requiredsize<2*ressize)
4640 requiredsize = 2*ressize;
4641 if (_PyBytes_Resize(&res, requiredsize))
4642 goto onError;
4643 str = PyBytes_AS_STRING(res) + respos;
4644 ressize = requiredsize;
4645 }
4646 /* generate replacement (temporarily (mis)uses p) */
4647 for (p = collstart; p < collend; ++p) {
4648 str += sprintf(str, "&#%d;", (int)*p);
4649 }
4650 p = collend;
4651 break;
4652 default:
4653 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4654 encoding, reason, startp, size, &exc,
4655 collstart-startp, collend-startp, &newpos);
4656 if (repunicode == NULL)
4657 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004658 if (PyBytes_Check(repunicode)) {
4659 /* Directly copy bytes result to output. */
4660 repsize = PyBytes_Size(repunicode);
4661 if (repsize > 1) {
4662 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004663 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004664 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4665 Py_DECREF(repunicode);
4666 goto onError;
4667 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004668 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004669 ressize += repsize-1;
4670 }
4671 memcpy(str, PyBytes_AsString(repunicode), repsize);
4672 str += repsize;
4673 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004674 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004675 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004676 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 /* need more space? (at least enough for what we
4678 have+the replacement+the rest of the string, so
4679 we won't have to check space for encodable characters) */
4680 respos = str - PyBytes_AS_STRING(res);
4681 repsize = PyUnicode_GET_SIZE(repunicode);
4682 requiredsize = respos+repsize+(endp-collend);
4683 if (requiredsize > ressize) {
4684 if (requiredsize<2*ressize)
4685 requiredsize = 2*ressize;
4686 if (_PyBytes_Resize(&res, requiredsize)) {
4687 Py_DECREF(repunicode);
4688 goto onError;
4689 }
4690 str = PyBytes_AS_STRING(res) + respos;
4691 ressize = requiredsize;
4692 }
4693 /* check if there is anything unencodable in the replacement
4694 and copy it to the output */
4695 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4696 c = *uni2;
4697 if (c >= limit) {
4698 raise_encode_exception(&exc, encoding, startp, size,
4699 unicodepos, unicodepos+1, reason);
4700 Py_DECREF(repunicode);
4701 goto onError;
4702 }
4703 *str = (char)c;
4704 }
4705 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004706 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004708 }
4709 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004710 /* Resize if we allocated to much */
4711 size = str - PyBytes_AS_STRING(res);
4712 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004713 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004714 if (_PyBytes_Resize(&res, size) < 0)
4715 goto onError;
4716 }
4717
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 Py_XDECREF(errorHandler);
4719 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004720 return res;
4721
4722 onError:
4723 Py_XDECREF(res);
4724 Py_XDECREF(errorHandler);
4725 Py_XDECREF(exc);
4726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727}
4728
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 Py_ssize_t size,
4731 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734}
4735
4736PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4737{
4738 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 PyErr_BadArgument();
4740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 }
4742 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 PyUnicode_GET_SIZE(unicode),
4744 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745}
4746
4747/* --- 7-bit ASCII Codec -------------------------------------------------- */
4748
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 Py_ssize_t size,
4751 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004753 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 PyUnicodeObject *v;
4755 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004756 Py_ssize_t startinpos;
4757 Py_ssize_t endinpos;
4758 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004759 const char *e;
4760 PyObject *errorHandler = NULL;
4761 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004762
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004764 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 Py_UNICODE r = *(unsigned char*)s;
4766 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004767 }
Tim Petersced69f82003-09-16 20:30:58 +00004768
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 v = _PyUnicode_New(size);
4770 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004773 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775 e = s + size;
4776 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 register unsigned char c = (unsigned char)*s;
4778 if (c < 128) {
4779 *p++ = c;
4780 ++s;
4781 }
4782 else {
4783 startinpos = s-starts;
4784 endinpos = startinpos + 1;
4785 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4786 if (unicode_decode_call_errorhandler(
4787 errors, &errorHandler,
4788 "ascii", "ordinal not in range(128)",
4789 &starts, &e, &startinpos, &endinpos, &exc, &s,
4790 &v, &outpos, &p))
4791 goto onError;
4792 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004794 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004795 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4796 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797 Py_XDECREF(errorHandler);
4798 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004800
Benjamin Peterson29060642009-01-31 22:14:21 +00004801 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 Py_XDECREF(errorHandler);
4804 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805 return NULL;
4806}
4807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 Py_ssize_t size,
4810 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813}
4814
4815PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4816{
4817 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 PyErr_BadArgument();
4819 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 }
4821 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 PyUnicode_GET_SIZE(unicode),
4823 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824}
4825
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004826#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004828/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004829
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004830#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004831#define NEED_RETRY
4832#endif
4833
4834/* XXX This code is limited to "true" double-byte encodings, as
4835 a) it assumes an incomplete character consists of a single byte, and
4836 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004837 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004838
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it (it returns the same pointer), when
   the byte CharPrev() lands on is not itself a lead byte, or when
   CharPrev() stepped back exactly two bytes. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4849
4850/*
4851 * Decode MBCS string into unicode object. If 'final' is set, converts
4852 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4853 */
4854static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 const char *s, /* MBCS string */
4856 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004857 int final,
4858 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004859{
4860 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004861 Py_ssize_t n;
4862 DWORD usize;
4863 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004864
4865 assert(size >= 0);
4866
Victor Stinner554f3f02010-06-16 23:33:54 +00004867 /* check and handle 'errors' arg */
4868 if (errors==NULL || strcmp(errors, "strict")==0)
4869 flags = MB_ERR_INVALID_CHARS;
4870 else if (strcmp(errors, "ignore")==0)
4871 flags = 0;
4872 else {
4873 PyErr_Format(PyExc_ValueError,
4874 "mbcs encoding does not support errors='%s'",
4875 errors);
4876 return -1;
4877 }
4878
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004879 /* Skip trailing lead-byte unless 'final' is set */
4880 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004882
4883 /* First get the size of the result */
4884 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004885 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4886 if (usize==0)
4887 goto mbcs_decode_error;
4888 } else
4889 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004890
4891 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 /* Create unicode object */
4893 *v = _PyUnicode_New(usize);
4894 if (*v == NULL)
4895 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004896 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004897 }
4898 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 /* Extend unicode object */
4900 n = PyUnicode_GET_SIZE(*v);
4901 if (_PyUnicode_Resize(v, n + usize) < 0)
4902 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004903 }
4904
4905 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004906 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004908 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4909 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004911 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004912 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004913
4914mbcs_decode_error:
4915 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4916 we raise a UnicodeDecodeError - else it is a 'generic'
4917 windows error
4918 */
4919 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4920 /* Ideally, we should get reason from FormatMessage - this
4921 is the Windows 2000 English version of the message
4922 */
4923 PyObject *exc = NULL;
4924 const char *reason = "No mapping for the Unicode character exists "
4925 "in the target multi-byte code page.";
4926 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4927 if (exc != NULL) {
4928 PyCodec_StrictErrors(exc);
4929 Py_DECREF(exc);
4930 }
4931 } else {
4932 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4933 }
4934 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004935}
4936
4937PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004938 Py_ssize_t size,
4939 const char *errors,
4940 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004941{
4942 PyUnicodeObject *v = NULL;
4943 int done;
4944
4945 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004947
4948#ifdef NEED_RETRY
4949 retry:
4950 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004951 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004952 else
4953#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004954 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004955
4956 if (done < 0) {
4957 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004959 }
4960
4961 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004963
4964#ifdef NEED_RETRY
4965 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 s += done;
4967 size -= done;
4968 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004969 }
4970#endif
4971
4972 return (PyObject *)v;
4973}
4974
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004975PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 Py_ssize_t size,
4977 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004978{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004979 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4980}
4981
4982/*
4983 * Convert unicode into string object (MBCS).
4984 * Returns 0 if succeed, -1 otherwise.
4985 */
4986static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004988 int size, /* size of unicode */
4989 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004990{
Victor Stinner554f3f02010-06-16 23:33:54 +00004991 BOOL usedDefaultChar = FALSE;
4992 BOOL *pusedDefaultChar;
4993 int mbcssize;
4994 Py_ssize_t n;
4995 PyObject *exc = NULL;
4996 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004997
4998 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004999
Victor Stinner554f3f02010-06-16 23:33:54 +00005000 /* check and handle 'errors' arg */
5001 if (errors==NULL || strcmp(errors, "strict")==0) {
5002 flags = WC_NO_BEST_FIT_CHARS;
5003 pusedDefaultChar = &usedDefaultChar;
5004 } else if (strcmp(errors, "replace")==0) {
5005 flags = 0;
5006 pusedDefaultChar = NULL;
5007 } else {
5008 PyErr_Format(PyExc_ValueError,
5009 "mbcs encoding does not support errors='%s'",
5010 errors);
5011 return -1;
5012 }
5013
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005014 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005015 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005016 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5017 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 if (mbcssize == 0) {
5019 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5020 return -1;
5021 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005022 /* If we used a default char, then we failed! */
5023 if (pusedDefaultChar && *pusedDefaultChar)
5024 goto mbcs_encode_error;
5025 } else {
5026 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005027 }
5028
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005029 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 /* Create string object */
5031 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5032 if (*repr == NULL)
5033 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005034 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005035 }
5036 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 /* Extend string object */
5038 n = PyBytes_Size(*repr);
5039 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5040 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005041 }
5042
5043 /* Do the conversion */
5044 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005046 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5047 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5049 return -1;
5050 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005051 if (pusedDefaultChar && *pusedDefaultChar)
5052 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005053 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005054 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005055
5056mbcs_encode_error:
5057 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5058 Py_XDECREF(exc);
5059 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005060}
5061
5062PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 Py_ssize_t size,
5064 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005065{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005066 PyObject *repr = NULL;
5067 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005068
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005069#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005071 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005072 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005073 else
5074#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005075 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005076
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005077 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 Py_XDECREF(repr);
5079 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005080 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005081
5082#ifdef NEED_RETRY
5083 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 p += INT_MAX;
5085 size -= INT_MAX;
5086 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005087 }
5088#endif
5089
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005090 return repr;
5091}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005092
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005093PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5094{
5095 if (!PyUnicode_Check(unicode)) {
5096 PyErr_BadArgument();
5097 return NULL;
5098 }
5099 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 PyUnicode_GET_SIZE(unicode),
5101 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005102}
5103
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005104#undef NEED_RETRY
5105
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005106#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005107
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108/* --- Character Mapping Codec -------------------------------------------- */
5109
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 Py_ssize_t size,
5112 PyObject *mapping,
5113 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005115 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005116 Py_ssize_t startinpos;
5117 Py_ssize_t endinpos;
5118 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005119 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 PyUnicodeObject *v;
5121 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005122 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005123 PyObject *errorHandler = NULL;
5124 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005125 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005126 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005127
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128 /* Default to Latin-1 */
5129 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131
5132 v = _PyUnicode_New(size);
5133 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005138 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005139 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 mapstring = PyUnicode_AS_UNICODE(mapping);
5141 maplen = PyUnicode_GET_SIZE(mapping);
5142 while (s < e) {
5143 unsigned char ch = *s;
5144 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 if (ch < maplen)
5147 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 if (x == 0xfffe) {
5150 /* undefined mapping */
5151 outpos = p-PyUnicode_AS_UNICODE(v);
5152 startinpos = s-starts;
5153 endinpos = startinpos+1;
5154 if (unicode_decode_call_errorhandler(
5155 errors, &errorHandler,
5156 "charmap", "character maps to <undefined>",
5157 &starts, &e, &startinpos, &endinpos, &exc, &s,
5158 &v, &outpos, &p)) {
5159 goto onError;
5160 }
5161 continue;
5162 }
5163 *p++ = x;
5164 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005165 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005166 }
5167 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 while (s < e) {
5169 unsigned char ch = *s;
5170 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005171
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5173 w = PyLong_FromLong((long)ch);
5174 if (w == NULL)
5175 goto onError;
5176 x = PyObject_GetItem(mapping, w);
5177 Py_DECREF(w);
5178 if (x == NULL) {
5179 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5180 /* No mapping found means: mapping is undefined. */
5181 PyErr_Clear();
5182 x = Py_None;
5183 Py_INCREF(x);
5184 } else
5185 goto onError;
5186 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005187
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 /* Apply mapping */
5189 if (PyLong_Check(x)) {
5190 long value = PyLong_AS_LONG(x);
5191 if (value < 0 || value > 65535) {
5192 PyErr_SetString(PyExc_TypeError,
5193 "character mapping must be in range(65536)");
5194 Py_DECREF(x);
5195 goto onError;
5196 }
5197 *p++ = (Py_UNICODE)value;
5198 }
5199 else if (x == Py_None) {
5200 /* undefined mapping */
5201 outpos = p-PyUnicode_AS_UNICODE(v);
5202 startinpos = s-starts;
5203 endinpos = startinpos+1;
5204 if (unicode_decode_call_errorhandler(
5205 errors, &errorHandler,
5206 "charmap", "character maps to <undefined>",
5207 &starts, &e, &startinpos, &endinpos, &exc, &s,
5208 &v, &outpos, &p)) {
5209 Py_DECREF(x);
5210 goto onError;
5211 }
5212 Py_DECREF(x);
5213 continue;
5214 }
5215 else if (PyUnicode_Check(x)) {
5216 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005217
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 if (targetsize == 1)
5219 /* 1-1 mapping */
5220 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005221
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 else if (targetsize > 1) {
5223 /* 1-n mapping */
5224 if (targetsize > extrachars) {
5225 /* resize first */
5226 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5227 Py_ssize_t needed = (targetsize - extrachars) + \
5228 (targetsize << 2);
5229 extrachars += needed;
5230 /* XXX overflow detection missing */
5231 if (_PyUnicode_Resize(&v,
5232 PyUnicode_GET_SIZE(v) + needed) < 0) {
5233 Py_DECREF(x);
5234 goto onError;
5235 }
5236 p = PyUnicode_AS_UNICODE(v) + oldpos;
5237 }
5238 Py_UNICODE_COPY(p,
5239 PyUnicode_AS_UNICODE(x),
5240 targetsize);
5241 p += targetsize;
5242 extrachars -= targetsize;
5243 }
5244 /* 1-0 mapping: skip the character */
5245 }
5246 else {
5247 /* wrong return value */
5248 PyErr_SetString(PyExc_TypeError,
5249 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005250 Py_DECREF(x);
5251 goto onError;
5252 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 Py_DECREF(x);
5254 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256 }
5257 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5259 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005260 Py_XDECREF(errorHandler);
5261 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005263
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005265 Py_XDECREF(errorHandler);
5266 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 Py_XDECREF(v);
5268 return NULL;
5269}
5270
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005271/* Charmap encoding: the lookup table */
5272
5273struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 PyObject_HEAD
5275 unsigned char level1[32];
5276 int count2, count3;
5277 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005278};
5279
5280static PyObject*
5281encoding_map_size(PyObject *obj, PyObject* args)
5282{
5283 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005284 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005286}
5287
5288static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005289 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 PyDoc_STR("Return the size (in bytes) of this object") },
5291 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005292};
5293
5294static void
5295encoding_map_dealloc(PyObject* o)
5296{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005297 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005298}
5299
5300static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005301 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 "EncodingMap", /*tp_name*/
5303 sizeof(struct encoding_map), /*tp_basicsize*/
5304 0, /*tp_itemsize*/
5305 /* methods */
5306 encoding_map_dealloc, /*tp_dealloc*/
5307 0, /*tp_print*/
5308 0, /*tp_getattr*/
5309 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005310 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 0, /*tp_repr*/
5312 0, /*tp_as_number*/
5313 0, /*tp_as_sequence*/
5314 0, /*tp_as_mapping*/
5315 0, /*tp_hash*/
5316 0, /*tp_call*/
5317 0, /*tp_str*/
5318 0, /*tp_getattro*/
5319 0, /*tp_setattro*/
5320 0, /*tp_as_buffer*/
5321 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5322 0, /*tp_doc*/
5323 0, /*tp_traverse*/
5324 0, /*tp_clear*/
5325 0, /*tp_richcompare*/
5326 0, /*tp_weaklistoffset*/
5327 0, /*tp_iter*/
5328 0, /*tp_iternext*/
5329 encoding_map_methods, /*tp_methods*/
5330 0, /*tp_members*/
5331 0, /*tp_getset*/
5332 0, /*tp_base*/
5333 0, /*tp_dict*/
5334 0, /*tp_descr_get*/
5335 0, /*tp_descr_set*/
5336 0, /*tp_dictoffset*/
5337 0, /*tp_init*/
5338 0, /*tp_alloc*/
5339 0, /*tp_new*/
5340 0, /*tp_free*/
5341 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005342};
5343
5344PyObject*
5345PyUnicode_BuildEncodingMap(PyObject* string)
5346{
5347 Py_UNICODE *decode;
5348 PyObject *result;
5349 struct encoding_map *mresult;
5350 int i;
5351 int need_dict = 0;
5352 unsigned char level1[32];
5353 unsigned char level2[512];
5354 unsigned char *mlevel1, *mlevel2, *mlevel3;
5355 int count2 = 0, count3 = 0;
5356
5357 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5358 PyErr_BadArgument();
5359 return NULL;
5360 }
5361 decode = PyUnicode_AS_UNICODE(string);
5362 memset(level1, 0xFF, sizeof level1);
5363 memset(level2, 0xFF, sizeof level2);
5364
5365 /* If there isn't a one-to-one mapping of NULL to \0,
5366 or if there are non-BMP characters, we need to use
5367 a mapping dictionary. */
5368 if (decode[0] != 0)
5369 need_dict = 1;
5370 for (i = 1; i < 256; i++) {
5371 int l1, l2;
5372 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005373#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005374 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005375#endif
5376 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005377 need_dict = 1;
5378 break;
5379 }
5380 if (decode[i] == 0xFFFE)
5381 /* unmapped character */
5382 continue;
5383 l1 = decode[i] >> 11;
5384 l2 = decode[i] >> 7;
5385 if (level1[l1] == 0xFF)
5386 level1[l1] = count2++;
5387 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005388 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005389 }
5390
5391 if (count2 >= 0xFF || count3 >= 0xFF)
5392 need_dict = 1;
5393
5394 if (need_dict) {
5395 PyObject *result = PyDict_New();
5396 PyObject *key, *value;
5397 if (!result)
5398 return NULL;
5399 for (i = 0; i < 256; i++) {
5400 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005401 key = PyLong_FromLong(decode[i]);
5402 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005403 if (!key || !value)
5404 goto failed1;
5405 if (PyDict_SetItem(result, key, value) == -1)
5406 goto failed1;
5407 Py_DECREF(key);
5408 Py_DECREF(value);
5409 }
5410 return result;
5411 failed1:
5412 Py_XDECREF(key);
5413 Py_XDECREF(value);
5414 Py_DECREF(result);
5415 return NULL;
5416 }
5417
5418 /* Create a three-level trie */
5419 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5420 16*count2 + 128*count3 - 1);
5421 if (!result)
5422 return PyErr_NoMemory();
5423 PyObject_Init(result, &EncodingMapType);
5424 mresult = (struct encoding_map*)result;
5425 mresult->count2 = count2;
5426 mresult->count3 = count3;
5427 mlevel1 = mresult->level1;
5428 mlevel2 = mresult->level23;
5429 mlevel3 = mresult->level23 + 16*count2;
5430 memcpy(mlevel1, level1, 32);
5431 memset(mlevel2, 0xFF, 16*count2);
5432 memset(mlevel3, 0, 128*count3);
5433 count3 = 0;
5434 for (i = 1; i < 256; i++) {
5435 int o1, o2, o3, i2, i3;
5436 if (decode[i] == 0xFFFE)
5437 /* unmapped character */
5438 continue;
5439 o1 = decode[i]>>11;
5440 o2 = (decode[i]>>7) & 0xF;
5441 i2 = 16*mlevel1[o1] + o2;
5442 if (mlevel2[i2] == 0xFF)
5443 mlevel2[i2] = count3++;
5444 o3 = decode[i] & 0x7F;
5445 i3 = 128*mlevel2[i2] + o3;
5446 mlevel3[i3] = i;
5447 }
5448 return result;
5449}
5450
5451static int
5452encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5453{
5454 struct encoding_map *map = (struct encoding_map*)mapping;
5455 int l1 = c>>11;
5456 int l2 = (c>>7) & 0xF;
5457 int l3 = c & 0x7F;
5458 int i;
5459
5460#ifdef Py_UNICODE_WIDE
5461 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005463 }
5464#endif
5465 if (c == 0)
5466 return 0;
5467 /* level 1*/
5468 i = map->level1[l1];
5469 if (i == 0xFF) {
5470 return -1;
5471 }
5472 /* level 2*/
5473 i = map->level23[16*i+l2];
5474 if (i == 0xFF) {
5475 return -1;
5476 }
5477 /* level 3 */
5478 i = map->level23[16*map->count2 + 128*i + l3];
5479 if (i == 0) {
5480 return -1;
5481 }
5482 return i;
5483}
5484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005485/* Lookup the character ch in the mapping. If the character
5486 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005487 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489{
Christian Heimes217cfd12007-12-02 14:31:20 +00005490 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005491 PyObject *x;
5492
5493 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005495 x = PyObject_GetItem(mapping, w);
5496 Py_DECREF(w);
5497 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5499 /* No mapping found means: mapping is undefined. */
5500 PyErr_Clear();
5501 x = Py_None;
5502 Py_INCREF(x);
5503 return x;
5504 } else
5505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005507 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005509 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 long value = PyLong_AS_LONG(x);
5511 if (value < 0 || value > 255) {
5512 PyErr_SetString(PyExc_TypeError,
5513 "character mapping must be in range(256)");
5514 Py_DECREF(x);
5515 return NULL;
5516 }
5517 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005519 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 /* wrong return value */
5523 PyErr_Format(PyExc_TypeError,
5524 "character mapping must return integer, bytes or None, not %.400s",
5525 x->ob_type->tp_name);
5526 Py_DECREF(x);
5527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 }
5529}
5530
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005531static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005532charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005533{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005534 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5535 /* exponentially overallocate to minimize reallocations */
5536 if (requiredsize < 2*outsize)
5537 requiredsize = 2*outsize;
5538 if (_PyBytes_Resize(outobj, requiredsize))
5539 return -1;
5540 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005541}
5542
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005546/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005547 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005548 space is available. Return a new reference to the object that
5549 was put in the output buffer, or Py_None, if the mapping was undefined
5550 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005551 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005552static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005553charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005556 PyObject *rep;
5557 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005558 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005559
Christian Heimes90aa7642007-12-19 02:45:37 +00005560 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005561 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005563 if (res == -1)
5564 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 if (outsize<requiredsize)
5566 if (charmapencode_resize(outobj, outpos, requiredsize))
5567 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005568 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 outstart[(*outpos)++] = (char)res;
5570 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005571 }
5572
5573 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005576 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005577 Py_DECREF(rep);
5578 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005579 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 if (PyLong_Check(rep)) {
5581 Py_ssize_t requiredsize = *outpos+1;
5582 if (outsize<requiredsize)
5583 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5584 Py_DECREF(rep);
5585 return enc_EXCEPTION;
5586 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005587 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 else {
5591 const char *repchars = PyBytes_AS_STRING(rep);
5592 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5593 Py_ssize_t requiredsize = *outpos+repsize;
5594 if (outsize<requiredsize)
5595 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5596 Py_DECREF(rep);
5597 return enc_EXCEPTION;
5598 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005599 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 memcpy(outstart + *outpos, repchars, repsize);
5601 *outpos += repsize;
5602 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005604 Py_DECREF(rep);
5605 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606}
5607
5608/* handle an error in PyUnicode_EncodeCharmap
5609 Return 0 on success, -1 on error */
5610static
5611int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005612 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005614 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005615 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616{
5617 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005618 Py_ssize_t repsize;
5619 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 Py_UNICODE *uni2;
5621 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005622 Py_ssize_t collstartpos = *inpos;
5623 Py_ssize_t collendpos = *inpos+1;
5624 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 char *encoding = "charmap";
5626 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005627 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 /* find all unencodable characters */
5630 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005631 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005632 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 int res = encoding_map_lookup(p[collendpos], mapping);
5634 if (res != -1)
5635 break;
5636 ++collendpos;
5637 continue;
5638 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005639
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 rep = charmapencode_lookup(p[collendpos], mapping);
5641 if (rep==NULL)
5642 return -1;
5643 else if (rep!=Py_None) {
5644 Py_DECREF(rep);
5645 break;
5646 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005647 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005649 }
5650 /* cache callback name lookup
5651 * (if not done yet, i.e. it's the first error) */
5652 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 if ((errors==NULL) || (!strcmp(errors, "strict")))
5654 *known_errorHandler = 1;
5655 else if (!strcmp(errors, "replace"))
5656 *known_errorHandler = 2;
5657 else if (!strcmp(errors, "ignore"))
5658 *known_errorHandler = 3;
5659 else if (!strcmp(errors, "xmlcharrefreplace"))
5660 *known_errorHandler = 4;
5661 else
5662 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 }
5664 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005665 case 1: /* strict */
5666 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5667 return -1;
5668 case 2: /* replace */
5669 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 x = charmapencode_output('?', mapping, res, respos);
5671 if (x==enc_EXCEPTION) {
5672 return -1;
5673 }
5674 else if (x==enc_FAILED) {
5675 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5676 return -1;
5677 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005678 }
5679 /* fall through */
5680 case 3: /* ignore */
5681 *inpos = collendpos;
5682 break;
5683 case 4: /* xmlcharrefreplace */
5684 /* generate replacement (temporarily (mis)uses p) */
5685 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 char buffer[2+29+1+1];
5687 char *cp;
5688 sprintf(buffer, "&#%d;", (int)p[collpos]);
5689 for (cp = buffer; *cp; ++cp) {
5690 x = charmapencode_output(*cp, mapping, res, respos);
5691 if (x==enc_EXCEPTION)
5692 return -1;
5693 else if (x==enc_FAILED) {
5694 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5695 return -1;
5696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005697 }
5698 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005699 *inpos = collendpos;
5700 break;
5701 default:
5702 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 encoding, reason, p, size, exceptionObject,
5704 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005705 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005707 if (PyBytes_Check(repunicode)) {
5708 /* Directly copy bytes result to output. */
5709 Py_ssize_t outsize = PyBytes_Size(*res);
5710 Py_ssize_t requiredsize;
5711 repsize = PyBytes_Size(repunicode);
5712 requiredsize = *respos + repsize;
5713 if (requiredsize > outsize)
5714 /* Make room for all additional bytes. */
5715 if (charmapencode_resize(res, respos, requiredsize)) {
5716 Py_DECREF(repunicode);
5717 return -1;
5718 }
5719 memcpy(PyBytes_AsString(*res) + *respos,
5720 PyBytes_AsString(repunicode), repsize);
5721 *respos += repsize;
5722 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005723 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005724 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005725 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005726 /* generate replacement */
5727 repsize = PyUnicode_GET_SIZE(repunicode);
5728 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 x = charmapencode_output(*uni2, mapping, res, respos);
5730 if (x==enc_EXCEPTION) {
5731 return -1;
5732 }
5733 else if (x==enc_FAILED) {
5734 Py_DECREF(repunicode);
5735 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5736 return -1;
5737 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005738 }
5739 *inpos = newpos;
5740 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 }
5742 return 0;
5743}
5744
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 Py_ssize_t size,
5747 PyObject *mapping,
5748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 /* output object */
5751 PyObject *res = NULL;
5752 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005753 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005755 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 PyObject *errorHandler = NULL;
5757 PyObject *exc = NULL;
5758 /* the following variable is used for caching string comparisons
5759 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5760 * 3=ignore, 4=xmlcharrefreplace */
5761 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
5763 /* Default to Latin-1 */
5764 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 /* allocate enough for a simple encoding without
5768 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005769 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 if (res == NULL)
5771 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005772 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 /* try to encode it */
5777 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5778 if (x==enc_EXCEPTION) /* error */
5779 goto onError;
5780 if (x==enc_FAILED) { /* unencodable character */
5781 if (charmap_encoding_error(p, size, &inpos, mapping,
5782 &exc,
5783 &known_errorHandler, &errorHandler, errors,
5784 &res, &respos)) {
5785 goto onError;
5786 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005787 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 else
5789 /* done with this character => adjust input position */
5790 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005794 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005795 if (_PyBytes_Resize(&res, respos) < 0)
5796 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005797
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 Py_XDECREF(exc);
5799 Py_XDECREF(errorHandler);
5800 return res;
5801
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 Py_XDECREF(res);
5804 Py_XDECREF(exc);
5805 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 return NULL;
5807}
5808
5809PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811{
5812 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 PyErr_BadArgument();
5814 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 }
5816 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 PyUnicode_GET_SIZE(unicode),
5818 mapping,
5819 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820}
5821
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005822/* create or adjust a UnicodeTranslateError */
5823static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 const Py_UNICODE *unicode, Py_ssize_t size,
5825 Py_ssize_t startpos, Py_ssize_t endpos,
5826 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005828 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005829 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 }
5832 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5834 goto onError;
5835 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5836 goto onError;
5837 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5838 goto onError;
5839 return;
5840 onError:
5841 Py_DECREF(*exceptionObject);
5842 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 }
5844}
5845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846/* raises a UnicodeTranslateError */
5847static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 const Py_UNICODE *unicode, Py_ssize_t size,
5849 Py_ssize_t startpos, Py_ssize_t endpos,
5850 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851{
5852 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005854 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856}
5857
5858/* error handling callback helper:
5859 build arguments, call the callback and check the arguments,
5860 put the result into newpos and return the replacement string, which
5861 has to be freed by the caller */
5862static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 PyObject **errorHandler,
5864 const char *reason,
5865 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5866 Py_ssize_t startpos, Py_ssize_t endpos,
5867 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005869 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005871 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872 PyObject *restuple;
5873 PyObject *resunicode;
5874
5875 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 }
5880
5881 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885
5886 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005891 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 Py_DECREF(restuple);
5893 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894 }
5895 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 &resunicode, &i_newpos)) {
5897 Py_DECREF(restuple);
5898 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005900 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005902 else
5903 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005904 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5906 Py_DECREF(restuple);
5907 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 Py_INCREF(resunicode);
5910 Py_DECREF(restuple);
5911 return resunicode;
5912}
5913
5914/* Lookup the character ch in the mapping and put the result in result,
5915 which must be decrefed by the caller.
5916 Return 0 on success, -1 on error */
5917static
5918int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5919{
Christian Heimes217cfd12007-12-02 14:31:20 +00005920 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005921 PyObject *x;
5922
5923 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 x = PyObject_GetItem(mapping, w);
5926 Py_DECREF(w);
5927 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5929 /* No mapping found means: use 1:1 mapping. */
5930 PyErr_Clear();
5931 *result = NULL;
5932 return 0;
5933 } else
5934 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 }
5936 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 *result = x;
5938 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005939 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005940 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 long value = PyLong_AS_LONG(x);
5942 long max = PyUnicode_GetMax();
5943 if (value < 0 || value > max) {
5944 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005945 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 Py_DECREF(x);
5947 return -1;
5948 }
5949 *result = x;
5950 return 0;
5951 }
5952 else if (PyUnicode_Check(x)) {
5953 *result = x;
5954 return 0;
5955 }
5956 else {
5957 /* wrong return value */
5958 PyErr_SetString(PyExc_TypeError,
5959 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005960 Py_DECREF(x);
5961 return -1;
5962 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963}
5964/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 if not reallocate and adjust various state variables.
5966 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967static
Walter Dörwald4894c302003-10-24 14:25:28 +00005968int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005971 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005972 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 /* remember old output position */
5974 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5975 /* exponentially overallocate to minimize reallocations */
5976 if (requiredsize < 2 * oldsize)
5977 requiredsize = 2 * oldsize;
5978 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5979 return -1;
5980 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005981 }
5982 return 0;
5983}
5984/* lookup the character, put the result in the output string and adjust
5985 various state variables. Return a new reference to the object that
5986 was put in the output buffer in *result, or Py_None, if the mapping was
5987 undefined (in which case no character was written).
5988 The called must decref result.
5989 Return 0 on success, -1 on error. */
5990static
Walter Dörwald4894c302003-10-24 14:25:28 +00005991int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5993 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994{
Walter Dörwald4894c302003-10-24 14:25:28 +00005995 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 /* not found => default to 1:1 mapping */
5999 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 }
6001 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006003 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 /* no overflow check, because we know that the space is enough */
6005 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006006 }
6007 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6009 if (repsize==1) {
6010 /* no overflow check, because we know that the space is enough */
6011 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6012 }
6013 else if (repsize!=0) {
6014 /* more than one character */
6015 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6016 (insize - (curinp-startinp)) +
6017 repsize - 1;
6018 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6019 return -1;
6020 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6021 *outp += repsize;
6022 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 }
6024 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 return 0;
6027}
6028
6029PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 Py_ssize_t size,
6031 PyObject *mapping,
6032 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006034 /* output object */
6035 PyObject *res = NULL;
6036 /* pointers to the beginning and end+1 of input */
6037 const Py_UNICODE *startp = p;
6038 const Py_UNICODE *endp = p + size;
6039 /* pointer into the output */
6040 Py_UNICODE *str;
6041 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006042 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 char *reason = "character maps to <undefined>";
6044 PyObject *errorHandler = NULL;
6045 PyObject *exc = NULL;
6046 /* the following variable is used for caching string comparisons
6047 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6048 * 3=ignore, 4=xmlcharrefreplace */
6049 int known_errorHandler = -1;
6050
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 PyErr_BadArgument();
6053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006055
6056 /* allocate enough for a simple 1:1 translation without
6057 replacements, if we need more, we'll resize */
6058 res = PyUnicode_FromUnicode(NULL, size);
6059 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006063 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 /* try to encode it */
6067 PyObject *x = NULL;
6068 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6069 Py_XDECREF(x);
6070 goto onError;
6071 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006072 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 if (x!=Py_None) /* it worked => adjust input pointer */
6074 ++p;
6075 else { /* untranslatable character */
6076 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6077 Py_ssize_t repsize;
6078 Py_ssize_t newpos;
6079 Py_UNICODE *uni2;
6080 /* startpos for collecting untranslatable chars */
6081 const Py_UNICODE *collstart = p;
6082 const Py_UNICODE *collend = p+1;
6083 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 /* find all untranslatable characters */
6086 while (collend < endp) {
6087 if (charmaptranslate_lookup(*collend, mapping, &x))
6088 goto onError;
6089 Py_XDECREF(x);
6090 if (x!=Py_None)
6091 break;
6092 ++collend;
6093 }
6094 /* cache callback name lookup
6095 * (if not done yet, i.e. it's the first error) */
6096 if (known_errorHandler==-1) {
6097 if ((errors==NULL) || (!strcmp(errors, "strict")))
6098 known_errorHandler = 1;
6099 else if (!strcmp(errors, "replace"))
6100 known_errorHandler = 2;
6101 else if (!strcmp(errors, "ignore"))
6102 known_errorHandler = 3;
6103 else if (!strcmp(errors, "xmlcharrefreplace"))
6104 known_errorHandler = 4;
6105 else
6106 known_errorHandler = 0;
6107 }
6108 switch (known_errorHandler) {
6109 case 1: /* strict */
6110 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006111 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 case 2: /* replace */
6113 /* No need to check for space, this is a 1:1 replacement */
6114 for (coll = collstart; coll<collend; ++coll)
6115 *str++ = '?';
6116 /* fall through */
6117 case 3: /* ignore */
6118 p = collend;
6119 break;
6120 case 4: /* xmlcharrefreplace */
6121 /* generate replacement (temporarily (mis)uses p) */
6122 for (p = collstart; p < collend; ++p) {
6123 char buffer[2+29+1+1];
6124 char *cp;
6125 sprintf(buffer, "&#%d;", (int)*p);
6126 if (charmaptranslate_makespace(&res, &str,
6127 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6128 goto onError;
6129 for (cp = buffer; *cp; ++cp)
6130 *str++ = *cp;
6131 }
6132 p = collend;
6133 break;
6134 default:
6135 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6136 reason, startp, size, &exc,
6137 collstart-startp, collend-startp, &newpos);
6138 if (repunicode == NULL)
6139 goto onError;
6140 /* generate replacement */
6141 repsize = PyUnicode_GET_SIZE(repunicode);
6142 if (charmaptranslate_makespace(&res, &str,
6143 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6144 Py_DECREF(repunicode);
6145 goto onError;
6146 }
6147 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6148 *str++ = *uni2;
6149 p = startp + newpos;
6150 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006151 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006152 }
6153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 /* Resize if we allocated to much */
6155 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006156 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 if (PyUnicode_Resize(&res, respos) < 0)
6158 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006159 }
6160 Py_XDECREF(exc);
6161 Py_XDECREF(errorHandler);
6162 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165 Py_XDECREF(res);
6166 Py_XDECREF(exc);
6167 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 return NULL;
6169}
6170
6171PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 PyObject *mapping,
6173 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174{
6175 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006176
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 str = PyUnicode_FromObject(str);
6178 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 PyUnicode_GET_SIZE(str),
6182 mapping,
6183 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 Py_DECREF(str);
6185 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006186
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 Py_XDECREF(str);
6189 return NULL;
6190}
Tim Petersced69f82003-09-16 20:30:58 +00006191
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006192PyObject *
6193PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6194 Py_ssize_t length)
6195{
6196 PyObject *result;
6197 Py_UNICODE *p; /* write pointer into result */
6198 Py_ssize_t i;
6199 /* Copy to a new string */
6200 result = (PyObject *)_PyUnicode_New(length);
6201 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6202 if (result == NULL)
6203 return result;
6204 p = PyUnicode_AS_UNICODE(result);
6205 /* Iterate over code points */
6206 for (i = 0; i < length; i++) {
6207 Py_UNICODE ch =s[i];
6208 if (ch > 127) {
6209 int decimal = Py_UNICODE_TODECIMAL(ch);
6210 if (decimal >= 0)
6211 p[i] = '0' + decimal;
6212 }
6213 }
6214 return result;
6215}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006216/* --- Decimal Encoder ---------------------------------------------------- */
6217
6218int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 Py_ssize_t length,
6220 char *output,
6221 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006222{
6223 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224 PyObject *errorHandler = NULL;
6225 PyObject *exc = NULL;
6226 const char *encoding = "decimal";
6227 const char *reason = "invalid decimal Unicode string";
6228 /* the following variable is used for caching string comparisons
6229 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6230 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006231
6232 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 PyErr_BadArgument();
6234 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006235 }
6236
6237 p = s;
6238 end = s + length;
6239 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 register Py_UNICODE ch = *p;
6241 int decimal;
6242 PyObject *repunicode;
6243 Py_ssize_t repsize;
6244 Py_ssize_t newpos;
6245 Py_UNICODE *uni2;
6246 Py_UNICODE *collstart;
6247 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006248
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006250 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 ++p;
6252 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006253 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 decimal = Py_UNICODE_TODECIMAL(ch);
6255 if (decimal >= 0) {
6256 *output++ = '0' + decimal;
6257 ++p;
6258 continue;
6259 }
6260 if (0 < ch && ch < 256) {
6261 *output++ = (char)ch;
6262 ++p;
6263 continue;
6264 }
6265 /* All other characters are considered unencodable */
6266 collstart = p;
6267 collend = p+1;
6268 while (collend < end) {
6269 if ((0 < *collend && *collend < 256) ||
6270 !Py_UNICODE_ISSPACE(*collend) ||
6271 Py_UNICODE_TODECIMAL(*collend))
6272 break;
6273 }
6274 /* cache callback name lookup
6275 * (if not done yet, i.e. it's the first error) */
6276 if (known_errorHandler==-1) {
6277 if ((errors==NULL) || (!strcmp(errors, "strict")))
6278 known_errorHandler = 1;
6279 else if (!strcmp(errors, "replace"))
6280 known_errorHandler = 2;
6281 else if (!strcmp(errors, "ignore"))
6282 known_errorHandler = 3;
6283 else if (!strcmp(errors, "xmlcharrefreplace"))
6284 known_errorHandler = 4;
6285 else
6286 known_errorHandler = 0;
6287 }
6288 switch (known_errorHandler) {
6289 case 1: /* strict */
6290 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6291 goto onError;
6292 case 2: /* replace */
6293 for (p = collstart; p < collend; ++p)
6294 *output++ = '?';
6295 /* fall through */
6296 case 3: /* ignore */
6297 p = collend;
6298 break;
6299 case 4: /* xmlcharrefreplace */
6300 /* generate replacement (temporarily (mis)uses p) */
6301 for (p = collstart; p < collend; ++p)
6302 output += sprintf(output, "&#%d;", (int)*p);
6303 p = collend;
6304 break;
6305 default:
6306 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6307 encoding, reason, s, length, &exc,
6308 collstart-s, collend-s, &newpos);
6309 if (repunicode == NULL)
6310 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006311 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006312 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006313 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6314 Py_DECREF(repunicode);
6315 goto onError;
6316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 /* generate replacement */
6318 repsize = PyUnicode_GET_SIZE(repunicode);
6319 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6320 Py_UNICODE ch = *uni2;
6321 if (Py_UNICODE_ISSPACE(ch))
6322 *output++ = ' ';
6323 else {
6324 decimal = Py_UNICODE_TODECIMAL(ch);
6325 if (decimal >= 0)
6326 *output++ = '0' + decimal;
6327 else if (0 < ch && ch < 256)
6328 *output++ = (char)ch;
6329 else {
6330 Py_DECREF(repunicode);
6331 raise_encode_exception(&exc, encoding,
6332 s, length, collstart-s, collend-s, reason);
6333 goto onError;
6334 }
6335 }
6336 }
6337 p = s + newpos;
6338 Py_DECREF(repunicode);
6339 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006340 }
6341 /* 0-terminate the output string */
6342 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006343 Py_XDECREF(exc);
6344 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006345 return 0;
6346
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348 Py_XDECREF(exc);
6349 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006350 return -1;
6351}
6352
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353/* --- Helpers ------------------------------------------------------------ */
6354
Eric Smith8c663262007-08-25 02:26:07 +00006355#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006356#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006357
Thomas Wouters477c8d52006-05-27 19:21:47 +00006358#include "stringlib/count.h"
6359#include "stringlib/find.h"
6360#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006361#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006362
Eric Smith5807c412008-05-11 21:00:57 +00006363#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006364#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006365#include "stringlib/localeutil.h"
6366
/* Helper macro to fix up start/end slice values in place.

   end:   values above len are cut down to len; negative values have len
          added and are then raised to 0 if still negative.
   start: negative values have len added and are then raised to 0 if
          still negative (start is not cut down to len).

   start and end must be modifiable lvalues; every argument may be
   evaluated more than once.  The body is wrapped in do { } while (0) so
   that the macro behaves as a single statement (e.g. as the body of an
   if/else) and must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006381
Martin v. Löwis18e16552006-02-15 17:27:45 +00006382Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006383 PyObject *substr,
6384 Py_ssize_t start,
6385 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006387 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006388 PyUnicodeObject* str_obj;
6389 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006390
Thomas Wouters477c8d52006-05-27 19:21:47 +00006391 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6392 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006394 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6395 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 Py_DECREF(str_obj);
6397 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 }
Tim Petersced69f82003-09-16 20:30:58 +00006399
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006400 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006401 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006402 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6403 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006404 );
6405
6406 Py_DECREF(sub_obj);
6407 Py_DECREF(str_obj);
6408
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 return result;
6410}
6411
Martin v. Löwis18e16552006-02-15 17:27:45 +00006412Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006413 PyObject *sub,
6414 Py_ssize_t start,
6415 Py_ssize_t end,
6416 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006418 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006419
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006421 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006423 sub = PyUnicode_FromObject(sub);
6424 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 Py_DECREF(str);
6426 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 }
Tim Petersced69f82003-09-16 20:30:58 +00006428
Thomas Wouters477c8d52006-05-27 19:21:47 +00006429 if (direction > 0)
6430 result = stringlib_find_slice(
6431 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6432 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6433 start, end
6434 );
6435 else
6436 result = stringlib_rfind_slice(
6437 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6438 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6439 start, end
6440 );
6441
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006443 Py_DECREF(sub);
6444
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 return result;
6446}
6447
Tim Petersced69f82003-09-16 20:30:58 +00006448static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 PyUnicodeObject *substring,
6451 Py_ssize_t start,
6452 Py_ssize_t end,
6453 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 if (substring->length == 0)
6456 return 1;
6457
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006458 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 end -= substring->length;
6460 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462
6463 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 if (Py_UNICODE_MATCH(self, end, substring))
6465 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 } else {
6467 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 }
6470
6471 return 0;
6472}
6473
Martin v. Löwis18e16552006-02-15 17:27:45 +00006474Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 PyObject *substr,
6476 Py_ssize_t start,
6477 Py_ssize_t end,
6478 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006480 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006481
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 str = PyUnicode_FromObject(str);
6483 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 substr = PyUnicode_FromObject(substr);
6486 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 Py_DECREF(str);
6488 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 }
Tim Petersced69f82003-09-16 20:30:58 +00006490
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 (PyUnicodeObject *)substr,
6493 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 Py_DECREF(str);
6495 Py_DECREF(substr);
6496 return result;
6497}
6498
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499/* Apply fixfct filter to the Unicode object self and return a
6500 reference to the modified object */
6501
Tim Petersced69f82003-09-16 20:30:58 +00006502static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505{
6506
6507 PyUnicodeObject *u;
6508
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006509 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006512
6513 Py_UNICODE_COPY(u->str, self->str, self->length);
6514
Tim Peters7a29bd52001-09-12 03:03:31 +00006515 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 /* fixfct should return TRUE if it modified the buffer. If
6517 FALSE, return a reference to the original buffer instead
6518 (to save space, not time) */
6519 Py_INCREF(self);
6520 Py_DECREF(u);
6521 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 }
6523 return (PyObject*) u;
6524}
6525
Tim Petersced69f82003-09-16 20:30:58 +00006526static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527int fixupper(PyUnicodeObject *self)
6528{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006529 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 Py_UNICODE *s = self->str;
6531 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006535
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 ch = Py_UNICODE_TOUPPER(*s);
6537 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 *s = ch;
6540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 s++;
6542 }
6543
6544 return status;
6545}
6546
Tim Petersced69f82003-09-16 20:30:58 +00006547static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548int fixlower(PyUnicodeObject *self)
6549{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006550 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 Py_UNICODE *s = self->str;
6552 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006556
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 ch = Py_UNICODE_TOLOWER(*s);
6558 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 *s = ch;
6561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 s++;
6563 }
6564
6565 return status;
6566}
6567
Tim Petersced69f82003-09-16 20:30:58 +00006568static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569int fixswapcase(PyUnicodeObject *self)
6570{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006571 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 Py_UNICODE *s = self->str;
6573 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006574
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 while (len-- > 0) {
6576 if (Py_UNICODE_ISUPPER(*s)) {
6577 *s = Py_UNICODE_TOLOWER(*s);
6578 status = 1;
6579 } else if (Py_UNICODE_ISLOWER(*s)) {
6580 *s = Py_UNICODE_TOUPPER(*s);
6581 status = 1;
6582 }
6583 s++;
6584 }
6585
6586 return status;
6587}
6588
Tim Petersced69f82003-09-16 20:30:58 +00006589static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590int fixcapitalize(PyUnicodeObject *self)
6591{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006592 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006593 Py_UNICODE *s = self->str;
6594 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006595
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006596 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006598 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 *s = Py_UNICODE_TOUPPER(*s);
6600 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006602 s++;
6603 while (--len > 0) {
6604 if (Py_UNICODE_ISUPPER(*s)) {
6605 *s = Py_UNICODE_TOLOWER(*s);
6606 status = 1;
6607 }
6608 s++;
6609 }
6610 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611}
6612
6613static
6614int fixtitle(PyUnicodeObject *self)
6615{
6616 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6617 register Py_UNICODE *e;
6618 int previous_is_cased;
6619
6620 /* Shortcut for single character strings */
6621 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6623 if (*p != ch) {
6624 *p = ch;
6625 return 1;
6626 }
6627 else
6628 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 }
Tim Petersced69f82003-09-16 20:30:58 +00006630
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 e = p + PyUnicode_GET_SIZE(self);
6632 previous_is_cased = 0;
6633 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006635
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 if (previous_is_cased)
6637 *p = Py_UNICODE_TOLOWER(ch);
6638 else
6639 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006640
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 if (Py_UNICODE_ISLOWER(ch) ||
6642 Py_UNICODE_ISUPPER(ch) ||
6643 Py_UNICODE_ISTITLE(ch))
6644 previous_is_cased = 1;
6645 else
6646 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 }
6648 return 1;
6649}
6650
Tim Peters8ce9f162004-08-27 01:49:32 +00006651PyObject *
6652PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653{
Skip Montanaro6543b452004-09-16 03:28:13 +00006654 const Py_UNICODE blank = ' ';
6655 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006656 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006657 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006658 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6659 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006660 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6661 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006662 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006663 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664
Tim Peters05eba1f2004-08-27 21:32:02 +00006665 fseq = PySequence_Fast(seq, "");
6666 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006667 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006668 }
6669
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006670 /* NOTE: the following code can't call back into Python code,
6671 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006672 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006673
Tim Peters05eba1f2004-08-27 21:32:02 +00006674 seqlen = PySequence_Fast_GET_SIZE(fseq);
6675 /* If empty sequence, return u"". */
6676 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006677 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6678 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006679 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006680 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006681 /* If singleton sequence with an exact Unicode, return that. */
6682 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 item = items[0];
6684 if (PyUnicode_CheckExact(item)) {
6685 Py_INCREF(item);
6686 res = (PyUnicodeObject *)item;
6687 goto Done;
6688 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006689 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006690 else {
6691 /* Set up sep and seplen */
6692 if (separator == NULL) {
6693 sep = &blank;
6694 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006695 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006696 else {
6697 if (!PyUnicode_Check(separator)) {
6698 PyErr_Format(PyExc_TypeError,
6699 "separator: expected str instance,"
6700 " %.80s found",
6701 Py_TYPE(separator)->tp_name);
6702 goto onError;
6703 }
6704 sep = PyUnicode_AS_UNICODE(separator);
6705 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006706 }
6707 }
6708
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006709 /* There are at least two things to join, or else we have a subclass
6710 * of str in the sequence.
6711 * Do a pre-pass to figure out the total amount of space we'll
6712 * need (sz), and see whether all argument are strings.
6713 */
6714 sz = 0;
6715 for (i = 0; i < seqlen; i++) {
6716 const Py_ssize_t old_sz = sz;
6717 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 if (!PyUnicode_Check(item)) {
6719 PyErr_Format(PyExc_TypeError,
6720 "sequence item %zd: expected str instance,"
6721 " %.80s found",
6722 i, Py_TYPE(item)->tp_name);
6723 goto onError;
6724 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006725 sz += PyUnicode_GET_SIZE(item);
6726 if (i != 0)
6727 sz += seplen;
6728 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6729 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006731 goto onError;
6732 }
6733 }
Tim Petersced69f82003-09-16 20:30:58 +00006734
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006735 res = _PyUnicode_New(sz);
6736 if (res == NULL)
6737 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006738
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006739 /* Catenate everything. */
6740 res_p = PyUnicode_AS_UNICODE(res);
6741 for (i = 0; i < seqlen; ++i) {
6742 Py_ssize_t itemlen;
6743 item = items[i];
6744 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 /* Copy item, and maybe the separator. */
6746 if (i) {
6747 Py_UNICODE_COPY(res_p, sep, seplen);
6748 res_p += seplen;
6749 }
6750 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6751 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006752 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006753
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006755 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 return (PyObject *)res;
6757
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006759 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006760 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 return NULL;
6762}
6763
Tim Petersced69f82003-09-16 20:30:58 +00006764static
6765PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 Py_ssize_t left,
6767 Py_ssize_t right,
6768 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769{
6770 PyUnicodeObject *u;
6771
6772 if (left < 0)
6773 left = 0;
6774 if (right < 0)
6775 right = 0;
6776
Tim Peters7a29bd52001-09-12 03:03:31 +00006777 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 Py_INCREF(self);
6779 return self;
6780 }
6781
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006782 if (left > PY_SSIZE_T_MAX - self->length ||
6783 right > PY_SSIZE_T_MAX - (left + self->length)) {
6784 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6785 return NULL;
6786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 u = _PyUnicode_New(left + self->length + right);
6788 if (u) {
6789 if (left)
6790 Py_UNICODE_FILL(u->str, fill, left);
6791 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6792 if (right)
6793 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6794 }
6795
6796 return u;
6797}
6798
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006799PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
6803 string = PyUnicode_FromObject(string);
6804 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006807 list = stringlib_splitlines(
6808 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6809 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810
6811 Py_DECREF(string);
6812 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813}
6814
Tim Petersced69f82003-09-16 20:30:58 +00006815static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 PyUnicodeObject *substring,
6818 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006821 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006824 return stringlib_split_whitespace(
6825 (PyObject*) self, self->str, self->length, maxcount
6826 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006828 return stringlib_split(
6829 (PyObject*) self, self->str, self->length,
6830 substring->str, substring->length,
6831 maxcount
6832 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833}
6834
Tim Petersced69f82003-09-16 20:30:58 +00006835static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006836PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 PyUnicodeObject *substring,
6838 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006839{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006840 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006841 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006842
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006843 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006844 return stringlib_rsplit_whitespace(
6845 (PyObject*) self, self->str, self->length, maxcount
6846 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006847
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006848 return stringlib_rsplit(
6849 (PyObject*) self, self->str, self->length,
6850 substring->str, substring->length,
6851 maxcount
6852 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006853}
6854
6855static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 PyUnicodeObject *str1,
6858 PyUnicodeObject *str2,
6859 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860{
6861 PyUnicodeObject *u;
6862
6863 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006865 else if (maxcount == 0 || self->length == 0)
6866 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867
Thomas Wouters477c8d52006-05-27 19:21:47 +00006868 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006869 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006870 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006871 if (str1->length == 0)
6872 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006873 if (str1->length == 1) {
6874 /* replace characters */
6875 Py_UNICODE u1, u2;
6876 if (!findchar(self->str, self->length, str1->str[0]))
6877 goto nothing;
6878 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6879 if (!u)
6880 return NULL;
6881 Py_UNICODE_COPY(u->str, self->str, self->length);
6882 u1 = str1->str[0];
6883 u2 = str2->str[0];
6884 for (i = 0; i < u->length; i++)
6885 if (u->str[i] == u1) {
6886 if (--maxcount < 0)
6887 break;
6888 u->str[i] = u2;
6889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006891 i = stringlib_find(
6892 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006894 if (i < 0)
6895 goto nothing;
6896 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6897 if (!u)
6898 return NULL;
6899 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006900
6901 /* change everything in-place, starting with this one */
6902 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6903 i += str1->length;
6904
6905 while ( --maxcount > 0) {
6906 i = stringlib_find(self->str+i, self->length-i,
6907 str1->str, str1->length,
6908 i);
6909 if (i == -1)
6910 break;
6911 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6912 i += str1->length;
6913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006916
6917 Py_ssize_t n, i, j, e;
6918 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 Py_UNICODE *p;
6920
6921 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006922 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6923 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006924 if (n == 0)
6925 goto nothing;
6926 /* new_size = self->length + n * (str2->length - str1->length)); */
6927 delta = (str2->length - str1->length);
6928 if (delta == 0) {
6929 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006931 product = n * (str2->length - str1->length);
6932 if ((product / (str2->length - str1->length)) != n) {
6933 PyErr_SetString(PyExc_OverflowError,
6934 "replace string is too long");
6935 return NULL;
6936 }
6937 new_size = self->length + product;
6938 if (new_size < 0) {
6939 PyErr_SetString(PyExc_OverflowError,
6940 "replace string is too long");
6941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 }
6943 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006944 u = _PyUnicode_New(new_size);
6945 if (!u)
6946 return NULL;
6947 i = 0;
6948 p = u->str;
6949 e = self->length - str1->length;
6950 if (str1->length > 0) {
6951 while (n-- > 0) {
6952 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006953 j = stringlib_find(self->str+i, self->length-i,
6954 str1->str, str1->length,
6955 i);
6956 if (j == -1)
6957 break;
6958 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006959 /* copy unchanged part [i:j] */
6960 Py_UNICODE_COPY(p, self->str+i, j-i);
6961 p += j - i;
6962 }
6963 /* copy substitution string */
6964 if (str2->length > 0) {
6965 Py_UNICODE_COPY(p, str2->str, str2->length);
6966 p += str2->length;
6967 }
6968 i = j + str1->length;
6969 }
6970 if (i < self->length)
6971 /* copy tail [i:] */
6972 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6973 } else {
6974 /* interleave */
6975 while (n > 0) {
6976 Py_UNICODE_COPY(p, str2->str, str2->length);
6977 p += str2->length;
6978 if (--n <= 0)
6979 break;
6980 *p++ = self->str[i++];
6981 }
6982 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006986
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006988 /* nothing to replace; return original string (when possible) */
6989 if (PyUnicode_CheckExact(self)) {
6990 Py_INCREF(self);
6991 return (PyObject *) self;
6992 }
6993 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994}
6995
6996/* --- Unicode Object Methods --------------------------------------------- */
6997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006998PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000\n\
7001Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007002characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003
7004static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007005unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 return fixup(self, fixtitle);
7008}
7009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007010PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012\n\
7013Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007014have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015
7016static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007017unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 return fixup(self, fixcapitalize);
7020}
7021
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split `self` on whitespace, capitalize every word in place in
   the resulting list, then join the words back together through
   PyUnicode_Join() with a NULL separator.  Returns NULL if any step
   fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *plain = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)plain, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item, so do it here. */
        Py_DECREF(plain);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return joined;
}
#endif
7059
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007060/* Argument converter. Coerces to a single unicode character */
7061
7062static int
7063convert_uc(PyObject *obj, void *addr)
7064{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007065 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7066 PyObject *uniobj;
7067 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007068
Benjamin Peterson14339b62009-01-31 16:36:08 +00007069 uniobj = PyUnicode_FromObject(obj);
7070 if (uniobj == NULL) {
7071 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007073 return 0;
7074 }
7075 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7076 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007078 Py_DECREF(uniobj);
7079 return 0;
7080 }
7081 unistr = PyUnicode_AS_UNICODE(uniobj);
7082 *fillcharloc = unistr[0];
7083 Py_DECREF(uniobj);
7084 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007085}
7086
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007087PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007090Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007091done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092
7093static PyObject *
7094unicode_center(PyUnicodeObject *self, PyObject *args)
7095{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007096 Py_ssize_t marg, left;
7097 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007098 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099
Thomas Woutersde017742006-02-16 19:34:37 +00007100 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101 return NULL;
7102
Tim Peters7a29bd52001-09-12 03:03:31 +00007103 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 Py_INCREF(self);
7105 return (PyObject*) self;
7106 }
7107
7108 marg = width - self->length;
7109 left = marg / 2 + (marg & width & 1);
7110
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007111 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112}
7113
Marc-André Lemburge5034372000-08-08 08:04:29 +00007114#if 0
7115
7116/* This code should go into some future Unicode collation support
7117 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007118 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007119
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007120/* speedy UTF-16 code point order comparison */
7121/* gleaned from: */
7122/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7123
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007124static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007125{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007126 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007127 0, 0, 0, 0, 0, 0, 0, 0,
7128 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007129 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007130};
7131
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132static int
7133unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7134{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007135 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007136
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 Py_UNICODE *s1 = str1->str;
7138 Py_UNICODE *s2 = str2->str;
7139
7140 len1 = str1->length;
7141 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007142
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007144 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007145
7146 c1 = *s1++;
7147 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007148
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 if (c1 > (1<<11) * 26)
7150 c1 += utf16Fixup[c1>>11];
7151 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007152 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007153 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007154
7155 if (c1 != c2)
7156 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007157
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007158 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 }
7160
7161 return (len1 < len2) ? -1 : (len1 != len2);
7162}
7163
Marc-André Lemburge5034372000-08-08 08:04:29 +00007164#else
7165
7166static int
7167unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7168{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007169 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007170
7171 Py_UNICODE *s1 = str1->str;
7172 Py_UNICODE *s2 = str2->str;
7173
7174 len1 = str1->length;
7175 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007176
Marc-André Lemburge5034372000-08-08 08:04:29 +00007177 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007178 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007179
Fredrik Lundh45714e92001-06-26 16:39:36 +00007180 c1 = *s1++;
7181 c2 = *s2++;
7182
7183 if (c1 != c2)
7184 return (c1 < c2) ? -1 : 1;
7185
Marc-André Lemburge5034372000-08-08 08:04:29 +00007186 len1--; len2--;
7187 }
7188
7189 return (len1 < len2) ? -1 : (len1 != len2);
7190}
7191
7192#endif
7193
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007197 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7198 return unicode_compare((PyUnicodeObject *)left,
7199 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007200 PyErr_Format(PyExc_TypeError,
7201 "Can't compare %.100s and %.100s",
7202 left->ob_type->tp_name,
7203 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 return -1;
7205}
7206
Martin v. Löwis5b222132007-06-10 09:51:05 +00007207int
7208PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7209{
7210 int i;
7211 Py_UNICODE *id;
7212 assert(PyUnicode_Check(uni));
7213 id = PyUnicode_AS_UNICODE(uni);
7214 /* Compare Unicode string and source character set string */
7215 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 if (id[i] != str[i])
7217 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007218 /* This check keeps Python strings that end in '\0' from comparing equal
7219 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007220 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007222 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007224 return 0;
7225}
7226
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007227
Benjamin Peterson29060642009-01-31 22:14:21 +00007228#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007229 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007230
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007231PyObject *PyUnicode_RichCompare(PyObject *left,
7232 PyObject *right,
7233 int op)
7234{
7235 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007236
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007237 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7238 PyObject *v;
7239 if (((PyUnicodeObject *) left)->length !=
7240 ((PyUnicodeObject *) right)->length) {
7241 if (op == Py_EQ) {
7242 Py_INCREF(Py_False);
7243 return Py_False;
7244 }
7245 if (op == Py_NE) {
7246 Py_INCREF(Py_True);
7247 return Py_True;
7248 }
7249 }
7250 if (left == right)
7251 result = 0;
7252 else
7253 result = unicode_compare((PyUnicodeObject *)left,
7254 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007255
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007256 /* Convert the return value to a Boolean */
7257 switch (op) {
7258 case Py_EQ:
7259 v = TEST_COND(result == 0);
7260 break;
7261 case Py_NE:
7262 v = TEST_COND(result != 0);
7263 break;
7264 case Py_LE:
7265 v = TEST_COND(result <= 0);
7266 break;
7267 case Py_GE:
7268 v = TEST_COND(result >= 0);
7269 break;
7270 case Py_LT:
7271 v = TEST_COND(result == -1);
7272 break;
7273 case Py_GT:
7274 v = TEST_COND(result == 1);
7275 break;
7276 default:
7277 PyErr_BadArgument();
7278 return NULL;
7279 }
7280 Py_INCREF(v);
7281 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007283
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007284 Py_INCREF(Py_NotImplemented);
7285 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007286}
7287
Guido van Rossum403d68b2000-03-13 15:55:09 +00007288int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007289 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007290{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007291 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007292 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007293
7294 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007295 sub = PyUnicode_FromObject(element);
7296 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 PyErr_Format(PyExc_TypeError,
7298 "'in <string>' requires string as left operand, not %s",
7299 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007300 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007301 }
7302
Thomas Wouters477c8d52006-05-27 19:21:47 +00007303 str = PyUnicode_FromObject(container);
7304 if (!str) {
7305 Py_DECREF(sub);
7306 return -1;
7307 }
7308
7309 result = stringlib_contains_obj(str, sub);
7310
7311 Py_DECREF(str);
7312 Py_DECREF(sub);
7313
Guido van Rossum403d68b2000-03-13 15:55:09 +00007314 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007315}
7316
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317/* Concat to string or Unicode object giving a new Unicode object. */
7318
7319PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321{
7322 PyUnicodeObject *u = NULL, *v = NULL, *w;
7323
7324 /* Coerce the two arguments */
7325 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7326 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7329 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331
7332 /* Shortcuts */
7333 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 Py_DECREF(v);
7335 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336 }
7337 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 Py_DECREF(u);
7339 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 }
7341
7342 /* Concat the two Unicode strings */
7343 w = _PyUnicode_New(u->length + v->length);
7344 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 Py_UNICODE_COPY(w->str, u->str, u->length);
7347 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7348
7349 Py_DECREF(u);
7350 Py_DECREF(v);
7351 return (PyObject *)w;
7352
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354 Py_XDECREF(u);
7355 Py_XDECREF(v);
7356 return NULL;
7357}
7358
Walter Dörwald1ab83302007-05-18 17:15:44 +00007359void
7360PyUnicode_Append(PyObject **pleft, PyObject *right)
7361{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007362 PyObject *new;
7363 if (*pleft == NULL)
7364 return;
7365 if (right == NULL || !PyUnicode_Check(*pleft)) {
7366 Py_DECREF(*pleft);
7367 *pleft = NULL;
7368 return;
7369 }
7370 new = PyUnicode_Concat(*pleft, right);
7371 Py_DECREF(*pleft);
7372 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007373}
7374
7375void
7376PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7377{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007378 PyUnicode_Append(pleft, right);
7379 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007380}
7381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007382PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007385Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007386string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007387interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388
7389static PyObject *
7390unicode_count(PyUnicodeObject *self, PyObject *args)
7391{
7392 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007393 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007394 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395 PyObject *result;
7396
Guido van Rossumb8872e62000-05-09 14:14:27 +00007397 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399 return NULL;
7400
7401 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007402 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007405
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007406 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007407 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007408 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007409 substring->str, substring->length,
7410 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007411 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
7413 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007414
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 return result;
7416}
7417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007418PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007419 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007421Encode S using the codec registered for encoding. Default encoding\n\
7422is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007423handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007424a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7425'xmlcharrefreplace' as well as any other name registered with\n\
7426codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427
7428static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007429unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007431 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 char *encoding = NULL;
7433 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007434
Benjamin Peterson308d6372009-09-18 21:42:35 +00007435 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7436 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007438 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007439}
7440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007441PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443\n\
7444Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007445If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446
7447static PyObject*
7448unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7449{
7450 Py_UNICODE *e;
7451 Py_UNICODE *p;
7452 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007453 Py_UNICODE *qe;
7454 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 PyUnicodeObject *u;
7456 int tabsize = 8;
7457
7458 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460
Thomas Wouters7e474022000-07-16 12:04:32 +00007461 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007462 i = 0; /* chars up to and including most recent \n or \r */
7463 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7464 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 for (p = self->str; p < e; p++)
7466 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 if (tabsize > 0) {
7468 incr = tabsize - (j % tabsize); /* cannot overflow */
7469 if (j > PY_SSIZE_T_MAX - incr)
7470 goto overflow1;
7471 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007472 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 if (j > PY_SSIZE_T_MAX - 1)
7476 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477 j++;
7478 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 if (i > PY_SSIZE_T_MAX - j)
7480 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007482 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 }
7484 }
7485
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007486 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007488
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 /* Second pass: create output string and fill it */
7490 u = _PyUnicode_New(i + j);
7491 if (!u)
7492 return NULL;
7493
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007494 j = 0; /* same as in first pass */
7495 q = u->str; /* next output char */
7496 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
7498 for (p = self->str; p < e; p++)
7499 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 if (tabsize > 0) {
7501 i = tabsize - (j % tabsize);
7502 j += i;
7503 while (i--) {
7504 if (q >= qe)
7505 goto overflow2;
7506 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007507 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007509 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 else {
7511 if (q >= qe)
7512 goto overflow2;
7513 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007514 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 if (*p == '\n' || *p == '\r')
7516 j = 0;
7517 }
7518
7519 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007520
7521 overflow2:
7522 Py_DECREF(u);
7523 overflow1:
7524 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526}
7527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007528PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530\n\
7531Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007532such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533arguments start and end are interpreted as in slice notation.\n\
7534\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007535Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536
7537static PyObject *
7538unicode_find(PyUnicodeObject *self, PyObject *args)
7539{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007540 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007541 Py_ssize_t start;
7542 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007543 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544
Christian Heimes9cd17752007-11-18 19:35:23 +00007545 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547
Thomas Wouters477c8d52006-05-27 19:21:47 +00007548 result = stringlib_find_slice(
7549 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7550 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7551 start, end
7552 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553
7554 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007555
Christian Heimes217cfd12007-12-02 14:31:20 +00007556 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557}
7558
7559static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007560unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561{
7562 if (index < 0 || index >= self->length) {
7563 PyErr_SetString(PyExc_IndexError, "string index out of range");
7564 return NULL;
7565 }
7566
7567 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7568}
7569
Guido van Rossumc2504932007-09-18 19:42:40 +00007570/* Believe it or not, this produces the same value for ASCII strings
7571 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007572static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007573unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574{
Guido van Rossumc2504932007-09-18 19:42:40 +00007575 Py_ssize_t len;
7576 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007577 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007578
7579 if (self->hash != -1)
7580 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007581 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007582 p = self->str;
7583 x = *p << 7;
7584 while (--len >= 0)
7585 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007586 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007587 if (x == -1)
7588 x = -2;
7589 self->hash = x;
7590 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591}
7592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007593PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007596Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597
7598static PyObject *
7599unicode_index(PyUnicodeObject *self, PyObject *args)
7600{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007601 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007602 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007603 Py_ssize_t start;
7604 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605
Christian Heimes9cd17752007-11-18 19:35:23 +00007606 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608
Thomas Wouters477c8d52006-05-27 19:21:47 +00007609 result = stringlib_find_slice(
7610 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7611 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7612 start, end
7613 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614
7615 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007616
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617 if (result < 0) {
7618 PyErr_SetString(PyExc_ValueError, "substring not found");
7619 return NULL;
7620 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007621
Christian Heimes217cfd12007-12-02 14:31:20 +00007622 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623}
7624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007625PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007628Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007629at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630
7631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007632unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633{
7634 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7635 register const Py_UNICODE *e;
7636 int cased;
7637
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 /* Shortcut for single character strings */
7639 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007642 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007643 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007645
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 e = p + PyUnicode_GET_SIZE(self);
7647 cased = 0;
7648 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007650
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7652 return PyBool_FromLong(0);
7653 else if (!cased && Py_UNICODE_ISLOWER(ch))
7654 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007656 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657}
7658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007659PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007662Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007663at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664
7665static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007666unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667{
7668 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7669 register const Py_UNICODE *e;
7670 int cased;
7671
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672 /* Shortcut for single character strings */
7673 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007676 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007677 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007679
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680 e = p + PyUnicode_GET_SIZE(self);
7681 cased = 0;
7682 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007684
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7686 return PyBool_FromLong(0);
7687 else if (!cased && Py_UNICODE_ISUPPER(ch))
7688 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007690 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691}
7692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007693PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007696Return True if S is a titlecased string and there is at least one\n\
7697character in S, i.e. upper- and titlecase characters may only\n\
7698follow uncased characters and lowercase characters only cased ones.\n\
7699Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700
7701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007702unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703{
7704 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7705 register const Py_UNICODE *e;
7706 int cased, previous_is_cased;
7707
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 /* Shortcut for single character strings */
7709 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7711 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007713 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007714 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007716
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 e = p + PyUnicode_GET_SIZE(self);
7718 cased = 0;
7719 previous_is_cased = 0;
7720 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007722
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7724 if (previous_is_cased)
7725 return PyBool_FromLong(0);
7726 previous_is_cased = 1;
7727 cased = 1;
7728 }
7729 else if (Py_UNICODE_ISLOWER(ch)) {
7730 if (!previous_is_cased)
7731 return PyBool_FromLong(0);
7732 previous_is_cased = 1;
7733 cased = 1;
7734 }
7735 else
7736 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007738 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739}
7740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007744Return True if all characters in S are whitespace\n\
7745and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746
7747static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007748unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749{
7750 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7751 register const Py_UNICODE *e;
7752
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 /* Shortcut for single character strings */
7754 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 Py_UNICODE_ISSPACE(*p))
7756 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007758 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007759 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007761
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762 e = p + PyUnicode_GET_SIZE(self);
7763 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 if (!Py_UNICODE_ISSPACE(*p))
7765 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007767 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768}
7769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007770PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007772\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007773Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007774and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007775
7776static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007777unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007778{
7779 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7780 register const Py_UNICODE *e;
7781
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007782 /* Shortcut for single character strings */
7783 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 Py_UNICODE_ISALPHA(*p))
7785 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007786
7787 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007788 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007790
7791 e = p + PyUnicode_GET_SIZE(self);
7792 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 if (!Py_UNICODE_ISALPHA(*p))
7794 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007796 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007797}
7798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007799PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007801\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007802Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007803and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007804
7805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007806unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007807{
7808 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7809 register const Py_UNICODE *e;
7810
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007811 /* Shortcut for single character strings */
7812 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 Py_UNICODE_ISALNUM(*p))
7814 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007815
7816 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007817 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007819
7820 e = p + PyUnicode_GET_SIZE(self);
7821 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 if (!Py_UNICODE_ISALNUM(*p))
7823 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007825 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007826}
7827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007828PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007831Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007832False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
7834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007835unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836{
7837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7838 register const Py_UNICODE *e;
7839
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 /* Shortcut for single character strings */
7841 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 Py_UNICODE_ISDECIMAL(*p))
7843 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007846 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007848
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 e = p + PyUnicode_GET_SIZE(self);
7850 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 if (!Py_UNICODE_ISDECIMAL(*p))
7852 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007854 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855}
7856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007857PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007860Return True if all characters in S are digits\n\
7861and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862
7863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007864unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865{
7866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7867 register const Py_UNICODE *e;
7868
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 /* Shortcut for single character strings */
7870 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 Py_UNICODE_ISDIGIT(*p))
7872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007874 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007875 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007877
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 e = p + PyUnicode_GET_SIZE(self);
7879 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 if (!Py_UNICODE_ISDIGIT(*p))
7881 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884}
7885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007886PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007889Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007890False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891
7892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007893unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894{
7895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7896 register const Py_UNICODE *e;
7897
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 /* Shortcut for single character strings */
7899 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 Py_UNICODE_ISNUMERIC(*p))
7901 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007903 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007904 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007906
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907 e = p + PyUnicode_GET_SIZE(self);
7908 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 if (!Py_UNICODE_ISNUMERIC(*p))
7910 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007912 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913}
7914
Martin v. Löwis47383402007-08-15 07:32:56 +00007915int
7916PyUnicode_IsIdentifier(PyObject *self)
7917{
7918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7919 register const Py_UNICODE *e;
7920
7921 /* Special case for empty strings */
7922 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007924
7925 /* PEP 3131 says that the first character must be in
7926 XID_Start and subsequent characters in XID_Continue,
7927 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007928 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007929 letters, digits, underscore). However, given the current
7930 definition of XID_Start and XID_Continue, it is sufficient
7931 to check just for these, except that _ must be allowed
7932 as starting an identifier. */
7933 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7934 return 0;
7935
7936 e = p + PyUnicode_GET_SIZE(self);
7937 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 if (!_PyUnicode_IsXidContinue(*p))
7939 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007940 }
7941 return 1;
7942}
7943
7944PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007946\n\
7947Return True if S is a valid identifier according\n\
7948to the language definition.");
7949
7950static PyObject*
7951unicode_isidentifier(PyObject *self)
7952{
7953 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7954}
7955
Georg Brandl559e5d72008-06-11 18:37:52 +00007956PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007958\n\
7959Return True if all characters in S are considered\n\
7960printable in repr() or S is empty, False otherwise.");
7961
7962static PyObject*
7963unicode_isprintable(PyObject *self)
7964{
7965 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7966 register const Py_UNICODE *e;
7967
7968 /* Shortcut for single character strings */
7969 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7970 Py_RETURN_TRUE;
7971 }
7972
7973 e = p + PyUnicode_GET_SIZE(self);
7974 for (; p < e; p++) {
7975 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7976 Py_RETURN_FALSE;
7977 }
7978 }
7979 Py_RETURN_TRUE;
7980}
7981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007982PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007983 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984\n\
7985Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007986iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987
7988static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007989unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007991 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992}
7993
Martin v. Löwis18e16552006-02-15 17:27:45 +00007994static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995unicode_length(PyUnicodeObject *self)
7996{
7997 return self->length;
7998}
7999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008000PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008003Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008004done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005
8006static PyObject *
8007unicode_ljust(PyUnicodeObject *self, PyObject *args)
8008{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008009 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008010 Py_UNICODE fillchar = ' ';
8011
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008012 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 return NULL;
8014
Tim Peters7a29bd52001-09-12 03:03:31 +00008015 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 Py_INCREF(self);
8017 return (PyObject*) self;
8018 }
8019
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008020 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021}
8022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008023PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008026Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027
8028static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008029unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 return fixup(self, fixlower);
8032}
8033
/* Selectors naming which end(s) of a string a strip operation trims. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: its format string with the three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
8042
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008043/* externally visible for str.strip(unicode) */
8044PyObject *
8045_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8046{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008047 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8048 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8049 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8050 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8051 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008052
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008054
Benjamin Peterson14339b62009-01-31 16:36:08 +00008055 i = 0;
8056 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8058 i++;
8059 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008061
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 j = len;
8063 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 do {
8065 j--;
8066 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8067 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008069
Benjamin Peterson14339b62009-01-31 16:36:08 +00008070 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 Py_INCREF(self);
8072 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008073 }
8074 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008076}
8077
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078
8079static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008080do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008082 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8083 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008084
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 i = 0;
8086 if (striptype != RIGHTSTRIP) {
8087 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8088 i++;
8089 }
8090 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008091
Benjamin Peterson14339b62009-01-31 16:36:08 +00008092 j = len;
8093 if (striptype != LEFTSTRIP) {
8094 do {
8095 j--;
8096 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8097 j++;
8098 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008099
Benjamin Peterson14339b62009-01-31 16:36:08 +00008100 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8101 Py_INCREF(self);
8102 return (PyObject*)self;
8103 }
8104 else
8105 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106}
8107
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008108
8109static PyObject *
8110do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8111{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008112 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008113
Benjamin Peterson14339b62009-01-31 16:36:08 +00008114 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8115 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008116
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 if (sep != NULL && sep != Py_None) {
8118 if (PyUnicode_Check(sep))
8119 return _PyUnicode_XStrip(self, striptype, sep);
8120 else {
8121 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 "%s arg must be None or str",
8123 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008124 return NULL;
8125 }
8126 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008127
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008129}
8130
8131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008132PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008134\n\
8135Return a copy of the string S with leading and trailing\n\
8136whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008137If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008138
8139static PyObject *
8140unicode_strip(PyUnicodeObject *self, PyObject *args)
8141{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 if (PyTuple_GET_SIZE(args) == 0)
8143 return do_strip(self, BOTHSTRIP); /* Common case */
8144 else
8145 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008146}
8147
8148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008149PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008151\n\
8152Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008153If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008154
8155static PyObject *
8156unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8157{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 if (PyTuple_GET_SIZE(args) == 0)
8159 return do_strip(self, LEFTSTRIP); /* Common case */
8160 else
8161 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008162}
8163
8164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008165PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008167\n\
8168Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008169If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008170
8171static PyObject *
8172unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8173{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008174 if (PyTuple_GET_SIZE(args) == 0)
8175 return do_strip(self, RIGHTSTRIP); /* Common case */
8176 else
8177 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008178}
8179
8180
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008182unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183{
8184 PyUnicodeObject *u;
8185 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008186 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008187 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188
Georg Brandl222de0f2009-04-12 12:01:50 +00008189 if (len < 1) {
8190 Py_INCREF(unicode_empty);
8191 return (PyObject *)unicode_empty;
8192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193
Tim Peters7a29bd52001-09-12 03:03:31 +00008194 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 /* no repeat, return original string */
8196 Py_INCREF(str);
8197 return (PyObject*) str;
8198 }
Tim Peters8f422462000-09-09 06:13:41 +00008199
8200 /* ensure # of chars needed doesn't overflow int and # of bytes
8201 * needed doesn't overflow size_t
8202 */
8203 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008204 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008205 PyErr_SetString(PyExc_OverflowError,
8206 "repeated string is too long");
8207 return NULL;
8208 }
8209 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8210 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8211 PyErr_SetString(PyExc_OverflowError,
8212 "repeated string is too long");
8213 return NULL;
8214 }
8215 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 if (!u)
8217 return NULL;
8218
8219 p = u->str;
8220
Georg Brandl222de0f2009-04-12 12:01:50 +00008221 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008222 Py_UNICODE_FILL(p, str->str[0], len);
8223 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008224 Py_ssize_t done = str->length; /* number of characters copied this far */
8225 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008227 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008228 Py_UNICODE_COPY(p+done, p, n);
8229 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 }
8232
8233 return (PyObject*) u;
8234}
8235
8236PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 PyObject *subobj,
8238 PyObject *replobj,
8239 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240{
8241 PyObject *self;
8242 PyObject *str1;
8243 PyObject *str2;
8244 PyObject *result;
8245
8246 self = PyUnicode_FromObject(obj);
8247 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 str1 = PyUnicode_FromObject(subobj);
8250 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 Py_DECREF(self);
8252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253 }
8254 str2 = PyUnicode_FromObject(replobj);
8255 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 Py_DECREF(self);
8257 Py_DECREF(str1);
8258 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259 }
Tim Petersced69f82003-09-16 20:30:58 +00008260 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 (PyUnicodeObject *)str1,
8262 (PyUnicodeObject *)str2,
8263 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 Py_DECREF(self);
8265 Py_DECREF(str1);
8266 Py_DECREF(str2);
8267 return result;
8268}
8269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008270PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008271 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272\n\
8273Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008274old replaced by new. If the optional argument count is\n\
8275given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276
8277static PyObject*
8278unicode_replace(PyUnicodeObject *self, PyObject *args)
8279{
8280 PyUnicodeObject *str1;
8281 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008282 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 PyObject *result;
8284
Martin v. Löwis18e16552006-02-15 17:27:45 +00008285 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 return NULL;
8287 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8288 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008291 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 Py_DECREF(str1);
8293 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295
8296 result = replace(self, str1, str2, maxcount);
8297
8298 Py_DECREF(str1);
8299 Py_DECREF(str2);
8300 return result;
8301}
8302
8303static
8304PyObject *unicode_repr(PyObject *unicode)
8305{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008306 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008307 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008308 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8309 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8310
8311 /* XXX(nnorwitz): rather than over-allocating, it would be
8312 better to choose a different scheme. Perhaps scan the
8313 first N-chars of the string and allocate based on that size.
8314 */
8315 /* Initial allocation is based on the longest-possible unichr
8316 escape.
8317
8318 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8319 unichr, so in this case it's the longest unichr escape. In
8320 narrow (UTF-16) builds this is five chars per source unichr
8321 since there are two unichrs in the surrogate pair, so in narrow
8322 (UTF-16) builds it's not the longest unichr escape.
8323
8324 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8325 so in the narrow (UTF-16) build case it's the longest unichr
8326 escape.
8327 */
8328
Walter Dörwald1ab83302007-05-18 17:15:44 +00008329 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008331#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008333#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008335#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008337 if (repr == NULL)
8338 return NULL;
8339
Walter Dörwald1ab83302007-05-18 17:15:44 +00008340 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008341
8342 /* Add quote */
8343 *p++ = (findchar(s, size, '\'') &&
8344 !findchar(s, size, '"')) ? '"' : '\'';
8345 while (size-- > 0) {
8346 Py_UNICODE ch = *s++;
8347
8348 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008349 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008350 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008351 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008352 continue;
8353 }
8354
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008356 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008357 *p++ = '\\';
8358 *p++ = 't';
8359 }
8360 else if (ch == '\n') {
8361 *p++ = '\\';
8362 *p++ = 'n';
8363 }
8364 else if (ch == '\r') {
8365 *p++ = '\\';
8366 *p++ = 'r';
8367 }
8368
8369 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008370 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008371 *p++ = '\\';
8372 *p++ = 'x';
8373 *p++ = hexdigits[(ch >> 4) & 0x000F];
8374 *p++ = hexdigits[ch & 0x000F];
8375 }
8376
Georg Brandl559e5d72008-06-11 18:37:52 +00008377 /* Copy ASCII characters as-is */
8378 else if (ch < 0x7F) {
8379 *p++ = ch;
8380 }
8381
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008383 else {
8384 Py_UCS4 ucs = ch;
8385
8386#ifndef Py_UNICODE_WIDE
8387 Py_UNICODE ch2 = 0;
8388 /* Get code point from surrogate pair */
8389 if (size > 0) {
8390 ch2 = *s;
8391 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008393 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008395 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008396 size--;
8397 }
8398 }
8399#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008400 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008401 (categories Z* and C* except ASCII space)
8402 */
8403 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8404 /* Map 8-bit characters to '\xhh' */
8405 if (ucs <= 0xff) {
8406 *p++ = '\\';
8407 *p++ = 'x';
8408 *p++ = hexdigits[(ch >> 4) & 0x000F];
8409 *p++ = hexdigits[ch & 0x000F];
8410 }
8411 /* Map 21-bit characters to '\U00xxxxxx' */
8412 else if (ucs >= 0x10000) {
8413 *p++ = '\\';
8414 *p++ = 'U';
8415 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8416 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8417 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8418 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8419 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8420 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8421 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8422 *p++ = hexdigits[ucs & 0x0000000F];
8423 }
8424 /* Map 16-bit characters to '\uxxxx' */
8425 else {
8426 *p++ = '\\';
8427 *p++ = 'u';
8428 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8429 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8430 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8431 *p++ = hexdigits[ucs & 0x000F];
8432 }
8433 }
8434 /* Copy characters as-is */
8435 else {
8436 *p++ = ch;
8437#ifndef Py_UNICODE_WIDE
8438 if (ucs >= 0x10000)
8439 *p++ = ch2;
8440#endif
8441 }
8442 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008443 }
8444 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008445 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008446
8447 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008448 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008449 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450}
8451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008452PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454\n\
8455Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008456such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457arguments start and end are interpreted as in slice notation.\n\
8458\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008459Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460
8461static PyObject *
8462unicode_rfind(PyUnicodeObject *self, PyObject *args)
8463{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008464 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008465 Py_ssize_t start;
8466 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008467 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468
Christian Heimes9cd17752007-11-18 19:35:23 +00008469 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008470 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471
Thomas Wouters477c8d52006-05-27 19:21:47 +00008472 result = stringlib_rfind_slice(
8473 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8474 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8475 start, end
8476 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477
8478 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008479
Christian Heimes217cfd12007-12-02 14:31:20 +00008480 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481}
8482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008483PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008486Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487
8488static PyObject *
8489unicode_rindex(PyUnicodeObject *self, PyObject *args)
8490{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008491 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008492 Py_ssize_t start;
8493 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008494 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495
Christian Heimes9cd17752007-11-18 19:35:23 +00008496 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498
Thomas Wouters477c8d52006-05-27 19:21:47 +00008499 result = stringlib_rfind_slice(
8500 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8501 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8502 start, end
8503 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504
8505 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008506
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 if (result < 0) {
8508 PyErr_SetString(PyExc_ValueError, "substring not found");
8509 return NULL;
8510 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008511 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512}
8513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008514PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008517Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008518done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519
8520static PyObject *
8521unicode_rjust(PyUnicodeObject *self, PyObject *args)
8522{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008523 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008524 Py_UNICODE fillchar = ' ';
8525
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008526 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527 return NULL;
8528
Tim Peters7a29bd52001-09-12 03:03:31 +00008529 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530 Py_INCREF(self);
8531 return (PyObject*) self;
8532 }
8533
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008534 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535}
8536
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 PyObject *sep,
8539 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540{
8541 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008542
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 s = PyUnicode_FromObject(s);
8544 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008545 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 if (sep != NULL) {
8547 sep = PyUnicode_FromObject(sep);
8548 if (sep == NULL) {
8549 Py_DECREF(s);
8550 return NULL;
8551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 }
8553
8554 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8555
8556 Py_DECREF(s);
8557 Py_XDECREF(sep);
8558 return result;
8559}
8560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008561PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563\n\
8564Return a list of the words in S, using sep as the\n\
8565delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008566splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008567whitespace string is a separator and empty strings are\n\
8568removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569
8570static PyObject*
8571unicode_split(PyUnicodeObject *self, PyObject *args)
8572{
8573 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008574 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575
Martin v. Löwis18e16552006-02-15 17:27:45 +00008576 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 return NULL;
8578
8579 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585}
8586
Thomas Wouters477c8d52006-05-27 19:21:47 +00008587PyObject *
8588PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8589{
8590 PyObject* str_obj;
8591 PyObject* sep_obj;
8592 PyObject* out;
8593
8594 str_obj = PyUnicode_FromObject(str_in);
8595 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008597 sep_obj = PyUnicode_FromObject(sep_in);
8598 if (!sep_obj) {
8599 Py_DECREF(str_obj);
8600 return NULL;
8601 }
8602
8603 out = stringlib_partition(
8604 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8605 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8606 );
8607
8608 Py_DECREF(sep_obj);
8609 Py_DECREF(str_obj);
8610
8611 return out;
8612}
8613
8614
8615PyObject *
8616PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8617{
8618 PyObject* str_obj;
8619 PyObject* sep_obj;
8620 PyObject* out;
8621
8622 str_obj = PyUnicode_FromObject(str_in);
8623 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008625 sep_obj = PyUnicode_FromObject(sep_in);
8626 if (!sep_obj) {
8627 Py_DECREF(str_obj);
8628 return NULL;
8629 }
8630
8631 out = stringlib_rpartition(
8632 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8633 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8634 );
8635
8636 Py_DECREF(sep_obj);
8637 Py_DECREF(str_obj);
8638
8639 return out;
8640}
8641
8642PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008644\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008645Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008646the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008647found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008648
8649static PyObject*
8650unicode_partition(PyUnicodeObject *self, PyObject *separator)
8651{
8652 return PyUnicode_Partition((PyObject *)self, separator);
8653}
8654
8655PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008656 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008657\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008658Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008659the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008660separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008661
8662static PyObject*
8663unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8664{
8665 return PyUnicode_RPartition((PyObject *)self, separator);
8666}
8667
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008668PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 PyObject *sep,
8670 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008671{
8672 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008673
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008674 s = PyUnicode_FromObject(s);
8675 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008676 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 if (sep != NULL) {
8678 sep = PyUnicode_FromObject(sep);
8679 if (sep == NULL) {
8680 Py_DECREF(s);
8681 return NULL;
8682 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008683 }
8684
8685 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8686
8687 Py_DECREF(s);
8688 Py_XDECREF(sep);
8689 return result;
8690}
8691
8692PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008694\n\
8695Return a list of the words in S, using sep as the\n\
8696delimiter string, starting at the end of the string and\n\
8697working to the front. If maxsplit is given, at most maxsplit\n\
8698splits are done. If sep is not specified, any whitespace string\n\
8699is a separator.");
8700
8701static PyObject*
8702unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8703{
8704 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008705 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008706
Martin v. Löwis18e16552006-02-15 17:27:45 +00008707 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008708 return NULL;
8709
8710 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008712 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008714 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008716}
8717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008718PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720\n\
8721Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008722Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008723is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724
8725static PyObject*
8726unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8727{
Guido van Rossum86662912000-04-11 15:38:46 +00008728 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729
Guido van Rossum86662912000-04-11 15:38:46 +00008730 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 return NULL;
8732
Guido van Rossum86662912000-04-11 15:38:46 +00008733 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734}
8735
8736static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008737PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738{
Walter Dörwald346737f2007-05-31 10:44:43 +00008739 if (PyUnicode_CheckExact(self)) {
8740 Py_INCREF(self);
8741 return self;
8742 } else
8743 /* Subtype -- return genuine unicode string with the same value. */
8744 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8745 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746}
8747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008748PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750\n\
8751Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008752and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753
8754static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008755unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 return fixup(self, fixswapcase);
8758}
8759
Georg Brandlceee0772007-11-27 23:48:05 +00008760PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008762\n\
8763Return a translation table usable for str.translate().\n\
8764If there is only one argument, it must be a dictionary mapping Unicode\n\
8765ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008766Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008767If there are two arguments, they must be strings of equal length, and\n\
8768in the resulting dictionary, each character in x will be mapped to the\n\
8769character at the same position in y. If there is a third argument, it\n\
8770must be a string, whose characters will be mapped to None in the result.");
8771
8772static PyObject*
8773unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8774{
8775 PyObject *x, *y = NULL, *z = NULL;
8776 PyObject *new = NULL, *key, *value;
8777 Py_ssize_t i = 0;
8778 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008779
Georg Brandlceee0772007-11-27 23:48:05 +00008780 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8781 return NULL;
8782 new = PyDict_New();
8783 if (!new)
8784 return NULL;
8785 if (y != NULL) {
8786 /* x must be a string too, of equal length */
8787 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8788 if (!PyUnicode_Check(x)) {
8789 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8790 "be a string if there is a second argument");
8791 goto err;
8792 }
8793 if (PyUnicode_GET_SIZE(x) != ylen) {
8794 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8795 "arguments must have equal length");
8796 goto err;
8797 }
8798 /* create entries for translating chars in x to those in y */
8799 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008800 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8801 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008802 if (!key || !value)
8803 goto err;
8804 res = PyDict_SetItem(new, key, value);
8805 Py_DECREF(key);
8806 Py_DECREF(value);
8807 if (res < 0)
8808 goto err;
8809 }
8810 /* create entries for deleting chars in z */
8811 if (z != NULL) {
8812 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008813 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008814 if (!key)
8815 goto err;
8816 res = PyDict_SetItem(new, key, Py_None);
8817 Py_DECREF(key);
8818 if (res < 0)
8819 goto err;
8820 }
8821 }
8822 } else {
8823 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008824 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008825 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8826 "to maketrans it must be a dict");
8827 goto err;
8828 }
8829 /* copy entries into the new dict, converting string keys to int keys */
8830 while (PyDict_Next(x, &i, &key, &value)) {
8831 if (PyUnicode_Check(key)) {
8832 /* convert string keys to integer keys */
8833 PyObject *newkey;
8834 if (PyUnicode_GET_SIZE(key) != 1) {
8835 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8836 "table must be of length 1");
8837 goto err;
8838 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008839 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008840 if (!newkey)
8841 goto err;
8842 res = PyDict_SetItem(new, newkey, value);
8843 Py_DECREF(newkey);
8844 if (res < 0)
8845 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008846 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008847 /* just keep integer keys */
8848 if (PyDict_SetItem(new, key, value) < 0)
8849 goto err;
8850 } else {
8851 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8852 "be strings or integers");
8853 goto err;
8854 }
8855 }
8856 }
8857 return new;
8858 err:
8859 Py_DECREF(new);
8860 return NULL;
8861}
8862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008863PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865\n\
8866Return a copy of the string S, where all characters have been mapped\n\
8867through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008868Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008869Unmapped characters are left untouched. Characters mapped to None\n\
8870are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871
8872static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008873unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874{
Georg Brandlceee0772007-11-27 23:48:05 +00008875 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876}
8877
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008878PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008881Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882
8883static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008884unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 return fixup(self, fixupper);
8887}
8888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008889PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008892Pad a numeric string S with zeros on the left, to fill a field\n\
8893of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894
8895static PyObject *
8896unicode_zfill(PyUnicodeObject *self, PyObject *args)
8897{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008898 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899 PyUnicodeObject *u;
8900
Martin v. Löwis18e16552006-02-15 17:27:45 +00008901 Py_ssize_t width;
8902 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 return NULL;
8904
8905 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008906 if (PyUnicode_CheckExact(self)) {
8907 Py_INCREF(self);
8908 return (PyObject*) self;
8909 }
8910 else
8911 return PyUnicode_FromUnicode(
8912 PyUnicode_AS_UNICODE(self),
8913 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915 }
8916
8917 fill = width - self->length;
8918
8919 u = pad(self, fill, 0, '0');
8920
Walter Dörwald068325e2002-04-15 13:36:47 +00008921 if (u == NULL)
8922 return NULL;
8923
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 if (u->str[fill] == '+' || u->str[fill] == '-') {
8925 /* move sign to beginning of string */
8926 u->str[0] = u->str[fill];
8927 u->str[fill] = '0';
8928 }
8929
8930 return (PyObject*) u;
8931}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932
#if 0
/* Debugging helpers; the surrounding "#if 0" keeps them out of the build. */

/* Return the file-level counter numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Pass the string's buffer and length to
   PyUnicode_TransformDecimalToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
                                             PyUnicode_GET_SIZE(self));
}
#endif
8947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008948PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008951Return True if S starts with the specified prefix, False otherwise.\n\
8952With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008953With optional end, stop comparing S at that position.\n\
8954prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955
8956static PyObject *
8957unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008958 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008960 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008962 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008963 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008964 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008966 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8968 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008969 if (PyTuple_Check(subobj)) {
8970 Py_ssize_t i;
8971 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8972 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008974 if (substring == NULL)
8975 return NULL;
8976 result = tailmatch(self, substring, start, end, -1);
8977 Py_DECREF(substring);
8978 if (result) {
8979 Py_RETURN_TRUE;
8980 }
8981 }
8982 /* nothing matched */
8983 Py_RETURN_FALSE;
8984 }
8985 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008988 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008990 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991}
8992
8993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008994PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008997Return True if S ends with the specified suffix, False otherwise.\n\
8998With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008999With optional end, stop comparing S at that position.\n\
9000suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001
9002static PyObject *
9003unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009006 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009008 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009009 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009010 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009012 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9014 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009015 if (PyTuple_Check(subobj)) {
9016 Py_ssize_t i;
9017 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9018 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009020 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009022 result = tailmatch(self, substring, start, end, +1);
9023 Py_DECREF(substring);
9024 if (result) {
9025 Py_RETURN_TRUE;
9026 }
9027 }
9028 Py_RETURN_FALSE;
9029 }
9030 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009034 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009036 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037}
9038
Eric Smith8c663262007-08-25 02:26:07 +00009039#include "stringlib/string_format.h"
9040
9041PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009043\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009044Return a formatted version of S, using substitutions from args and kwargs.\n\
9045The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009046
Eric Smith27bbca62010-11-04 17:06:58 +00009047PyDoc_STRVAR(format_map__doc__,
9048 "S.format_map(mapping) -> str\n\
9049\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009050Return a formatted version of S, using substitutions from mapping.\n\
9051The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009052
Eric Smith4a7d76d2008-05-30 18:10:19 +00009053static PyObject *
9054unicode__format__(PyObject* self, PyObject* args)
9055{
9056 PyObject *format_spec;
9057
9058 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9059 return NULL;
9060
9061 return _PyUnicode_FormatAdvanced(self,
9062 PyUnicode_AS_UNICODE(format_spec),
9063 PyUnicode_GET_SIZE(format_spec));
9064}
9065
Eric Smith8c663262007-08-25 02:26:07 +00009066PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009067 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009068\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009069Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009070
9071static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009072unicode__sizeof__(PyUnicodeObject *v)
9073{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009074 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9075 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009076}
9077
9078PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009079 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009080
9081static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009082unicode_getnewargs(PyUnicodeObject *v)
9083{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009084 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009085}
9086
/* Method table for the str type.  Each entry gives the Python-visible name,
   the C implementation, the calling-convention flags and (where present)
   the docstring.  The table is terminated by a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry registered as a static method. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
9151
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009152static PyObject *
9153unicode_mod(PyObject *v, PyObject *w)
9154{
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 if (!PyUnicode_Check(v)) {
9156 Py_INCREF(Py_NotImplemented);
9157 return Py_NotImplemented;
9158 }
9159 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009160}
9161
/* Number protocol table for str: only the remainder slot is filled in
   (with unicode_mod); the slots listed before it are explicitly 0 and the
   ones after it are left to default initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
9168
/* Sequence protocol table for str: length, concatenation, repetition,
   item access and containment are provided; the slice and assignment
   slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
9179
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009180static PyObject*
9181unicode_subscript(PyUnicodeObject* self, PyObject* item)
9182{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009183 if (PyIndex_Check(item)) {
9184 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009185 if (i == -1 && PyErr_Occurred())
9186 return NULL;
9187 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009188 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009189 return unicode_getitem(self, i);
9190 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009191 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009192 Py_UNICODE* source_buf;
9193 Py_UNICODE* result_buf;
9194 PyObject* result;
9195
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009196 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009197 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009198 return NULL;
9199 }
9200
9201 if (slicelength <= 0) {
9202 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009203 } else if (start == 0 && step == 1 && slicelength == self->length &&
9204 PyUnicode_CheckExact(self)) {
9205 Py_INCREF(self);
9206 return (PyObject *)self;
9207 } else if (step == 1) {
9208 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009209 } else {
9210 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009211 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9212 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009213
Benjamin Peterson29060642009-01-31 22:14:21 +00009214 if (result_buf == NULL)
9215 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009216
9217 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9218 result_buf[i] = source_buf[cur];
9219 }
Tim Petersced69f82003-09-16 20:30:58 +00009220
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009221 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009222 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009223 return result;
9224 }
9225 } else {
9226 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9227 return NULL;
9228 }
9229}
9230
/* Mapping protocol table for str: length and subscripting are provided;
   subscript assignment is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
9236
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238/* Helpers for PyUnicode_Format() */
9239
9240static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009241getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009243 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 (*p_argidx)++;
9246 if (arglen < 0)
9247 return args;
9248 else
9249 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250 }
9251 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009252 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253 return NULL;
9254}
9255
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009256/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009258static PyObject *
9259formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009261 char *p;
9262 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009264
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265 x = PyFloat_AsDouble(v);
9266 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009267 return NULL;
9268
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009270 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009271
Eric Smith0923d1d2009-04-16 20:16:10 +00009272 p = PyOS_double_to_string(x, type, prec,
9273 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009274 if (p == NULL)
9275 return NULL;
9276 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009277 PyMem_Free(p);
9278 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279}
9280
Tim Peters38fd5b62000-09-21 05:43:11 +00009281static PyObject*
9282formatlong(PyObject *val, int flags, int prec, int type)
9283{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009284 char *buf;
9285 int len;
9286 PyObject *str; /* temporary string object. */
9287 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009288
Benjamin Peterson14339b62009-01-31 16:36:08 +00009289 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9290 if (!str)
9291 return NULL;
9292 result = PyUnicode_FromStringAndSize(buf, len);
9293 Py_DECREF(str);
9294 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009295}
9296
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297static int
9298formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009299 size_t buflen,
9300 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009302 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009303 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 if (PyUnicode_GET_SIZE(v) == 1) {
9305 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9306 buf[1] = '\0';
9307 return 1;
9308 }
9309#ifndef Py_UNICODE_WIDE
9310 if (PyUnicode_GET_SIZE(v) == 2) {
9311 /* Decode a valid surrogate pair */
9312 int c0 = PyUnicode_AS_UNICODE(v)[0];
9313 int c1 = PyUnicode_AS_UNICODE(v)[1];
9314 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9315 0xDC00 <= c1 && c1 <= 0xDFFF) {
9316 buf[0] = c0;
9317 buf[1] = c1;
9318 buf[2] = '\0';
9319 return 2;
9320 }
9321 }
9322#endif
9323 goto onError;
9324 }
9325 else {
9326 /* Integer input truncated to a character */
9327 long x;
9328 x = PyLong_AsLong(v);
9329 if (x == -1 && PyErr_Occurred())
9330 goto onError;
9331
9332 if (x < 0 || x > 0x10ffff) {
9333 PyErr_SetString(PyExc_OverflowError,
9334 "%c arg not in range(0x110000)");
9335 return -1;
9336 }
9337
9338#ifndef Py_UNICODE_WIDE
9339 if (x > 0xffff) {
9340 x -= 0x10000;
9341 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9342 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9343 return 2;
9344 }
9345#endif
9346 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009347 buf[1] = '\0';
9348 return 1;
9349 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009350
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009352 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009353 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009354 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355}
9356
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009363 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364{
9365 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009366 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367 int args_owned = 0;
9368 PyUnicodeObject *result = NULL;
9369 PyObject *dict = NULL;
9370 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009371
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 PyErr_BadInternalCall();
9374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 }
9376 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009377 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379 fmt = PyUnicode_AS_UNICODE(uformat);
9380 fmtcnt = PyUnicode_GET_SIZE(uformat);
9381
9382 reslen = rescnt = fmtcnt + 100;
9383 result = _PyUnicode_New(reslen);
9384 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 res = PyUnicode_AS_UNICODE(result);
9387
9388 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 arglen = PyTuple_Size(args);
9390 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391 }
9392 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 arglen = -1;
9394 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009396 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009397 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399
9400 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009401 if (*fmt != '%') {
9402 if (--rescnt < 0) {
9403 rescnt = fmtcnt + 100;
9404 reslen += rescnt;
9405 if (_PyUnicode_Resize(&result, reslen) < 0)
9406 goto onError;
9407 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9408 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009409 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009411 }
9412 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009413 /* Got a format specifier */
9414 int flags = 0;
9415 Py_ssize_t width = -1;
9416 int prec = -1;
9417 Py_UNICODE c = '\0';
9418 Py_UNICODE fill;
9419 int isnumok;
9420 PyObject *v = NULL;
9421 PyObject *temp = NULL;
9422 Py_UNICODE *pbuf;
9423 Py_UNICODE sign;
9424 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009425 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009426
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 fmt++;
9428 if (*fmt == '(') {
9429 Py_UNICODE *keystart;
9430 Py_ssize_t keylen;
9431 PyObject *key;
9432 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009433
Benjamin Peterson29060642009-01-31 22:14:21 +00009434 if (dict == NULL) {
9435 PyErr_SetString(PyExc_TypeError,
9436 "format requires a mapping");
9437 goto onError;
9438 }
9439 ++fmt;
9440 --fmtcnt;
9441 keystart = fmt;
9442 /* Skip over balanced parentheses */
9443 while (pcount > 0 && --fmtcnt >= 0) {
9444 if (*fmt == ')')
9445 --pcount;
9446 else if (*fmt == '(')
9447 ++pcount;
9448 fmt++;
9449 }
9450 keylen = fmt - keystart - 1;
9451 if (fmtcnt < 0 || pcount > 0) {
9452 PyErr_SetString(PyExc_ValueError,
9453 "incomplete format key");
9454 goto onError;
9455 }
9456#if 0
9457 /* keys are converted to strings using UTF-8 and
9458 then looked up since Python uses strings to hold
9459 variables names etc. in its namespaces and we
9460 wouldn't want to break common idioms. */
9461 key = PyUnicode_EncodeUTF8(keystart,
9462 keylen,
9463 NULL);
9464#else
9465 key = PyUnicode_FromUnicode(keystart, keylen);
9466#endif
9467 if (key == NULL)
9468 goto onError;
9469 if (args_owned) {
9470 Py_DECREF(args);
9471 args_owned = 0;
9472 }
9473 args = PyObject_GetItem(dict, key);
9474 Py_DECREF(key);
9475 if (args == NULL) {
9476 goto onError;
9477 }
9478 args_owned = 1;
9479 arglen = -1;
9480 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009481 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 while (--fmtcnt >= 0) {
9483 switch (c = *fmt++) {
9484 case '-': flags |= F_LJUST; continue;
9485 case '+': flags |= F_SIGN; continue;
9486 case ' ': flags |= F_BLANK; continue;
9487 case '#': flags |= F_ALT; continue;
9488 case '0': flags |= F_ZERO; continue;
9489 }
9490 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009491 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 if (c == '*') {
9493 v = getnextarg(args, arglen, &argidx);
9494 if (v == NULL)
9495 goto onError;
9496 if (!PyLong_Check(v)) {
9497 PyErr_SetString(PyExc_TypeError,
9498 "* wants int");
9499 goto onError;
9500 }
9501 width = PyLong_AsLong(v);
9502 if (width == -1 && PyErr_Occurred())
9503 goto onError;
9504 if (width < 0) {
9505 flags |= F_LJUST;
9506 width = -width;
9507 }
9508 if (--fmtcnt >= 0)
9509 c = *fmt++;
9510 }
9511 else if (c >= '0' && c <= '9') {
9512 width = c - '0';
9513 while (--fmtcnt >= 0) {
9514 c = *fmt++;
9515 if (c < '0' || c > '9')
9516 break;
9517 if ((width*10) / 10 != width) {
9518 PyErr_SetString(PyExc_ValueError,
9519 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009520 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 }
9522 width = width*10 + (c - '0');
9523 }
9524 }
9525 if (c == '.') {
9526 prec = 0;
9527 if (--fmtcnt >= 0)
9528 c = *fmt++;
9529 if (c == '*') {
9530 v = getnextarg(args, arglen, &argidx);
9531 if (v == NULL)
9532 goto onError;
9533 if (!PyLong_Check(v)) {
9534 PyErr_SetString(PyExc_TypeError,
9535 "* wants int");
9536 goto onError;
9537 }
9538 prec = PyLong_AsLong(v);
9539 if (prec == -1 && PyErr_Occurred())
9540 goto onError;
9541 if (prec < 0)
9542 prec = 0;
9543 if (--fmtcnt >= 0)
9544 c = *fmt++;
9545 }
9546 else if (c >= '0' && c <= '9') {
9547 prec = c - '0';
9548 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009549 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 if (c < '0' || c > '9')
9551 break;
9552 if ((prec*10) / 10 != prec) {
9553 PyErr_SetString(PyExc_ValueError,
9554 "prec too big");
9555 goto onError;
9556 }
9557 prec = prec*10 + (c - '0');
9558 }
9559 }
9560 } /* prec */
9561 if (fmtcnt >= 0) {
9562 if (c == 'h' || c == 'l' || c == 'L') {
9563 if (--fmtcnt >= 0)
9564 c = *fmt++;
9565 }
9566 }
9567 if (fmtcnt < 0) {
9568 PyErr_SetString(PyExc_ValueError,
9569 "incomplete format");
9570 goto onError;
9571 }
9572 if (c != '%') {
9573 v = getnextarg(args, arglen, &argidx);
9574 if (v == NULL)
9575 goto onError;
9576 }
9577 sign = 0;
9578 fill = ' ';
9579 switch (c) {
9580
9581 case '%':
9582 pbuf = formatbuf;
9583 /* presume that buffer length is at least 1 */
9584 pbuf[0] = '%';
9585 len = 1;
9586 break;
9587
9588 case 's':
9589 case 'r':
9590 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009591 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009592 temp = v;
9593 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009594 }
9595 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009596 if (c == 's')
9597 temp = PyObject_Str(v);
9598 else if (c == 'r')
9599 temp = PyObject_Repr(v);
9600 else
9601 temp = PyObject_ASCII(v);
9602 if (temp == NULL)
9603 goto onError;
9604 if (PyUnicode_Check(temp))
9605 /* nothing to do */;
9606 else {
9607 Py_DECREF(temp);
9608 PyErr_SetString(PyExc_TypeError,
9609 "%s argument has non-string str()");
9610 goto onError;
9611 }
9612 }
9613 pbuf = PyUnicode_AS_UNICODE(temp);
9614 len = PyUnicode_GET_SIZE(temp);
9615 if (prec >= 0 && len > prec)
9616 len = prec;
9617 break;
9618
9619 case 'i':
9620 case 'd':
9621 case 'u':
9622 case 'o':
9623 case 'x':
9624 case 'X':
9625 if (c == 'i')
9626 c = 'd';
9627 isnumok = 0;
9628 if (PyNumber_Check(v)) {
9629 PyObject *iobj=NULL;
9630
9631 if (PyLong_Check(v)) {
9632 iobj = v;
9633 Py_INCREF(iobj);
9634 }
9635 else {
9636 iobj = PyNumber_Long(v);
9637 }
9638 if (iobj!=NULL) {
9639 if (PyLong_Check(iobj)) {
9640 isnumok = 1;
9641 temp = formatlong(iobj, flags, prec, c);
9642 Py_DECREF(iobj);
9643 if (!temp)
9644 goto onError;
9645 pbuf = PyUnicode_AS_UNICODE(temp);
9646 len = PyUnicode_GET_SIZE(temp);
9647 sign = 1;
9648 }
9649 else {
9650 Py_DECREF(iobj);
9651 }
9652 }
9653 }
9654 if (!isnumok) {
9655 PyErr_Format(PyExc_TypeError,
9656 "%%%c format: a number is required, "
9657 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9658 goto onError;
9659 }
9660 if (flags & F_ZERO)
9661 fill = '0';
9662 break;
9663
9664 case 'e':
9665 case 'E':
9666 case 'f':
9667 case 'F':
9668 case 'g':
9669 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009670 temp = formatfloat(v, flags, prec, c);
9671 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009672 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009673 pbuf = PyUnicode_AS_UNICODE(temp);
9674 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 sign = 1;
9676 if (flags & F_ZERO)
9677 fill = '0';
9678 break;
9679
9680 case 'c':
9681 pbuf = formatbuf;
9682 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9683 if (len < 0)
9684 goto onError;
9685 break;
9686
9687 default:
9688 PyErr_Format(PyExc_ValueError,
9689 "unsupported format character '%c' (0x%x) "
9690 "at index %zd",
9691 (31<=c && c<=126) ? (char)c : '?',
9692 (int)c,
9693 (Py_ssize_t)(fmt - 1 -
9694 PyUnicode_AS_UNICODE(uformat)));
9695 goto onError;
9696 }
9697 if (sign) {
9698 if (*pbuf == '-' || *pbuf == '+') {
9699 sign = *pbuf++;
9700 len--;
9701 }
9702 else if (flags & F_SIGN)
9703 sign = '+';
9704 else if (flags & F_BLANK)
9705 sign = ' ';
9706 else
9707 sign = 0;
9708 }
9709 if (width < len)
9710 width = len;
9711 if (rescnt - (sign != 0) < width) {
9712 reslen -= rescnt;
9713 rescnt = width + fmtcnt + 100;
9714 reslen += rescnt;
9715 if (reslen < 0) {
9716 Py_XDECREF(temp);
9717 PyErr_NoMemory();
9718 goto onError;
9719 }
9720 if (_PyUnicode_Resize(&result, reslen) < 0) {
9721 Py_XDECREF(temp);
9722 goto onError;
9723 }
9724 res = PyUnicode_AS_UNICODE(result)
9725 + reslen - rescnt;
9726 }
9727 if (sign) {
9728 if (fill != ' ')
9729 *res++ = sign;
9730 rescnt--;
9731 if (width > len)
9732 width--;
9733 }
9734 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9735 assert(pbuf[0] == '0');
9736 assert(pbuf[1] == c);
9737 if (fill != ' ') {
9738 *res++ = *pbuf++;
9739 *res++ = *pbuf++;
9740 }
9741 rescnt -= 2;
9742 width -= 2;
9743 if (width < 0)
9744 width = 0;
9745 len -= 2;
9746 }
9747 if (width > len && !(flags & F_LJUST)) {
9748 do {
9749 --rescnt;
9750 *res++ = fill;
9751 } while (--width > len);
9752 }
9753 if (fill == ' ') {
9754 if (sign)
9755 *res++ = sign;
9756 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9757 assert(pbuf[0] == '0');
9758 assert(pbuf[1] == c);
9759 *res++ = *pbuf++;
9760 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009761 }
9762 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 Py_UNICODE_COPY(res, pbuf, len);
9764 res += len;
9765 rescnt -= len;
9766 while (--width >= len) {
9767 --rescnt;
9768 *res++ = ' ';
9769 }
9770 if (dict && (argidx < arglen) && c != '%') {
9771 PyErr_SetString(PyExc_TypeError,
9772 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009773 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009774 goto onError;
9775 }
9776 Py_XDECREF(temp);
9777 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009778 } /* until end */
9779 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009780 PyErr_SetString(PyExc_TypeError,
9781 "not all arguments converted during string formatting");
9782 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009783 }
9784
Thomas Woutersa96affe2006-03-12 00:29:36 +00009785 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009786 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009788 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789 }
9790 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791 return (PyObject *)result;
9792
Benjamin Peterson29060642009-01-31 22:14:21 +00009793 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794 Py_XDECREF(result);
9795 Py_DECREF(uformat);
9796 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009797 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798 }
9799 return NULL;
9800}
9801
Jeremy Hylton938ace62002-07-17 16:30:39 +00009802static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009803unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9804
Tim Peters6d6c1a32001-08-02 04:15:00 +00009805static PyObject *
9806unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9807{
Benjamin Peterson29060642009-01-31 22:14:21 +00009808 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009809 static char *kwlist[] = {"object", "encoding", "errors", 0};
9810 char *encoding = NULL;
9811 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009812
Benjamin Peterson14339b62009-01-31 16:36:08 +00009813 if (type != &PyUnicode_Type)
9814 return unicode_subtype_new(type, args, kwds);
9815 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009816 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009817 return NULL;
9818 if (x == NULL)
9819 return (PyObject *)_PyUnicode_New(0);
9820 if (encoding == NULL && errors == NULL)
9821 return PyObject_Str(x);
9822 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009823 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009824}
9825
Guido van Rossume023fe02001-08-30 03:12:59 +00009826static PyObject *
9827unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9828{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009829 PyUnicodeObject *tmp, *pnew;
9830 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009831
Benjamin Peterson14339b62009-01-31 16:36:08 +00009832 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9833 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9834 if (tmp == NULL)
9835 return NULL;
9836 assert(PyUnicode_Check(tmp));
9837 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9838 if (pnew == NULL) {
9839 Py_DECREF(tmp);
9840 return NULL;
9841 }
9842 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9843 if (pnew->str == NULL) {
9844 _Py_ForgetReference((PyObject *)pnew);
9845 PyObject_Del(pnew);
9846 Py_DECREF(tmp);
9847 return PyErr_NoMemory();
9848 }
9849 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9850 pnew->length = n;
9851 pnew->hash = tmp->hash;
9852 Py_DECREF(tmp);
9853 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009854}
9855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009856PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009857 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009858\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009859Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009860encoding defaults to the current default string encoding.\n\
9861errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009862
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009863static PyObject *unicode_iter(PyObject *seq);
9864
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009866 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009867 "str", /* tp_name */
9868 sizeof(PyUnicodeObject), /* tp_size */
9869 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009871 (destructor)unicode_dealloc, /* tp_dealloc */
9872 0, /* tp_print */
9873 0, /* tp_getattr */
9874 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009875 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009876 unicode_repr, /* tp_repr */
9877 &unicode_as_number, /* tp_as_number */
9878 &unicode_as_sequence, /* tp_as_sequence */
9879 &unicode_as_mapping, /* tp_as_mapping */
9880 (hashfunc) unicode_hash, /* tp_hash*/
9881 0, /* tp_call*/
9882 (reprfunc) unicode_str, /* tp_str */
9883 PyObject_GenericGetAttr, /* tp_getattro */
9884 0, /* tp_setattro */
9885 0, /* tp_as_buffer */
9886 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009887 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009888 unicode_doc, /* tp_doc */
9889 0, /* tp_traverse */
9890 0, /* tp_clear */
9891 PyUnicode_RichCompare, /* tp_richcompare */
9892 0, /* tp_weaklistoffset */
9893 unicode_iter, /* tp_iter */
9894 0, /* tp_iternext */
9895 unicode_methods, /* tp_methods */
9896 0, /* tp_members */
9897 0, /* tp_getset */
9898 &PyBaseObject_Type, /* tp_base */
9899 0, /* tp_dict */
9900 0, /* tp_descr_get */
9901 0, /* tp_descr_set */
9902 0, /* tp_dictoffset */
9903 0, /* tp_init */
9904 0, /* tp_alloc */
9905 unicode_new, /* tp_new */
9906 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907};
9908
9909/* Initialize the Unicode implementation */
9910
Thomas Wouters78890102000-07-22 19:25:51 +00009911void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009913 int i;
9914
Thomas Wouters477c8d52006-05-27 19:21:47 +00009915 /* XXX - move this array to unicodectype.c ? */
9916 Py_UNICODE linebreak[] = {
9917 0x000A, /* LINE FEED */
9918 0x000D, /* CARRIAGE RETURN */
9919 0x001C, /* FILE SEPARATOR */
9920 0x001D, /* GROUP SEPARATOR */
9921 0x001E, /* RECORD SEPARATOR */
9922 0x0085, /* NEXT LINE */
9923 0x2028, /* LINE SEPARATOR */
9924 0x2029, /* PARAGRAPH SEPARATOR */
9925 };
9926
Fred Drakee4315f52000-05-09 19:53:39 +00009927 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009928 free_list = NULL;
9929 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009931 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009932 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009933
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009934 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009935 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009936 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009938
9939 /* initialize the linebreak bloom filter */
9940 bloom_linebreak = make_bloom_mask(
9941 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9942 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009943
9944 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945}
9946
9947/* Finalize the Unicode implementation */
9948
Christian Heimesa156e092008-02-16 07:38:31 +00009949int
9950PyUnicode_ClearFreeList(void)
9951{
9952 int freelist_size = numfree;
9953 PyUnicodeObject *u;
9954
9955 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009956 PyUnicodeObject *v = u;
9957 u = *(PyUnicodeObject **)u;
9958 if (v->str)
9959 PyObject_DEL(v->str);
9960 Py_XDECREF(v->defenc);
9961 PyObject_Del(v);
9962 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009963 }
9964 free_list = NULL;
9965 assert(numfree == 0);
9966 return freelist_size;
9967}
9968
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969void
Thomas Wouters78890102000-07-22 19:25:51 +00009970_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009972 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009974 Py_XDECREF(unicode_empty);
9975 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009976
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009977 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009978 if (unicode_latin1[i]) {
9979 Py_DECREF(unicode_latin1[i]);
9980 unicode_latin1[i] = NULL;
9981 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009982 }
Christian Heimesa156e092008-02-16 07:38:31 +00009983 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009985
Walter Dörwald16807132007-05-25 13:52:07 +00009986void
9987PyUnicode_InternInPlace(PyObject **p)
9988{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009989 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9990 PyObject *t;
9991 if (s == NULL || !PyUnicode_Check(s))
9992 Py_FatalError(
9993 "PyUnicode_InternInPlace: unicode strings only please!");
9994 /* If it's a subclass, we don't really know what putting
9995 it in the interned dict might do. */
9996 if (!PyUnicode_CheckExact(s))
9997 return;
9998 if (PyUnicode_CHECK_INTERNED(s))
9999 return;
10000 if (interned == NULL) {
10001 interned = PyDict_New();
10002 if (interned == NULL) {
10003 PyErr_Clear(); /* Don't leave an exception */
10004 return;
10005 }
10006 }
10007 /* It might be that the GetItem call fails even
10008 though the key is present in the dictionary,
10009 namely when this happens during a stack overflow. */
10010 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010011 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010012 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010013
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 if (t) {
10015 Py_INCREF(t);
10016 Py_DECREF(*p);
10017 *p = t;
10018 return;
10019 }
Walter Dörwald16807132007-05-25 13:52:07 +000010020
Benjamin Peterson14339b62009-01-31 16:36:08 +000010021 PyThreadState_GET()->recursion_critical = 1;
10022 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10023 PyErr_Clear();
10024 PyThreadState_GET()->recursion_critical = 0;
10025 return;
10026 }
10027 PyThreadState_GET()->recursion_critical = 0;
10028 /* The two references in interned are not counted by refcnt.
10029 The deallocator will take care of this */
10030 Py_REFCNT(s) -= 2;
10031 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010032}
10033
10034void
10035PyUnicode_InternImmortal(PyObject **p)
10036{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010037 PyUnicode_InternInPlace(p);
10038 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10039 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10040 Py_INCREF(*p);
10041 }
Walter Dörwald16807132007-05-25 13:52:07 +000010042}
10043
10044PyObject *
10045PyUnicode_InternFromString(const char *cp)
10046{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010047 PyObject *s = PyUnicode_FromString(cp);
10048 if (s == NULL)
10049 return NULL;
10050 PyUnicode_InternInPlace(&s);
10051 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010052}
10053
10054void _Py_ReleaseInternedUnicodeStrings(void)
10055{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010056 PyObject *keys;
10057 PyUnicodeObject *s;
10058 Py_ssize_t i, n;
10059 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010060
Benjamin Peterson14339b62009-01-31 16:36:08 +000010061 if (interned == NULL || !PyDict_Check(interned))
10062 return;
10063 keys = PyDict_Keys(interned);
10064 if (keys == NULL || !PyList_Check(keys)) {
10065 PyErr_Clear();
10066 return;
10067 }
Walter Dörwald16807132007-05-25 13:52:07 +000010068
Benjamin Peterson14339b62009-01-31 16:36:08 +000010069 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10070 detector, interned unicode strings are not forcibly deallocated;
10071 rather, we give them their stolen references back, and then clear
10072 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010073
Benjamin Peterson14339b62009-01-31 16:36:08 +000010074 n = PyList_GET_SIZE(keys);
10075 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010077 for (i = 0; i < n; i++) {
10078 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10079 switch (s->state) {
10080 case SSTATE_NOT_INTERNED:
10081 /* XXX Shouldn't happen */
10082 break;
10083 case SSTATE_INTERNED_IMMORTAL:
10084 Py_REFCNT(s) += 1;
10085 immortal_size += s->length;
10086 break;
10087 case SSTATE_INTERNED_MORTAL:
10088 Py_REFCNT(s) += 2;
10089 mortal_size += s->length;
10090 break;
10091 default:
10092 Py_FatalError("Inconsistent interned string state.");
10093 }
10094 s->state = SSTATE_NOT_INTERNED;
10095 }
10096 fprintf(stderr, "total size of all interned strings: "
10097 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10098 "mortal/immortal\n", mortal_size, immortal_size);
10099 Py_DECREF(keys);
10100 PyDict_Clear(interned);
10101 Py_DECREF(interned);
10102 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010103}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010104
10105
10106/********************* Unicode Iterator **************************/
10107
10108typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010109 PyObject_HEAD
10110 Py_ssize_t it_index;
10111 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010112} unicodeiterobject;
10113
10114static void
10115unicodeiter_dealloc(unicodeiterobject *it)
10116{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010117 _PyObject_GC_UNTRACK(it);
10118 Py_XDECREF(it->it_seq);
10119 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010120}
10121
10122static int
10123unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10124{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010125 Py_VISIT(it->it_seq);
10126 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010127}
10128
10129static PyObject *
10130unicodeiter_next(unicodeiterobject *it)
10131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010132 PyUnicodeObject *seq;
10133 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010134
Benjamin Peterson14339b62009-01-31 16:36:08 +000010135 assert(it != NULL);
10136 seq = it->it_seq;
10137 if (seq == NULL)
10138 return NULL;
10139 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010140
Benjamin Peterson14339b62009-01-31 16:36:08 +000010141 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10142 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010143 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010144 if (item != NULL)
10145 ++it->it_index;
10146 return item;
10147 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010148
Benjamin Peterson14339b62009-01-31 16:36:08 +000010149 Py_DECREF(seq);
10150 it->it_seq = NULL;
10151 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010152}
10153
10154static PyObject *
10155unicodeiter_len(unicodeiterobject *it)
10156{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010157 Py_ssize_t len = 0;
10158 if (it->it_seq)
10159 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10160 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010161}
10162
10163PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10164
10165static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010166 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010167 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010168 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010169};
10170
10171PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010172 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10173 "str_iterator", /* tp_name */
10174 sizeof(unicodeiterobject), /* tp_basicsize */
10175 0, /* tp_itemsize */
10176 /* methods */
10177 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10178 0, /* tp_print */
10179 0, /* tp_getattr */
10180 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010181 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010182 0, /* tp_repr */
10183 0, /* tp_as_number */
10184 0, /* tp_as_sequence */
10185 0, /* tp_as_mapping */
10186 0, /* tp_hash */
10187 0, /* tp_call */
10188 0, /* tp_str */
10189 PyObject_GenericGetAttr, /* tp_getattro */
10190 0, /* tp_setattro */
10191 0, /* tp_as_buffer */
10192 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10193 0, /* tp_doc */
10194 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10195 0, /* tp_clear */
10196 0, /* tp_richcompare */
10197 0, /* tp_weaklistoffset */
10198 PyObject_SelfIter, /* tp_iter */
10199 (iternextfunc)unicodeiter_next, /* tp_iternext */
10200 unicodeiter_methods, /* tp_methods */
10201 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010202};
10203
10204static PyObject *
10205unicode_iter(PyObject *seq)
10206{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010207 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010208
Benjamin Peterson14339b62009-01-31 16:36:08 +000010209 if (!PyUnicode_Check(seq)) {
10210 PyErr_BadInternalCall();
10211 return NULL;
10212 }
10213 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10214 if (it == NULL)
10215 return NULL;
10216 it->it_index = 0;
10217 Py_INCREF(seq);
10218 it->it_seq = (PyUnicodeObject *)seq;
10219 _PyObject_GC_TRACK(it);
10220 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010221}
10222
Martin v. Löwis5b222132007-06-10 09:51:05 +000010223size_t
10224Py_UNICODE_strlen(const Py_UNICODE *u)
10225{
10226 int res = 0;
10227 while(*u++)
10228 res++;
10229 return res;
10230}
10231
10232Py_UNICODE*
10233Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10234{
10235 Py_UNICODE *u = s1;
10236 while ((*u++ = *s2++));
10237 return s1;
10238}
10239
10240Py_UNICODE*
10241Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10242{
10243 Py_UNICODE *u = s1;
10244 while ((*u++ = *s2++))
10245 if (n-- == 0)
10246 break;
10247 return s1;
10248}
10249
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010250Py_UNICODE*
10251Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10252{
10253 Py_UNICODE *u1 = s1;
10254 u1 += Py_UNICODE_strlen(u1);
10255 Py_UNICODE_strcpy(u1, s2);
10256 return s1;
10257}
10258
Martin v. Löwis5b222132007-06-10 09:51:05 +000010259int
10260Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10261{
10262 while (*s1 && *s2 && *s1 == *s2)
10263 s1++, s2++;
10264 if (*s1 && *s2)
10265 return (*s1 < *s2) ? -1 : +1;
10266 if (*s1)
10267 return 1;
10268 if (*s2)
10269 return -1;
10270 return 0;
10271}
10272
Victor Stinneref8d95c2010-08-16 22:03:11 +000010273int
10274Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10275{
10276 register Py_UNICODE u1, u2;
10277 for (; n != 0; n--) {
10278 u1 = *s1;
10279 u2 = *s2;
10280 if (u1 != u2)
10281 return (u1 < u2) ? -1 : +1;
10282 if (u1 == '\0')
10283 return 0;
10284 s1++;
10285 s2++;
10286 }
10287 return 0;
10288}
10289
Martin v. Löwis5b222132007-06-10 09:51:05 +000010290Py_UNICODE*
10291Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10292{
10293 const Py_UNICODE *p;
10294 for (p = s; *p; p++)
10295 if (*p == c)
10296 return (Py_UNICODE*)p;
10297 return NULL;
10298}
10299
Victor Stinner331ea922010-08-10 16:37:20 +000010300Py_UNICODE*
10301Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10302{
10303 const Py_UNICODE *p;
10304 p = s + Py_UNICODE_strlen(s);
10305 while (p != s) {
10306 p--;
10307 if (*p == c)
10308 return (Py_UNICODE*)p;
10309 }
10310 return NULL;
10311}
10312
Victor Stinner71133ff2010-09-01 23:43:53 +000010313Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010314PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010315{
10316 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10317 Py_UNICODE *copy;
10318 Py_ssize_t size;
10319
10320 /* Ensure we won't overflow the size. */
10321 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10322 PyErr_NoMemory();
10323 return NULL;
10324 }
10325 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10326 size *= sizeof(Py_UNICODE);
10327 copy = PyMem_Malloc(size);
10328 if (copy == NULL) {
10329 PyErr_NoMemory();
10330 return NULL;
10331 }
10332 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10333 return copy;
10334}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010335
Georg Brandl66c221e2010-10-14 07:04:07 +000010336/* A _string module, to export formatter_parser and formatter_field_name_split
10337 to the string.Formatter class implemented in Python. */
10338
10339static PyMethodDef _string_methods[] = {
10340 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10341 METH_O, PyDoc_STR("split the argument as a field name")},
10342 {"formatter_parser", (PyCFunction) formatter_parser,
10343 METH_O, PyDoc_STR("parse the argument as a format string")},
10344 {NULL, NULL}
10345};
10346
10347static struct PyModuleDef _string_module = {
10348 PyModuleDef_HEAD_INIT,
10349 "_string",
10350 PyDoc_STR("string helper module"),
10351 0,
10352 _string_methods,
10353 NULL,
10354 NULL,
10355 NULL,
10356 NULL
10357};
10358
10359PyMODINIT_FUNC
10360PyInit__string(void)
10361{
10362 return PyModule_Create(&_string_module);
10363}
10364
10365
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010366#ifdef __cplusplus
10367}
10368#endif