blob: cd4e9e9295c854e3792ea5f7e07233e51658d83d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
Serhiy Storchaka05997252013-01-26 12:14:02 +020083NOTE: In the interpreter's initialization phase, some globals are currently
84 initialized dynamically as needed. In the process Unicode objects may
85 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086
87*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
90#ifdef __cplusplus
91extern "C" {
92#endif
93
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200102static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000103
Guido van Rossumd57fd912000-03-10 22:53:23 +0000104/* Free list for Unicode objects */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200105static PyUnicodeObject *free_list = NULL;
106static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000108/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200109static PyUnicodeObject *unicode_empty = NULL;
110
/* Return a new reference to the shared empty string from the enclosing
   function.  The singleton is created with _PyUnicode_New(0) the first
   time it is needed; if that fails, NULL is returned. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL)                      \
            unicode_empty = _PyUnicode_New(0);          \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        return (PyObject *)unicode_empty;               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000122
123/* Single character Unicode strings in the Latin-1 range are being
124 shared as well. */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200125static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126
Christian Heimes190d79e2008-01-30 11:58:22 +0000127/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
157
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000158static PyObject *unicode_encode_call_errorhandler(const char *errors,
159 PyObject **errorHandler,const char *encoding, const char *reason,
160 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
161 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
162
Victor Stinner31be90b2010-04-22 19:38:16 +0000163static void raise_encode_exception(PyObject **exceptionObject,
164 const char *encoding,
165 const Py_UNICODE *unicode, Py_ssize_t size,
166 Py_ssize_t startpos, Py_ssize_t endpos,
167 const char *reason);
168
Christian Heimes190d79e2008-01-30 11:58:22 +0000169/* Same for linebreaks */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
196
197
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000199PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000200{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000201#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000202 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000203#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000204 /* This is actually an illegal character, so it should
205 not be passed to unichr. */
206 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000207#endif
208}
209
Thomas Wouters477c8d52006-05-27 19:21:47 +0000210/* --- Bloom Filters ----------------------------------------------------- */
211
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (as many as BLOOM_WIDTH allows) as the
   bit index. */
215
216/* the linebreak mask is set up by Unicode_Init below */
217
Antoine Pitrouf068f942010-01-13 14:19:12 +0000218#if LONG_BIT >= 128
219#define BLOOM_WIDTH 128
220#elif LONG_BIT >= 64
221#define BLOOM_WIDTH 64
222#elif LONG_BIT >= 32
223#define BLOOM_WIDTH 32
224#else
225#error "LONG_BIT is smaller than 32"
226#endif
227
/* Integer type that holds one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters.  It starts with every bit set,
   so BLOOM() accepts every character until a tighter mask is stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* BLOOM_ADD sets, and BLOOM tests, the bit that `ch` maps to in `mask`
   (the bit index is `ch` reduced modulo BLOOM_WIDTH).

   `mask` is parenthesized so that a compound argument such as `a | b` is
   treated as a single operand; without that, BLOOM(a | b, ch) would parse
   as a | (b & bit).  For BLOOM_ADD, `mask` must be a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Nonzero if `ch` is a line break.  Characters below 128 are looked up
   directly in ascii_linebreak; anything else is first screened against
   bloom_linebreak and only then passed to Py_UNICODE_ISLINEBREAK.
   Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000238
239Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
240{
241 /* calculate simple bloom-style bitmask for a given unicode string */
242
Antoine Pitrouf068f942010-01-13 14:19:12 +0000243 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000244 Py_ssize_t i;
245
246 mask = 0;
247 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000248 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000249
250 return mask;
251}
252
253Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
254{
255 Py_ssize_t i;
256
257 for (i = 0; i < setlen; i++)
258 if (set[i] == chr)
259 return 1;
260
261 return 0;
262}
263
/* Nonzero if `chr` is a member of the `setlen` characters at `set`.
   `mask` is the bloom mask of that set; it is consulted first so that
   most non-members are rejected without scanning the set.

   The whole expansion is parenthesized so the macro behaves as a single
   operand, e.g. under `!` or next to `==`.  `chr` is evaluated more than
   once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
266
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267/* --- Unicode Object ----------------------------------------------------- */
268
269static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272{
273 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000274
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000275 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000277 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279 /* Resizing shared object (unicode_empty or single character
280 objects) in-place is not allowed. Use PyUnicode_Resize()
281 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000284 (unicode->length == 1 &&
285 unicode->str[0] < 256U &&
286 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000288 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 return -1;
290 }
291
Thomas Wouters477c8d52006-05-27 19:21:47 +0000292 /* We allocate one more byte to make sure the string is Ux0000 terminated.
293 The overallocation is also used by fastsearch, which assumes that it's
294 safe to look at str[length] (without making any assumptions about what
295 it contains). */
296
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000298 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000301 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 PyErr_NoMemory();
303 return -1;
304 }
305 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307
Benjamin Peterson29060642009-01-31 22:14:21 +0000308 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000311 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
/* We allocate one more character than requested to make sure the string
   is Ux0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000430 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000431 }
432 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000433 *(PyUnicodeObject **)unicode = free_list;
434 free_list = unicode;
435 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436 }
437 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyObject_DEL(unicode->str);
439 Py_XDECREF(unicode->defenc);
440 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441 }
442}
443
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444static
445int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446{
447 register PyUnicodeObject *v;
448
449 /* Argument checks */
450 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 PyErr_BadInternalCall();
452 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000454 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000455 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000456 PyErr_BadInternalCall();
457 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000458 }
459
460 /* Resizing unicode_empty and single character objects is not
461 possible since these are being shared. We simply return a fresh
462 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000463 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000464 (v == unicode_empty || v->length == 1)) {
465 PyUnicodeObject *w = _PyUnicode_New(length);
466 if (w == NULL)
467 return -1;
468 Py_UNICODE_COPY(w->str, v->str,
469 length < v->length ? length : v->length);
470 Py_DECREF(*unicode);
471 *unicode = w;
472 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000473 }
474
475 /* Note that we don't have to modify *unicode for unshared Unicode
476 objects, since we can modify them in-place. */
477 return unicode_resize(v, length);
478}
479
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000480int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
481{
482 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
483}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000487{
488 PyUnicodeObject *unicode;
489
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000490 /* If the Unicode data is known at construction time, we can apply
491 some optimizations which share commonly used objects. */
492 if (u != NULL) {
493
Benjamin Peterson29060642009-01-31 22:14:21 +0000494 /* Optimization for empty strings */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200495 if (size == 0)
496 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +0000497
498 /* Single character Unicode objects in the Latin-1 range are
499 shared when using this constructor */
500 if (size == 1 && *u < 256) {
501 unicode = unicode_latin1[*u];
502 if (!unicode) {
503 unicode = _PyUnicode_New(1);
504 if (!unicode)
505 return NULL;
506 unicode->str[0] = *u;
507 unicode_latin1[*u] = unicode;
508 }
509 Py_INCREF(unicode);
510 return (PyObject *)unicode;
511 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode = _PyUnicode_New(size);
515 if (!unicode)
516 return NULL;
517
518 /* Copy the Unicode data into the new object */
519 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521
522 return (PyObject *)unicode;
523}
524
Walter Dörwaldd2034312007-05-18 16:29:38 +0000525PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000526{
527 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000528
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 if (size < 0) {
530 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000531 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 return NULL;
533 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000534
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000535 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000536 some optimizations which share commonly used objects.
537 Also, this means the input must be UTF-8, so fall back to the
538 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000539 if (u != NULL) {
540
Benjamin Peterson29060642009-01-31 22:14:21 +0000541 /* Optimization for empty strings */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200542 if (size == 0)
543 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +0000544
545 /* Single characters are shared when using this constructor.
546 Restrict to ASCII, since the input must be UTF-8. */
547 if (size == 1 && Py_CHARMASK(*u) < 128) {
548 unicode = unicode_latin1[Py_CHARMASK(*u)];
549 if (!unicode) {
550 unicode = _PyUnicode_New(1);
551 if (!unicode)
552 return NULL;
553 unicode->str[0] = Py_CHARMASK(*u);
554 unicode_latin1[Py_CHARMASK(*u)] = unicode;
555 }
556 Py_INCREF(unicode);
557 return (PyObject *)unicode;
558 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000559
560 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 }
562
Walter Dörwald55507312007-05-18 13:12:10 +0000563 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 if (!unicode)
565 return NULL;
566
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000567 return (PyObject *)unicode;
568}
569
Walter Dörwaldd2034312007-05-18 16:29:38 +0000570PyObject *PyUnicode_FromString(const char *u)
571{
572 size_t size = strlen(u);
573 if (size > PY_SSIZE_T_MAX) {
574 PyErr_SetString(PyExc_OverflowError, "input too long");
575 return NULL;
576 }
577
578 return PyUnicode_FromStringAndSize(u, size);
579}
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson081dfee2009-03-18 14:47:41 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 if (size == 0)
602 return PyUnicode_FromStringAndSize(NULL, 0);
603 PyErr_BadInternalCall();
604 return NULL;
605 }
606
607 if (size == -1) {
608 size = wcslen(w);
609 }
610
611 alloc = size;
612 orig_w = w;
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF)
615 alloc++;
616 w++;
617 }
618 w = orig_w;
619 unicode = _PyUnicode_New(alloc);
620 if (!unicode)
621 return NULL;
622
623 /* Copy the wchar_t data into the new object */
624 {
625 register Py_UNICODE *u;
626 u = PyUnicode_AS_UNICODE(unicode);
627 for (i = size; i > 0; i--) {
628 if (*w > 0xFFFF) {
629 wchar_t ordinal = *w++;
630 ordinal -= 0x10000;
631 *u++ = 0xD800 | (ordinal >> 10);
632 *u++ = 0xDC00 | (ordinal & 0x3FF);
633 }
634 else
635 *u++ = *w++;
636 }
637 }
638 return (PyObject *)unicode;
639}
640
641#else
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000644 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645{
646 PyUnicodeObject *unicode;
647
648 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == 0)
650 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_BadInternalCall();
652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 }
654
Martin v. Löwis790465f2008-04-05 20:41:37 +0000655 if (size == -1) {
656 size = wcslen(w);
657 }
658
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 unicode = _PyUnicode_New(size);
660 if (!unicode)
661 return NULL;
662
663 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000664#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000666#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000668 register Py_UNICODE *u;
669 register Py_ssize_t i;
670 u = PyUnicode_AS_UNICODE(unicode);
671 for (i = size; i > 0; i--)
672 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000673 }
674#endif
675
676 return (PyObject *)unicode;
677}
678
Mark Dickinson081dfee2009-03-18 14:47:41 +0000679#endif /* CONVERT_WCHAR_TO_SURROGATES */
680
681#undef CONVERT_WCHAR_TO_SURROGATES
682
Walter Dörwald346737f2007-05-31 10:44:43 +0000683static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000684makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
685 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000686{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000687 *fmt++ = '%';
688 if (width) {
689 if (zeropad)
690 *fmt++ = '0';
691 fmt += sprintf(fmt, "%d", width);
692 }
693 if (precision)
694 fmt += sprintf(fmt, ".%d", precision);
695 if (longflag)
696 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000697 else if (longlongflag) {
698 /* longlongflag should only ever be nonzero on machines with
699 HAVE_LONG_LONG defined */
700#ifdef HAVE_LONG_LONG
701 char *f = PY_FORMAT_LONG_LONG;
702 while (*f)
703 *fmt++ = *f++;
704#else
705 /* we shouldn't ever get here */
706 assert(0);
707 *fmt++ = 'l';
708#endif
709 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000710 else if (size_tflag) {
711 char *f = PY_FORMAT_SIZE_T;
712 while (*f)
713 *fmt++ = *f++;
714 }
715 *fmt++ = c;
716 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000717}
718
/* Append the NUL-terminated char string `string` at the output cursor.
   Relies on two locals of the expanding function: `s`, the write cursor
   (advanced past the copied characters), and `copy`, a scratch read pointer.
   Wrapped in do/while(0) so that `appendstring(x);` is exactly one statement
   and stays correct inside an unbraced if/else. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy;) \
            *s++ = *copy++; \
    } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
730
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731PyObject *
732PyUnicode_FromFormatV(const char *format, va_list vargs)
733{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000734 va_list count;
735 Py_ssize_t callcount = 0;
736 PyObject **callresults = NULL;
737 PyObject **callresult = NULL;
738 Py_ssize_t n = 0;
739 int width = 0;
740 int precision = 0;
741 int zeropad;
742 const char* f;
743 Py_UNICODE *s;
744 PyObject *string;
745 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 /* use abuffer instead of buffer, if we need more space
748 * (which can happen if there's a format specifier with width). */
749 char *abuffer = NULL;
750 char *realbuffer;
751 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000752 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000753 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000755 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000756 /* step 1: count the number of %S/%R/%A/%s format specifications
757 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
758 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
759 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000760 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000761 if (*f == '%') {
762 if (*(f+1)=='%')
763 continue;
Victor Stinner2b574a22011-03-01 22:48:49 +0000764 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000765 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000766 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000767 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000768 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 ;
770 if (*f == 's')
771 ++callcount;
772 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 else if (128 <= (unsigned char)*f) {
774 PyErr_Format(PyExc_ValueError,
775 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000776 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000777 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000778 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000780 }
781 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000782 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000783 if (callcount) {
784 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
785 if (!callresults) {
786 PyErr_NoMemory();
787 return NULL;
788 }
789 callresult = callresults;
790 }
791 /* step 3: figure out how large a buffer we need */
792 for (f = format; *f; f++) {
793 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000794#ifdef HAVE_LONG_LONG
795 int longlongflag = 0;
796#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 const char* p = f;
798 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000799 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000801 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000802 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000803
Benjamin Peterson14339b62009-01-31 16:36:08 +0000804 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
805 * they don't affect the amount of space we reserve.
806 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000807 if (*f == 'l') {
808 if (f[1] == 'd' || f[1] == 'u') {
809 ++f;
810 }
811#ifdef HAVE_LONG_LONG
812 else if (f[1] == 'l' &&
813 (f[2] == 'd' || f[2] == 'u')) {
814 longlongflag = 1;
815 f += 2;
816 }
817#endif
818 }
819 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000820 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000821 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000822
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 switch (*f) {
824 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +0000825 {
826#ifndef Py_UNICODE_WIDE
827 int ordinal = va_arg(count, int);
828 if (ordinal > 0xffff)
829 n += 2;
830 else
831 n++;
832#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000833 (void)va_arg(count, int);
Victor Stinner659eb842011-02-23 12:14:22 +0000834 n++;
835#endif
836 break;
837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000838 case '%':
839 n++;
840 break;
841 case 'd': case 'u': case 'i': case 'x':
842 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000843#ifdef HAVE_LONG_LONG
844 if (longlongflag) {
845 if (width < MAX_LONG_LONG_CHARS)
846 width = MAX_LONG_LONG_CHARS;
847 }
848 else
849#endif
850 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
851 including sign. Decimal takes the most space. This
852 isn't enough for octal. If a width is specified we
853 need more (which we allocate later). */
854 if (width < MAX_LONG_CHARS)
855 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000857 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000858 if (abuffersize < width)
859 abuffersize = width;
860 break;
861 case 's':
862 {
863 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000864 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000865 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
866 if (!str)
867 goto fail;
868 n += PyUnicode_GET_SIZE(str);
869 /* Remember the str and switch to the next slot */
870 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000871 break;
872 }
873 case 'U':
874 {
875 PyObject *obj = va_arg(count, PyObject *);
876 assert(obj && PyUnicode_Check(obj));
877 n += PyUnicode_GET_SIZE(obj);
878 break;
879 }
880 case 'V':
881 {
882 PyObject *obj = va_arg(count, PyObject *);
883 const char *str = va_arg(count, const char *);
Victor Stinner2b574a22011-03-01 22:48:49 +0000884 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000885 assert(obj || str);
886 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2b574a22011-03-01 22:48:49 +0000887 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000888 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2b574a22011-03-01 22:48:49 +0000889 *callresult++ = NULL;
890 }
891 else {
892 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
893 if (!str_obj)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str_obj);
896 *callresult++ = str_obj;
897 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000898 break;
899 }
900 case 'S':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *str;
904 assert(obj);
905 str = PyObject_Str(obj);
906 if (!str)
907 goto fail;
908 n += PyUnicode_GET_SIZE(str);
909 /* Remember the str and switch to the next slot */
910 *callresult++ = str;
911 break;
912 }
913 case 'R':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *repr;
917 assert(obj);
918 repr = PyObject_Repr(obj);
919 if (!repr)
920 goto fail;
921 n += PyUnicode_GET_SIZE(repr);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = repr;
924 break;
925 }
926 case 'A':
927 {
928 PyObject *obj = va_arg(count, PyObject *);
929 PyObject *ascii;
930 assert(obj);
931 ascii = PyObject_ASCII(obj);
932 if (!ascii)
933 goto fail;
934 n += PyUnicode_GET_SIZE(ascii);
935 /* Remember the repr and switch to the next slot */
936 *callresult++ = ascii;
937 break;
938 }
939 case 'p':
940 (void) va_arg(count, int);
941 /* maximum 64-bit pointer representation:
942 * 0xffffffffffffffff
943 * so 19 characters is enough.
944 * XXX I count 18 -- what's the extra for?
945 */
946 n += 19;
947 break;
948 default:
949 /* if we stumble upon an unknown
950 formatting code, copy the rest of
951 the format string to the output
952 string. (we cannot just skip the
953 code, since there's no way to know
954 what's in the argument list) */
955 n += strlen(p);
956 goto expand;
957 }
958 } else
959 n++;
960 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000961 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000962 if (abuffersize > ITEM_BUFFER_LEN) {
963 /* add 1 for sprintf's trailing null byte */
964 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000965 if (!abuffer) {
966 PyErr_NoMemory();
967 goto fail;
968 }
969 realbuffer = abuffer;
970 }
971 else
972 realbuffer = buffer;
973 /* step 4: fill the buffer */
974 /* Since we've analyzed how much space we need for the worst case,
975 we don't have to resize the string.
976 There can be no errors beyond this point. */
977 string = PyUnicode_FromUnicode(NULL, n);
978 if (!string)
979 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000980
Benjamin Peterson14339b62009-01-31 16:36:08 +0000981 s = PyUnicode_AS_UNICODE(string);
982 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000983
Benjamin Peterson14339b62009-01-31 16:36:08 +0000984 for (f = format; *f; f++) {
985 if (*f == '%') {
986 const char* p = f++;
987 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 int size_tflag = 0;
990 zeropad = (*f == '0');
991 /* parse the width.precision part */
992 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000993 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000994 width = (width*10) + *f++ - '0';
995 precision = 0;
996 if (*f == '.') {
997 f++;
David Malcolm96960882010-11-05 17:23:41 +0000998 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000999 precision = (precision*10) + *f++ - '0';
1000 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 /* Handle %ld, %lu, %lld and %llu. */
1002 if (*f == 'l') {
1003 if (f[1] == 'd' || f[1] == 'u') {
1004 longflag = 1;
1005 ++f;
1006 }
1007#ifdef HAVE_LONG_LONG
1008 else if (f[1] == 'l' &&
1009 (f[2] == 'd' || f[2] == 'u')) {
1010 longlongflag = 1;
1011 f += 2;
1012 }
1013#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001014 }
1015 /* handle the size_t flag. */
1016 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1017 size_tflag = 1;
1018 ++f;
1019 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 switch (*f) {
1022 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +00001023 {
1024 int ordinal = va_arg(vargs, int);
1025#ifndef Py_UNICODE_WIDE
1026 if (ordinal > 0xffff) {
1027 ordinal -= 0x10000;
1028 *s++ = 0xD800 | (ordinal >> 10);
1029 *s++ = 0xDC00 | (ordinal & 0x3FF);
1030 } else
1031#endif
1032 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 break;
Victor Stinner659eb842011-02-23 12:14:22 +00001034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001036 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1037 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 if (longflag)
1039 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001040#ifdef HAVE_LONG_LONG
1041 else if (longlongflag)
1042 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1043#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 else if (size_tflag)
1045 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1046 else
1047 sprintf(realbuffer, fmt, va_arg(vargs, int));
1048 appendstring(realbuffer);
1049 break;
1050 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001051 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1052 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001053 if (longflag)
1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001055#ifdef HAVE_LONG_LONG
1056 else if (longlongflag)
1057 sprintf(realbuffer, fmt, va_arg(vargs,
1058 unsigned PY_LONG_LONG));
1059#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001060 else if (size_tflag)
1061 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1062 else
1063 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1064 appendstring(realbuffer);
1065 break;
1066 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001067 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 sprintf(realbuffer, fmt, va_arg(vargs, int));
1069 appendstring(realbuffer);
1070 break;
1071 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001072 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 sprintf(realbuffer, fmt, va_arg(vargs, int));
1074 appendstring(realbuffer);
1075 break;
1076 case 's':
1077 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001078 /* unused, since we already have the result */
1079 (void) va_arg(vargs, char *);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1081 PyUnicode_GET_SIZE(*callresult));
1082 s += PyUnicode_GET_SIZE(*callresult);
1083 /* We're done with the unicode()/repr() => forget it */
1084 Py_DECREF(*callresult);
1085 /* switch to next unicode()/repr() result */
1086 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001087 break;
1088 }
1089 case 'U':
1090 {
1091 PyObject *obj = va_arg(vargs, PyObject *);
1092 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1093 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1094 s += size;
1095 break;
1096 }
1097 case 'V':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2b574a22011-03-01 22:48:49 +00001100 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001101 if (obj) {
1102 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1103 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1104 s += size;
1105 } else {
Victor Stinner2b574a22011-03-01 22:48:49 +00001106 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1107 PyUnicode_GET_SIZE(*callresult));
1108 s += PyUnicode_GET_SIZE(*callresult);
1109 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001110 }
Victor Stinner2b574a22011-03-01 22:48:49 +00001111 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001112 break;
1113 }
1114 case 'S':
1115 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001116 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001117 {
1118 Py_UNICODE *ucopy;
1119 Py_ssize_t usize;
1120 Py_ssize_t upos;
1121 /* unused, since we already have the result */
1122 (void) va_arg(vargs, PyObject *);
1123 ucopy = PyUnicode_AS_UNICODE(*callresult);
1124 usize = PyUnicode_GET_SIZE(*callresult);
1125 for (upos = 0; upos<usize;)
1126 *s++ = ucopy[upos++];
1127 /* We're done with the unicode()/repr() => forget it */
1128 Py_DECREF(*callresult);
1129 /* switch to next unicode()/repr() result */
1130 ++callresult;
1131 break;
1132 }
1133 case 'p':
1134 sprintf(buffer, "%p", va_arg(vargs, void*));
1135 /* %p is ill-defined: ensure leading 0x. */
1136 if (buffer[1] == 'X')
1137 buffer[1] = 'x';
1138 else if (buffer[1] != 'x') {
1139 memmove(buffer+2, buffer, strlen(buffer)+1);
1140 buffer[0] = '0';
1141 buffer[1] = 'x';
1142 }
1143 appendstring(buffer);
1144 break;
1145 case '%':
1146 *s++ = '%';
1147 break;
1148 default:
1149 appendstring(p);
1150 goto end;
1151 }
Victor Stinner1205f272010-09-11 00:54:47 +00001152 }
Victor Stinner1205f272010-09-11 00:54:47 +00001153 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 *s++ = *f;
1155 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
Benjamin Peterson29060642009-01-31 22:14:21 +00001157 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 if (callresults)
1159 PyObject_Free(callresults);
1160 if (abuffer)
1161 PyObject_Free(abuffer);
1162 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1163 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001164 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001165 if (callresults) {
1166 PyObject **callresult2 = callresults;
1167 while (callresult2 < callresult) {
Victor Stinner2b574a22011-03-01 22:48:49 +00001168 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001169 ++callresult2;
1170 }
1171 PyObject_Free(callresults);
1172 }
1173 if (abuffer)
1174 PyObject_Free(abuffer);
1175 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001176}
1177
1178#undef appendstring
1179
1180PyObject *
1181PyUnicode_FromFormat(const char *format, ...)
1182{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001183 PyObject* ret;
1184 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001185
1186#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001187 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001188#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001189 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001190#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 ret = PyUnicode_FromFormatV(format, vargs);
1192 va_end(vargs);
1193 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001194}
1195
Victor Stinner5593d8a2010-10-02 11:11:27 +00001196/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1197 convert a Unicode object to a wide character string.
1198
Victor Stinnerd88d9832011-09-06 02:00:05 +02001199 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001200 character) required to convert the unicode object. Ignore size argument.
1201
Victor Stinnerd88d9832011-09-06 02:00:05 +02001202 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001203 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001204 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001205static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001206unicode_aswidechar(PyUnicodeObject *unicode,
1207 wchar_t *w,
1208 Py_ssize_t size)
1209{
1210#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001211 Py_ssize_t res;
1212 if (w != NULL) {
1213 res = PyUnicode_GET_SIZE(unicode);
1214 if (size > res)
1215 size = res + 1;
1216 else
1217 res = size;
1218 memcpy(w, unicode->str, size * sizeof(wchar_t));
1219 return res;
1220 }
1221 else
1222 return PyUnicode_GET_SIZE(unicode) + 1;
1223#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1224 register const Py_UNICODE *u;
1225 const Py_UNICODE *uend;
1226 const wchar_t *worig, *wend;
1227 Py_ssize_t nchar;
1228
Victor Stinner137c34c2010-09-29 10:25:54 +00001229 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001230 uend = u + PyUnicode_GET_SIZE(unicode);
1231 if (w != NULL) {
1232 worig = w;
1233 wend = w + size;
1234 while (u != uend && w != wend) {
1235 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1236 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1237 {
1238 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1239 u += 2;
1240 }
1241 else {
1242 *w = *u;
1243 u++;
1244 }
1245 w++;
1246 }
1247 if (w != wend)
1248 *w = L'\0';
1249 return w - worig;
1250 }
1251 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001252 nchar = 1; /* null character at the end */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001253 while (u != uend) {
1254 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1255 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1256 u += 2;
1257 else
1258 u++;
1259 nchar++;
1260 }
1261 }
1262 return nchar;
1263#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1264 register Py_UNICODE *u, *uend, ordinal;
1265 register Py_ssize_t i;
1266 wchar_t *worig, *wend;
1267 Py_ssize_t nchar;
1268
1269 u = PyUnicode_AS_UNICODE(unicode);
1270 uend = u + PyUnicode_GET_SIZE(u);
1271 if (w != NULL) {
1272 worig = w;
1273 wend = w + size;
1274 while (u != uend && w != wend) {
1275 ordinal = *u;
1276 if (ordinal > 0xffff) {
1277 ordinal -= 0x10000;
1278 *w++ = 0xD800 | (ordinal >> 10);
1279 *w++ = 0xDC00 | (ordinal & 0x3FF);
1280 }
1281 else
1282 *w++ = ordinal;
1283 u++;
1284 }
1285 if (w != wend)
1286 *w = 0;
1287 return w - worig;
1288 }
1289 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001290 nchar = 1; /* null character */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001291 while (u != uend) {
1292 if (*u > 0xffff)
1293 nchar += 2;
1294 else
1295 nchar++;
1296 u++;
1297 }
1298 return nchar;
1299 }
1300#else
1301# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001302#endif
1303}
1304
1305Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001306PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001307 wchar_t *w,
1308 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001309{
1310 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001311 PyErr_BadInternalCall();
1312 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315}
1316
Victor Stinner137c34c2010-09-29 10:25:54 +00001317wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001318PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001319 Py_ssize_t *size)
1320{
1321 wchar_t* buffer;
1322 Py_ssize_t buflen;
1323
1324 if (unicode == NULL) {
1325 PyErr_BadInternalCall();
1326 return NULL;
1327 }
1328
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001329 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001330 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001331 PyErr_NoMemory();
1332 return NULL;
1333 }
1334
Victor Stinner137c34c2010-09-29 10:25:54 +00001335 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1336 if (buffer == NULL) {
1337 PyErr_NoMemory();
1338 return NULL;
1339 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001340 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001341 if (size != NULL)
1342 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 return buffer;
1344}
1345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346#endif
1347
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001348PyObject *PyUnicode_FromOrdinal(int ordinal)
1349{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001350 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001351
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001352 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 PyErr_SetString(PyExc_ValueError,
1354 "chr() arg not in range(0x110000)");
1355 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001356 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001357
1358#ifndef Py_UNICODE_WIDE
1359 if (ordinal > 0xffff) {
1360 ordinal -= 0x10000;
1361 s[0] = 0xD800 | (ordinal >> 10);
1362 s[1] = 0xDC00 | (ordinal & 0x3FF);
1363 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001364 }
1365#endif
1366
Hye-Shik Chang40574832004-04-06 07:24:51 +00001367 s[0] = (Py_UNICODE)ordinal;
1368 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001369}
1370
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371PyObject *PyUnicode_FromObject(register PyObject *obj)
1372{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001373 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 Py_INCREF(obj);
1377 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001378 }
1379 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 /* For a Unicode subtype that's not a Unicode object,
1381 return a true Unicode object with the same data. */
1382 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1383 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001384 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001385 PyErr_Format(PyExc_TypeError,
1386 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001387 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001388 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001389}
1390
1391PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001392 const char *encoding,
1393 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001394{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001395 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001396 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001397
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 PyErr_BadInternalCall();
1400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001402
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001403 /* Decoding bytes objects is the most common case and should be fast */
1404 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02001405 if (PyBytes_GET_SIZE(obj) == 0)
1406 _Py_RETURN_UNICODE_EMPTY();
1407 v = PyUnicode_Decode(
1408 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1409 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001410 return v;
1411 }
1412
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001413 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001414 PyErr_SetString(PyExc_TypeError,
1415 "decoding str is not supported");
1416 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001417 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001418
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001419 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1420 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1421 PyErr_Format(PyExc_TypeError,
1422 "coercing to str: need bytes, bytearray "
1423 "or buffer-like object, %.80s found",
1424 Py_TYPE(obj)->tp_name);
1425 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001426 }
Tim Petersced69f82003-09-16 20:30:58 +00001427
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001428 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02001429 PyBuffer_Release(&buffer);
1430 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001432
Serhiy Storchaka05997252013-01-26 12:14:02 +02001433 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001435 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436}
1437
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The normalized, null-terminated name is written to
   `lower`, a buffer of `lower_len` bytes.

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is
   0), 1 on success.  On error the contents of `lower` are not
   null-terminated and must not be used. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator.  Bail out before
       computing &lower[lower_len - 1], whose index would wrap around and
       disable the bound check below. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* last slot is reserved for the terminating null */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
1470
1471PyObject *PyUnicode_Decode(const char *s,
1472 Py_ssize_t size,
1473 const char *encoding,
1474 const char *errors)
1475{
1476 PyObject *buffer = NULL, *unicode;
1477 Py_buffer info;
1478 char lower[11]; /* Enough for any encoding shortcut */
1479
1480 if (encoding == NULL)
1481 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001482
1483 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01001484 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Victor Stinner37296e82010-06-10 13:36:23 +00001485 if (strcmp(lower, "utf-8") == 0)
1486 return PyUnicode_DecodeUTF8(s, size, errors);
1487 else if ((strcmp(lower, "latin-1") == 0) ||
1488 (strcmp(lower, "iso-8859-1") == 0))
1489 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001490#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001491 else if (strcmp(lower, "mbcs") == 0)
1492 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001493#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001494 else if (strcmp(lower, "ascii") == 0)
1495 return PyUnicode_DecodeASCII(s, size, errors);
1496 else if (strcmp(lower, "utf-16") == 0)
1497 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1498 else if (strcmp(lower, "utf-32") == 0)
1499 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501
1502 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001503 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001504 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001505 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001506 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 if (buffer == NULL)
1508 goto onError;
1509 unicode = PyCodec_Decode(buffer, encoding, errors);
1510 if (unicode == NULL)
1511 goto onError;
1512 if (!PyUnicode_Check(unicode)) {
1513 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001514 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001515 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 Py_DECREF(unicode);
1517 goto onError;
1518 }
1519 Py_DECREF(buffer);
1520 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001521
Benjamin Peterson29060642009-01-31 22:14:21 +00001522 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523 Py_XDECREF(buffer);
1524 return NULL;
1525}
1526
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001527PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1528 const char *encoding,
1529 const char *errors)
1530{
1531 PyObject *v;
1532
1533 if (!PyUnicode_Check(unicode)) {
1534 PyErr_BadArgument();
1535 goto onError;
1536 }
1537
1538 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001539 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001540
1541 /* Decode via the codec registry */
1542 v = PyCodec_Decode(unicode, encoding, errors);
1543 if (v == NULL)
1544 goto onError;
1545 return v;
1546
Benjamin Peterson29060642009-01-31 22:14:21 +00001547 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001548 return NULL;
1549}
1550
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1552 const char *encoding,
1553 const char *errors)
1554{
1555 PyObject *v;
1556
1557 if (!PyUnicode_Check(unicode)) {
1558 PyErr_BadArgument();
1559 goto onError;
1560 }
1561
1562 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001563 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001564
1565 /* Decode via the codec registry */
1566 v = PyCodec_Decode(unicode, encoding, errors);
1567 if (v == NULL)
1568 goto onError;
1569 if (!PyUnicode_Check(v)) {
1570 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001571 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001572 Py_TYPE(v)->tp_name);
1573 Py_DECREF(v);
1574 goto onError;
1575 }
1576 return v;
1577
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001579 return NULL;
1580}
1581
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001583 Py_ssize_t size,
1584 const char *encoding,
1585 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586{
1587 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001588
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 unicode = PyUnicode_FromUnicode(s, size);
1590 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1593 Py_DECREF(unicode);
1594 return v;
1595}
1596
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001597PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1598 const char *encoding,
1599 const char *errors)
1600{
1601 PyObject *v;
1602
1603 if (!PyUnicode_Check(unicode)) {
1604 PyErr_BadArgument();
1605 goto onError;
1606 }
1607
1608 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001609 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001610
1611 /* Encode via the codec registry */
1612 v = PyCodec_Encode(unicode, encoding, errors);
1613 if (v == NULL)
1614 goto onError;
1615 return v;
1616
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001618 return NULL;
1619}
1620
Victor Stinnerad158722010-10-27 00:25:46 +00001621PyObject *
1622PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001623{
Victor Stinner313a1202010-06-11 23:56:51 +00001624#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001625 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1626 PyUnicode_GET_SIZE(unicode),
1627 NULL);
1628#elif defined(__APPLE__)
1629 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1630 PyUnicode_GET_SIZE(unicode),
1631 "surrogateescape");
1632#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001633 PyInterpreterState *interp = PyThreadState_GET()->interp;
1634 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1635 cannot use it to encode and decode filenames before it is loaded. Load
1636 the Python codec requires to encode at least its own filename. Use the C
1637 version of the locale codec until the codec registry is initialized and
1638 the Python codec is loaded.
1639
1640 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1641 cannot only rely on it: check also interp->fscodec_initialized for
1642 subinterpreters. */
1643 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001644 return PyUnicode_AsEncodedString(unicode,
1645 Py_FileSystemDefaultEncoding,
1646 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001647 }
1648 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001649 /* locale encoding with surrogateescape */
1650 wchar_t *wchar;
1651 char *bytes;
1652 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001653 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001654
1655 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1656 if (wchar == NULL)
1657 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001658 bytes = _Py_wchar2char(wchar, &error_pos);
1659 if (bytes == NULL) {
1660 if (error_pos != (size_t)-1) {
1661 char *errmsg = strerror(errno);
1662 PyObject *exc = NULL;
1663 if (errmsg == NULL)
1664 errmsg = "Py_wchar2char() failed";
1665 raise_encode_exception(&exc,
1666 "filesystemencoding",
1667 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1668 error_pos, error_pos+1,
1669 errmsg);
1670 Py_XDECREF(exc);
1671 }
1672 else
1673 PyErr_NoMemory();
1674 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001675 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001676 }
1677 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001678
1679 bytes_obj = PyBytes_FromString(bytes);
1680 PyMem_Free(bytes);
1681 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001682 }
Victor Stinnerad158722010-10-27 00:25:46 +00001683#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001684}
1685
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1687 const char *encoding,
1688 const char *errors)
1689{
1690 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001691 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001692
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 if (!PyUnicode_Check(unicode)) {
1694 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001695 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 }
Fred Drakee4315f52000-05-09 19:53:39 +00001697
Tim Petersced69f82003-09-16 20:30:58 +00001698 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001699 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001700
1701 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01001702 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Victor Stinner37296e82010-06-10 13:36:23 +00001703 if (strcmp(lower, "utf-8") == 0)
1704 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1705 PyUnicode_GET_SIZE(unicode),
1706 errors);
1707 else if ((strcmp(lower, "latin-1") == 0) ||
1708 (strcmp(lower, "iso-8859-1") == 0))
1709 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1710 PyUnicode_GET_SIZE(unicode),
1711 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001712#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001713 else if (strcmp(lower, "mbcs") == 0)
1714 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1715 PyUnicode_GET_SIZE(unicode),
1716 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001717#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001718 else if (strcmp(lower, "ascii") == 0)
1719 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1720 PyUnicode_GET_SIZE(unicode),
1721 errors);
1722 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001723 /* During bootstrap, we may need to find the encodings
1724 package, to load the file system encoding, and require the
1725 file system encoding in order to load the encodings
1726 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001727
Victor Stinner59e62db2010-05-15 13:14:32 +00001728 Break out of this dependency by assuming that the path to
1729 the encodings module is ASCII-only. XXX could try wcstombs
1730 instead, if the file system encoding is the locale's
1731 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001732 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001733 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1734 !PyThreadState_GET()->interp->codecs_initialized)
1735 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1736 PyUnicode_GET_SIZE(unicode),
1737 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738
1739 /* Encode via the codec registry */
1740 v = PyCodec_Encode(unicode, encoding, errors);
1741 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001742 return NULL;
1743
1744 /* The normal path */
1745 if (PyBytes_Check(v))
1746 return v;
1747
1748 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001749 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001750 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001751 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001752
1753 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1754 "encoder %s returned bytearray instead of bytes",
1755 encoding);
1756 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001757 Py_DECREF(v);
1758 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001759 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001760
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001761 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1762 Py_DECREF(v);
1763 return b;
1764 }
1765
1766 PyErr_Format(PyExc_TypeError,
1767 "encoder did not return a bytes object (type=%.400s)",
1768 Py_TYPE(v)->tp_name);
1769 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001770 return NULL;
1771}
1772
1773PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1774 const char *encoding,
1775 const char *errors)
1776{
1777 PyObject *v;
1778
1779 if (!PyUnicode_Check(unicode)) {
1780 PyErr_BadArgument();
1781 goto onError;
1782 }
1783
1784 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001786
1787 /* Encode via the codec registry */
1788 v = PyCodec_Encode(unicode, encoding, errors);
1789 if (v == NULL)
1790 goto onError;
1791 if (!PyUnicode_Check(v)) {
1792 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001793 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001794 Py_TYPE(v)->tp_name);
1795 Py_DECREF(v);
1796 goto onError;
1797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001799
Benjamin Peterson29060642009-01-31 22:14:21 +00001800 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 return NULL;
1802}
1803
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001804PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001805 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001806{
1807 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001808 if (v)
1809 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001810 if (errors != NULL)
1811 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001812 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001813 PyUnicode_GET_SIZE(unicode),
1814 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001815 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001816 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001817 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001818 return v;
1819}
1820
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001821PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001822PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001823 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001824 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1825}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001826
Christian Heimes5894ba72007-11-04 11:43:14 +00001827PyObject*
1828PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1829{
Victor Stinnerad158722010-10-27 00:25:46 +00001830#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1831 return PyUnicode_DecodeMBCS(s, size, NULL);
1832#elif defined(__APPLE__)
1833 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1834#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001835 PyInterpreterState *interp = PyThreadState_GET()->interp;
1836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1837 cannot use it to encode and decode filenames before it is loaded. Load
1838 the Python codec requires to encode at least its own filename. Use the C
1839 version of the locale codec until the codec registry is initialized and
1840 the Python codec is loaded.
1841
1842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1843 cannot only rely on it: check also interp->fscodec_initialized for
1844 subinterpreters. */
1845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001846 return PyUnicode_Decode(s, size,
1847 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001848 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001849 }
1850 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001851 /* locale encoding with surrogateescape */
1852 wchar_t *wchar;
1853 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001854 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001855
1856 if (s[size] != '\0' || size != strlen(s)) {
1857 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1858 return NULL;
1859 }
1860
Victor Stinner168e1172010-10-16 23:16:16 +00001861 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001862 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001863 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001864
Victor Stinner168e1172010-10-16 23:16:16 +00001865 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001866 PyMem_Free(wchar);
1867 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001868 }
Victor Stinnerad158722010-10-27 00:25:46 +00001869#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001870}
1871
Martin v. Löwis011e8422009-05-05 04:43:17 +00001872
1873int
Antoine Pitrou13348842012-01-29 18:36:34 +01001874_PyUnicode_HasNULChars(PyObject* s)
1875{
1876 static PyObject *nul = NULL;
1877
1878 if (nul == NULL)
1879 nul = PyUnicode_FromStringAndSize("\0", 1);
1880 if (nul == NULL)
1881 return -1;
1882 return PyUnicode_Contains(s, nul);
1883}
1884
1885
1886int
Martin v. Löwis011e8422009-05-05 04:43:17 +00001887PyUnicode_FSConverter(PyObject* arg, void* addr)
1888{
1889 PyObject *output = NULL;
1890 Py_ssize_t size;
1891 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001892 if (arg == NULL) {
1893 Py_DECREF(*(PyObject**)addr);
1894 return 1;
1895 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001896 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001897 output = arg;
1898 Py_INCREF(output);
1899 }
1900 else {
1901 arg = PyUnicode_FromObject(arg);
1902 if (!arg)
1903 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001904 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001905 Py_DECREF(arg);
1906 if (!output)
1907 return 0;
1908 if (!PyBytes_Check(output)) {
1909 Py_DECREF(output);
1910 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1911 return 0;
1912 }
1913 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001914 size = PyBytes_GET_SIZE(output);
1915 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001916 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05001917 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001918 Py_DECREF(output);
1919 return 0;
1920 }
1921 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001922 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001923}
1924
1925
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001926int
1927PyUnicode_FSDecoder(PyObject* arg, void* addr)
1928{
1929 PyObject *output = NULL;
1930 Py_ssize_t size;
1931 void *data;
1932 if (arg == NULL) {
1933 Py_DECREF(*(PyObject**)addr);
1934 return 1;
1935 }
1936 if (PyUnicode_Check(arg)) {
1937 output = arg;
1938 Py_INCREF(output);
1939 }
1940 else {
1941 arg = PyBytes_FromObject(arg);
1942 if (!arg)
1943 return 0;
1944 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1945 PyBytes_GET_SIZE(arg));
1946 Py_DECREF(arg);
1947 if (!output)
1948 return 0;
1949 if (!PyUnicode_Check(output)) {
1950 Py_DECREF(output);
1951 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1952 return 0;
1953 }
1954 }
1955 size = PyUnicode_GET_SIZE(output);
1956 data = PyUnicode_AS_UNICODE(output);
1957 if (size != Py_UNICODE_strlen(data)) {
1958 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1959 Py_DECREF(output);
1960 return 0;
1961 }
1962 *(PyObject**)addr = output;
1963 return Py_CLEANUP_SUPPORTED;
1964}
1965
1966
Martin v. Löwis5b222132007-06-10 09:51:05 +00001967char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001968_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001969{
Christian Heimesf3863112007-11-22 07:46:41 +00001970 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001971 if (!PyUnicode_Check(unicode)) {
1972 PyErr_BadArgument();
1973 return NULL;
1974 }
Christian Heimesf3863112007-11-22 07:46:41 +00001975 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1976 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001977 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001978 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001979 *psize = PyBytes_GET_SIZE(bytes);
1980 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001981}
1982
1983char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001984_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001985{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001986 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001987}
1988
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1990{
1991 if (!PyUnicode_Check(unicode)) {
1992 PyErr_BadArgument();
1993 goto onError;
1994 }
1995 return PyUnicode_AS_UNICODE(unicode);
1996
Benjamin Peterson29060642009-01-31 22:14:21 +00001997 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998 return NULL;
1999}
2000
Martin v. Löwis18e16552006-02-15 17:27:45 +00002001Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002002{
2003 if (!PyUnicode_Check(unicode)) {
2004 PyErr_BadArgument();
2005 goto onError;
2006 }
2007 return PyUnicode_GET_SIZE(unicode);
2008
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 return -1;
2011}
2012
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2017
Victor Stinner554f3f02010-06-16 23:33:54 +00002018/* create or adjust a UnicodeDecodeError */
2019static void
2020make_decode_exception(PyObject **exceptionObject,
2021 const char *encoding,
2022 const char *input, Py_ssize_t length,
2023 Py_ssize_t startpos, Py_ssize_t endpos,
2024 const char *reason)
2025{
2026 if (*exceptionObject == NULL) {
2027 *exceptionObject = PyUnicodeDecodeError_Create(
2028 encoding, input, length, startpos, endpos, reason);
2029 }
2030 else {
2031 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2032 goto onError;
2033 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2034 goto onError;
2035 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2036 goto onError;
2037 }
2038 return;
2039
2040onError:
2041 Py_DECREF(*exceptionObject);
2042 *exceptionObject = NULL;
2043}
2044
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045/* error handling callback helper:
2046 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002047 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 and adjust various state variables.
2049 return 0 on success, -1 on error
2050*/
2051
2052static
2053int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 const char *encoding, const char *reason,
2055 const char **input, const char **inend, Py_ssize_t *startinpos,
2056 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2057 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002059 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060
2061 PyObject *restuple = NULL;
2062 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002063 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002064 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002065 Py_ssize_t requiredsize;
2066 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002068 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002069 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 int res = -1;
2071
2072 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002073 *errorHandler = PyCodec_LookupError(errors);
2074 if (*errorHandler == NULL)
2075 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 }
2077
Victor Stinner554f3f02010-06-16 23:33:54 +00002078 make_decode_exception(exceptionObject,
2079 encoding,
2080 *input, *inend - *input,
2081 *startinpos, *endinpos,
2082 reason);
2083 if (*exceptionObject == NULL)
2084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085
2086 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2087 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002090 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002092 }
2093 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002094 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002095
2096 /* Copy back the bytes variables, which might have been modified by the
2097 callback */
2098 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2099 if (!inputobj)
2100 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002101 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002102 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002103 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002104 *input = PyBytes_AS_STRING(inputobj);
2105 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002106 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002107 /* we can DECREF safely, as the exception has another reference,
2108 so the object won't go away. */
2109 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002110
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002112 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002113 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002114 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2115 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002116 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117
2118 /* need more space? (at least enough for what we
2119 have+the replacement+the rest of the string (starting
2120 at the new input position), so we won't have to check space
2121 when there are no errors in the rest of the string) */
2122 repptr = PyUnicode_AS_UNICODE(repunicode);
2123 repsize = PyUnicode_GET_SIZE(repunicode);
2124 requiredsize = *outpos + repsize + insize-newpos;
2125 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002126 if (requiredsize<2*outsize)
2127 requiredsize = 2*outsize;
2128 if (_PyUnicode_Resize(output, requiredsize) < 0)
2129 goto onError;
2130 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 }
2132 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002133 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 Py_UNICODE_COPY(*outptr, repptr, repsize);
2135 *outptr += repsize;
2136 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002137
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 /* we made it! */
2139 res = 0;
2140
Benjamin Peterson29060642009-01-31 22:14:21 +00002141 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 Py_XDECREF(restuple);
2143 return res;
2144}
2145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002146/* --- UTF-7 Codec -------------------------------------------------------- */
2147
Antoine Pitrou244651a2009-05-04 18:56:13 +00002148/* See RFC2152 for details. We encode conservatively and decode liberally. */
2149
2150/* Three simple macros defining base-64. */
2151
/* Non-zero when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' or '/').  c is evaluated more than once. */

#define IS_BASE64(c)                    \
    (('A' <= (c) && (c) <= 'Z') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('0' <= (c) && (c) <= '9') ||      \
     '+' == (c) || '/' == (c))
2159
/* Map a base-64 character to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62, anything else: 63).  The caller must already know
   that c is a base-64 character; c is evaluated more than once. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     '+' == (c) ? 62 : 63)
2167
/* Base-64 character for the low 6 bits of n; higher bits are masked off. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
2172
/* DECODE_DIRECT: non-zero when a byte found in a UTF-7 string stands for
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base64 sequence. */

#define DECODE_DIRECT(c)                \
    ((c) < 128 && (c) != '+')
2180
/* Classification of the 128 ASCII codes for the UTF-7 encoder, indexed
 * by character code (see RFC2152):
 *
 *   0  "Set D":      alphanumerics and ' ( ) , - . / : ?
 *   1  "Set O":      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 *   2  "whitespace": ht nl cr sp
 *   3  special:      everything else, i.e. + \ ~ and the non-printing
 *                    codes 0-8, 11-12, 14-31 and 127; these must be
 *                    base64 encoded
 */

static char utf7_category[128] = {
    /* 0x00-0x07  nul soh stx etx eot enq ack bel */
    3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x08-0x0f  bs  ht  nl  vt  np  cr  so  si */
    3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x17  dle dc1 dc2 dc3 dc4 nak syn etb */
    3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x18-0x1f  can em  sub esc fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x27  sp  !   "   #   $   %   &   ' */
    2, 1, 1, 1, 1, 1, 1, 0,
    /* 0x28-0x2f  (   )   *   +   ,   -   .   / */
    0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x37  0   1   2   3   4   5   6   7 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38-0x3f  8   9   :   ;   <   =   >   ? */
    0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x47  @   A   B   C   D   E   F   G */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48-0x4f  H   I   J   K   L   M   N   O */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x57  P   Q   R   S   T   U   V   W */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58-0x5f  X   Y   Z   [   \   ]   ^   _ */
    0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x67  `   a   b   c   d   e   f   g */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68-0x6f  h   i   j   k   l   m   n   o */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x77  p   q   r   s   t   u   v   w */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78-0x7f  x   y   z   {   |   }   ~   del */
    0, 0, 0, 1, 1, 1, 3, 3,
};
2214
/* ENCODE_DIRECT: non-zero when character c should be encoded as itself.
 * The answer depends on whether Set O characters are encoded directly
 * (directO) and on whether whitespace is encoded directly (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 *
 * c must be in the range 1..127 to qualify; the range test comes first
 * so that utf7_category is never indexed out of bounds.  c is evaluated
 * more than once.  directO and directWS are parenthesized so that
 * compound expressions (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002227PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002228 Py_ssize_t size,
2229 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002230{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002231 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2232}
2233
Antoine Pitrou244651a2009-05-04 18:56:13 +00002234/* The decoder. The only state we preserve is our read position,
2235 * i.e. how many characters we have consumed. So if we end in the
2236 * middle of a shift sequence we have to back off the read position
2237 * and the output to the beginning of the sequence, otherwise we lose
2238 * all the shift state (seen bits, number of bits seen, high
2239 * surrogate). */
2240
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002241PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002242 Py_ssize_t size,
2243 const char *errors,
2244 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002245{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002247 Py_ssize_t startinpos;
2248 Py_ssize_t endinpos;
2249 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002250 const char *e;
2251 PyUnicodeObject *unicode;
2252 Py_UNICODE *p;
2253 const char *errmsg = "";
2254 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002255 Py_UNICODE *shiftOutStart;
2256 unsigned int base64bits = 0;
2257 unsigned long base64buffer = 0;
2258 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 PyObject *errorHandler = NULL;
2260 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002261
2262 unicode = _PyUnicode_New(size);
2263 if (!unicode)
2264 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002265 if (size == 0) {
2266 if (consumed)
2267 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002269 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002270
2271 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002272 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002273 e = s + size;
2274
2275 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002277 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002278 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002279
Antoine Pitrou244651a2009-05-04 18:56:13 +00002280 if (inShift) { /* in a base-64 section */
2281 if (IS_BASE64(ch)) { /* consume a base-64 character */
2282 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2283 base64bits += 6;
2284 s++;
2285 if (base64bits >= 16) {
2286 /* we have enough bits for a UTF-16 value */
2287 Py_UNICODE outCh = (Py_UNICODE)
2288 (base64buffer >> (base64bits-16));
2289 base64bits -= 16;
2290 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2291 if (surrogate) {
2292 /* expecting a second surrogate */
2293 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2294#ifdef Py_UNICODE_WIDE
2295 *p++ = (((surrogate & 0x3FF)<<10)
2296 | (outCh & 0x3FF)) + 0x10000;
2297#else
2298 *p++ = surrogate;
2299 *p++ = outCh;
2300#endif
2301 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002302 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002303 }
2304 else {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002305 *p++ = surrogate;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002306 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002307 }
2308 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002309 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002310 /* first surrogate */
2311 surrogate = outCh;
2312 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002313 else {
2314 *p++ = outCh;
2315 }
2316 }
2317 }
2318 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002319 inShift = 0;
2320 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002321 if (surrogate) {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002322 *p++ = surrogate;
2323 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002324 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002325 if (base64bits > 0) { /* left-over bits */
2326 if (base64bits >= 6) {
2327 /* We've seen at least one base-64 character */
2328 errmsg = "partial character in shift sequence";
2329 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002330 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002331 else {
2332 /* Some bits remain; they should be zero */
2333 if (base64buffer != 0) {
2334 errmsg = "non-zero padding bits in shift sequence";
2335 goto utf7Error;
2336 }
2337 }
2338 }
2339 if (ch != '-') {
2340 /* '-' is absorbed; other terminating
2341 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002342 *p++ = ch;
2343 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 }
2345 }
2346 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 s++; /* consume '+' */
2349 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002350 s++;
2351 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002352 }
2353 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002354 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002355 shiftOutStart = p;
2356 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002357 }
2358 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002359 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002360 *p++ = ch;
2361 s++;
2362 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002363 else {
2364 startinpos = s-starts;
2365 s++;
2366 errmsg = "unexpected special character";
2367 goto utf7Error;
2368 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002369 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002370utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 outpos = p-PyUnicode_AS_UNICODE(unicode);
2372 endinpos = s-starts;
2373 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002374 errors, &errorHandler,
2375 "utf7", errmsg,
2376 &starts, &e, &startinpos, &endinpos, &exc, &s,
2377 &unicode, &outpos, &p))
2378 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002379 }
2380
Antoine Pitrou244651a2009-05-04 18:56:13 +00002381 /* end of string */
2382
2383 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2384 /* if we're in an inconsistent state, that's an error */
2385 if (surrogate ||
2386 (base64bits >= 6) ||
2387 (base64bits > 0 && base64buffer != 0)) {
2388 outpos = p-PyUnicode_AS_UNICODE(unicode);
2389 endinpos = size;
2390 if (unicode_decode_call_errorhandler(
2391 errors, &errorHandler,
2392 "utf7", "unterminated shift sequence",
2393 &starts, &e, &startinpos, &endinpos, &exc, &s,
2394 &unicode, &outpos, &p))
2395 goto onError;
2396 if (s < e)
2397 goto restart;
2398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002400
2401 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002402 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002403 if (inShift) {
2404 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002405 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002406 }
2407 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002408 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002409 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002410 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002411
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002412 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002413 goto onError;
2414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417 return (PyObject *)unicode;
2418
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002420 Py_XDECREF(errorHandler);
2421 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002422 Py_DECREF(unicode);
2423 return NULL;
2424}
2425
2426
2427PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002428 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002429 int base64SetO,
2430 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002431 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002432{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002433 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002434 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002435 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002436 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002437 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002438 unsigned int base64bits = 0;
2439 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002440 char * out;
2441 char * start;
2442
2443 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002444 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002445
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002446 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002447 return PyErr_NoMemory();
2448
Antoine Pitrou244651a2009-05-04 18:56:13 +00002449 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002450 if (v == NULL)
2451 return NULL;
2452
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002453 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002454 for (;i < size; ++i) {
2455 Py_UNICODE ch = s[i];
2456
Antoine Pitrou244651a2009-05-04 18:56:13 +00002457 if (inShift) {
2458 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2459 /* shifting out */
2460 if (base64bits) { /* output remaining bits */
2461 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2462 base64buffer = 0;
2463 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002464 }
2465 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002466 /* Characters not in the BASE64 set implicitly unshift the sequence
2467 so no '-' is required, except if the character is itself a '-' */
2468 if (IS_BASE64(ch) || ch == '-') {
2469 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002471 *out++ = (char) ch;
2472 }
2473 else {
2474 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002475 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002476 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002477 else { /* not in a shift sequence */
2478 if (ch == '+') {
2479 *out++ = '+';
2480 *out++ = '-';
2481 }
2482 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2483 *out++ = (char) ch;
2484 }
2485 else {
2486 *out++ = '+';
2487 inShift = 1;
2488 goto encode_char;
2489 }
2490 }
2491 continue;
2492encode_char:
2493#ifdef Py_UNICODE_WIDE
2494 if (ch >= 0x10000) {
2495 /* code first surrogate */
2496 base64bits += 16;
2497 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2498 while (base64bits >= 6) {
2499 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2500 base64bits -= 6;
2501 }
2502 /* prepare second surrogate */
2503 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2504 }
2505#endif
2506 base64bits += 16;
2507 base64buffer = (base64buffer << 16) | ch;
2508 while (base64bits >= 6) {
2509 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2510 base64bits -= 6;
2511 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002512 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002513 if (base64bits)
2514 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2515 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002516 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002517 if (_PyBytes_Resize(&v, out - start) < 0)
2518 return NULL;
2519 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002520}
2521
Antoine Pitrou244651a2009-05-04 18:56:13 +00002522#undef IS_BASE64
2523#undef FROM_BASE64
2524#undef TO_BASE64
2525#undef DECODE_DIRECT
2526#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002527
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528/* --- UTF-8 Codec -------------------------------------------------------- */
2529
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002553 Py_ssize_t size,
2554 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555{
Walter Dörwald69652032004-09-07 20:24:22 +00002556 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2557}
2558
Antoine Pitrouab868312009-01-10 15:40:25 +00002559/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2560#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2561
2562/* Mask to quickly check whether a C 'long' contains a
2563 non-ASCII, UTF8-encoded char. */
2564#if (SIZEOF_LONG == 8)
2565# define ASCII_CHAR_MASK 0x8080808080808080L
2566#elif (SIZEOF_LONG == 4)
2567# define ASCII_CHAR_MASK 0x80808080L
2568#else
2569# error C 'long' size should be either 4 or 8!
2570#endif
2571
Walter Dörwald69652032004-09-07 20:24:22 +00002572PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002573 Py_ssize_t size,
2574 const char *errors,
2575 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002579 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002580 Py_ssize_t startinpos;
2581 Py_ssize_t endinpos;
2582 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002583 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 PyUnicodeObject *unicode;
2585 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002586 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587 PyObject *errorHandler = NULL;
2588 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589
2590 /* Note: size will always be longer than the resulting Unicode
2591 character count */
2592 unicode = _PyUnicode_New(size);
2593 if (!unicode)
2594 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002595 if (size == 0) {
2596 if (consumed)
2597 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600
2601 /* Unpack UTF-8 encoded data */
2602 p = unicode->str;
2603 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002604 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
2606 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002607 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608
2609 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002610 /* Fast path for runs of ASCII characters. Given that common UTF-8
2611 input will consist of an overwhelming majority of ASCII
2612 characters, we try to optimize for this case by checking
2613 as many characters as a C 'long' can contain.
2614 First, check if we can do an aligned read, as most CPUs have
2615 a penalty for unaligned reads.
2616 */
2617 if (!((size_t) s & LONG_PTR_MASK)) {
2618 /* Help register allocation */
2619 register const char *_s = s;
2620 register Py_UNICODE *_p = p;
2621 while (_s < aligned_end) {
2622 /* Read a whole long at a time (either 4 or 8 bytes),
2623 and do a fast unrolled copy if it only contains ASCII
2624 characters. */
2625 unsigned long data = *(unsigned long *) _s;
2626 if (data & ASCII_CHAR_MASK)
2627 break;
2628 _p[0] = (unsigned char) _s[0];
2629 _p[1] = (unsigned char) _s[1];
2630 _p[2] = (unsigned char) _s[2];
2631 _p[3] = (unsigned char) _s[3];
2632#if (SIZEOF_LONG == 8)
2633 _p[4] = (unsigned char) _s[4];
2634 _p[5] = (unsigned char) _s[5];
2635 _p[6] = (unsigned char) _s[6];
2636 _p[7] = (unsigned char) _s[7];
2637#endif
2638 _s += SIZEOF_LONG;
2639 _p += SIZEOF_LONG;
2640 }
2641 s = _s;
2642 p = _p;
2643 if (s == e)
2644 break;
2645 ch = (unsigned char)*s;
2646 }
2647 }
2648
2649 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002650 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 s++;
2652 continue;
2653 }
2654
2655 n = utf8_code_length[ch];
2656
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002657 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002658 if (consumed)
2659 break;
2660 else {
2661 errmsg = "unexpected end of data";
2662 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002663 endinpos = startinpos+1;
2664 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2665 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002666 goto utf8Error;
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669
2670 switch (n) {
2671
2672 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002673 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002674 startinpos = s-starts;
2675 endinpos = startinpos+1;
2676 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677
2678 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002679 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 startinpos = s-starts;
2681 endinpos = startinpos+1;
2682 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683
2684 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002685 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002686 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002688 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 goto utf8Error;
2690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002692 assert ((ch > 0x007F) && (ch <= 0x07FF));
2693 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 break;
2695
2696 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002697 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2698 will result in surrogates in range d800-dfff. Surrogates are
2699 not valid UTF-8 so they are rejected.
2700 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2701 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002702 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002703 (s[2] & 0xc0) != 0x80 ||
2704 ((unsigned char)s[0] == 0xE0 &&
2705 (unsigned char)s[1] < 0xA0) ||
2706 ((unsigned char)s[0] == 0xED &&
2707 (unsigned char)s[1] > 0x9F)) {
2708 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002710 endinpos = startinpos + 1;
2711
2712 /* if s[1] first two bits are 1 and 0, then the invalid
2713 continuation byte is s[2], so increment endinpos by 1,
2714 if not, s[1] is invalid and endinpos doesn't need to
2715 be incremented. */
2716 if ((s[1] & 0xC0) == 0x80)
2717 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 goto utf8Error;
2719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002721 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2722 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002723 break;
2724
2725 case 4:
2726 if ((s[1] & 0xc0) != 0x80 ||
2727 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002728 (s[3] & 0xc0) != 0x80 ||
2729 ((unsigned char)s[0] == 0xF0 &&
2730 (unsigned char)s[1] < 0x90) ||
2731 ((unsigned char)s[0] == 0xF4 &&
2732 (unsigned char)s[1] > 0x8F)) {
2733 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002734 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002735 endinpos = startinpos + 1;
2736 if ((s[1] & 0xC0) == 0x80) {
2737 endinpos++;
2738 if ((s[2] & 0xC0) == 0x80)
2739 endinpos++;
2740 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002741 goto utf8Error;
2742 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002743 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002744 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2745 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2746
Fredrik Lundh8f455852001-06-27 18:59:43 +00002747#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002749#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002750 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002751
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002752 /* translate from 10000..10FFFF to 0..FFFF */
2753 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002754
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002755 /* high surrogate = top 10 bits added to D800 */
2756 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002757
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002758 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002759 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002760#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
2763 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002765
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 utf8Error:
2767 outpos = p-PyUnicode_AS_UNICODE(unicode);
2768 if (unicode_decode_call_errorhandler(
2769 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01002770 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 &starts, &e, &startinpos, &endinpos, &exc, &s,
2772 &unicode, &outpos, &p))
2773 goto onError;
2774 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 }
Walter Dörwald69652032004-09-07 20:24:22 +00002776 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002777 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778
2779 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002780 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 goto onError;
2782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002783 Py_XDECREF(errorHandler);
2784 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 return (PyObject *)unicode;
2786
Benjamin Peterson29060642009-01-31 22:14:21 +00002787 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788 Py_XDECREF(errorHandler);
2789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 Py_DECREF(unicode);
2791 return NULL;
2792}
2793
Antoine Pitrouab868312009-01-10 15:40:25 +00002794#undef ASCII_CHAR_MASK
2795
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002796#ifdef __APPLE__
2797
2798/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner27b1ca22012-12-03 12:47:59 +01002799 used to decode the command line arguments on Mac OS X.
2800
2801 Return a pointer to a newly allocated wide character string (use
2802 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002803
2804wchar_t*
2805_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2806{
2807 int n;
2808 const char *e;
2809 wchar_t *unicode, *p;
2810
2811 /* Note: size will always be longer than the resulting Unicode
2812 character count */
Victor Stinner27b1ca22012-12-03 12:47:59 +01002813 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002814 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002815 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2816 if (!unicode)
2817 return NULL;
2818
2819 /* Unpack UTF-8 encoded data */
2820 p = unicode;
2821 e = s + size;
2822 while (s < e) {
2823 Py_UCS4 ch = (unsigned char)*s;
2824
2825 if (ch < 0x80) {
2826 *p++ = (wchar_t)ch;
2827 s++;
2828 continue;
2829 }
2830
2831 n = utf8_code_length[ch];
2832 if (s + n > e) {
2833 goto surrogateescape;
2834 }
2835
2836 switch (n) {
2837 case 0:
2838 case 1:
2839 goto surrogateescape;
2840
2841 case 2:
2842 if ((s[1] & 0xc0) != 0x80)
2843 goto surrogateescape;
2844 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2845 assert ((ch > 0x007F) && (ch <= 0x07FF));
2846 *p++ = (wchar_t)ch;
2847 break;
2848
2849 case 3:
2850 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2851 will result in surrogates in range d800-dfff. Surrogates are
2852 not valid UTF-8 so they are rejected.
2853 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2854 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2855 if ((s[1] & 0xc0) != 0x80 ||
2856 (s[2] & 0xc0) != 0x80 ||
2857 ((unsigned char)s[0] == 0xE0 &&
2858 (unsigned char)s[1] < 0xA0) ||
2859 ((unsigned char)s[0] == 0xED &&
2860 (unsigned char)s[1] > 0x9F)) {
2861
2862 goto surrogateescape;
2863 }
2864 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2865 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2866 *p++ = (Py_UNICODE)ch;
2867 break;
2868
2869 case 4:
2870 if ((s[1] & 0xc0) != 0x80 ||
2871 (s[2] & 0xc0) != 0x80 ||
2872 (s[3] & 0xc0) != 0x80 ||
2873 ((unsigned char)s[0] == 0xF0 &&
2874 (unsigned char)s[1] < 0x90) ||
2875 ((unsigned char)s[0] == 0xF4 &&
2876 (unsigned char)s[1] > 0x8F)) {
2877 goto surrogateescape;
2878 }
2879 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2880 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2881 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2882
2883#if SIZEOF_WCHAR_T == 4
2884 *p++ = (wchar_t)ch;
2885#else
2886 /* compute and append the two surrogates: */
2887
2888 /* translate from 10000..10FFFF to 0..FFFF */
2889 ch -= 0x10000;
2890
2891 /* high surrogate = top 10 bits added to D800 */
2892 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2893
2894 /* low surrogate = bottom 10 bits added to DC00 */
2895 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2896#endif
2897 break;
2898 }
2899 s += n;
2900 continue;
2901
2902 surrogateescape:
2903 *p++ = 0xDC00 + ch;
2904 s++;
2905 }
2906 *p = L'\0';
2907 return unicode;
2908}
2909
2910#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002911
Tim Peters602f7402002-04-27 18:03:26 +00002912/* Allocation strategy: if the string is short, convert into a stack buffer
2913 and allocate exactly as much space needed at the end. Else allocate the
2914 maximum possible needed (4 result bytes per Unicode character), and return
2915 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002916*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002917PyObject *
2918PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 Py_ssize_t size,
2920 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921{
Tim Peters602f7402002-04-27 18:03:26 +00002922#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002923
Guido van Rossum98297ee2007-11-06 21:34:58 +00002924 Py_ssize_t i; /* index into s of next input byte */
2925 PyObject *result; /* result string object */
2926 char *p; /* next free byte in output buffer */
2927 Py_ssize_t nallocated; /* number of result bytes allocated */
2928 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002929 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002930 PyObject *errorHandler = NULL;
2931 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002932
Tim Peters602f7402002-04-27 18:03:26 +00002933 assert(s != NULL);
2934 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
Tim Peters602f7402002-04-27 18:03:26 +00002936 if (size <= MAX_SHORT_UNICHARS) {
2937 /* Write into the stack buffer; nallocated can't overflow.
2938 * At the end, we'll allocate exactly as much heap space as it
2939 * turns out we need.
2940 */
2941 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002942 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002943 p = stackbuf;
2944 }
2945 else {
2946 /* Overallocate on the heap, and give the excess back at the end. */
2947 nallocated = size * 4;
2948 if (nallocated / 4 != size) /* overflow! */
2949 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002950 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002951 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002952 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002953 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002954 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002955
Tim Peters602f7402002-04-27 18:03:26 +00002956 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002957 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002958
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002959 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002960 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002962
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002964 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002965 *p++ = (char)(0xc0 | (ch >> 6));
2966 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002967 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002968#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002969 /* Special case: check for high and low surrogate */
2970 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2971 Py_UCS4 ch2 = s[i];
2972 /* Combine the two surrogates to form a UCS4 value */
2973 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2974 i++;
2975
2976 /* Encode UCS4 Unicode ordinals */
2977 *p++ = (char)(0xf0 | (ch >> 18));
2978 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002979 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2980 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002981 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002982#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002983 Py_ssize_t newpos;
2984 PyObject *rep;
2985 Py_ssize_t repsize, k;
2986 rep = unicode_encode_call_errorhandler
2987 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2988 s, size, &exc, i-1, i, &newpos);
2989 if (!rep)
2990 goto error;
2991
2992 if (PyBytes_Check(rep))
2993 repsize = PyBytes_GET_SIZE(rep);
2994 else
2995 repsize = PyUnicode_GET_SIZE(rep);
2996
2997 if (repsize > 4) {
2998 Py_ssize_t offset;
2999
3000 if (result == NULL)
3001 offset = p - stackbuf;
3002 else
3003 offset = p - PyBytes_AS_STRING(result);
3004
3005 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3006 /* integer overflow */
3007 PyErr_NoMemory();
3008 goto error;
3009 }
3010 nallocated += repsize - 4;
3011 if (result != NULL) {
3012 if (_PyBytes_Resize(&result, nallocated) < 0)
3013 goto error;
3014 } else {
3015 result = PyBytes_FromStringAndSize(NULL, nallocated);
3016 if (result == NULL)
3017 goto error;
3018 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3019 }
3020 p = PyBytes_AS_STRING(result) + offset;
3021 }
3022
3023 if (PyBytes_Check(rep)) {
3024 char *prep = PyBytes_AS_STRING(rep);
3025 for(k = repsize; k > 0; k--)
3026 *p++ = *prep++;
3027 } else /* rep is unicode */ {
3028 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3029 Py_UNICODE c;
3030
3031 for(k=0; k<repsize; k++) {
3032 c = prep[k];
3033 if (0x80 <= c) {
3034 raise_encode_exception(&exc, "utf-8", s, size,
3035 i-1, i, "surrogates not allowed");
3036 goto error;
3037 }
3038 *p++ = (char)prep[k];
3039 }
3040 }
3041 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003042#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003043 }
Victor Stinner445a6232010-04-22 20:01:57 +00003044#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003045 } else if (ch < 0x10000) {
3046 *p++ = (char)(0xe0 | (ch >> 12));
3047 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3048 *p++ = (char)(0x80 | (ch & 0x3f));
3049 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003050 /* Encode UCS4 Unicode ordinals */
3051 *p++ = (char)(0xf0 | (ch >> 18));
3052 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3053 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3054 *p++ = (char)(0x80 | (ch & 0x3f));
3055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003057
Guido van Rossum98297ee2007-11-06 21:34:58 +00003058 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003059 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003060 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003061 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003062 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003063 }
3064 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003065 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003066 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003067 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003068 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003069 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003070 Py_XDECREF(errorHandler);
3071 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003072 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003073 error:
3074 Py_XDECREF(errorHandler);
3075 Py_XDECREF(exc);
3076 Py_XDECREF(result);
3077 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003078
Tim Peters602f7402002-04-27 18:03:26 +00003079#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080}
3081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3083{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 if (!PyUnicode_Check(unicode)) {
3085 PyErr_BadArgument();
3086 return NULL;
3087 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003088 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 PyUnicode_GET_SIZE(unicode),
3090 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091}
3092
/* --- UTF-32 Codec ------------------------------------------------------- */
3094
3095PyObject *
3096PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 Py_ssize_t size,
3098 const char *errors,
3099 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003100{
3101 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3102}
3103
3104PyObject *
3105PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003106 Py_ssize_t size,
3107 const char *errors,
3108 int *byteorder,
3109 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003110{
3111 const char *starts = s;
3112 Py_ssize_t startinpos;
3113 Py_ssize_t endinpos;
3114 Py_ssize_t outpos;
3115 PyUnicodeObject *unicode;
3116 Py_UNICODE *p;
3117#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003118 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003119 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003120#else
3121 const int pairs = 0;
3122#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003123 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003124 int bo = 0; /* assume native ordering by default */
3125 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003126 /* Offsets from q for retrieving bytes in the right order. */
3127#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3128 int iorder[] = {0, 1, 2, 3};
3129#else
3130 int iorder[] = {3, 2, 1, 0};
3131#endif
3132 PyObject *errorHandler = NULL;
3133 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003134
Walter Dörwald41980ca2007-08-16 21:55:45 +00003135 q = (unsigned char *)s;
3136 e = q + size;
3137
3138 if (byteorder)
3139 bo = *byteorder;
3140
3141 /* Check for BOM marks (U+FEFF) in the input and adjust current
3142 byte order setting accordingly. In native mode, the leading BOM
3143 mark is skipped, in all other modes, it is copied to the output
3144 stream as-is (giving a ZWNBSP character). */
3145 if (bo == 0) {
3146 if (size >= 4) {
3147 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003148 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003149#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 if (bom == 0x0000FEFF) {
3151 q += 4;
3152 bo = -1;
3153 }
3154 else if (bom == 0xFFFE0000) {
3155 q += 4;
3156 bo = 1;
3157 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003158#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003159 if (bom == 0x0000FEFF) {
3160 q += 4;
3161 bo = 1;
3162 }
3163 else if (bom == 0xFFFE0000) {
3164 q += 4;
3165 bo = -1;
3166 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003167#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003169 }
3170
3171 if (bo == -1) {
3172 /* force LE */
3173 iorder[0] = 0;
3174 iorder[1] = 1;
3175 iorder[2] = 2;
3176 iorder[3] = 3;
3177 }
3178 else if (bo == 1) {
3179 /* force BE */
3180 iorder[0] = 3;
3181 iorder[1] = 2;
3182 iorder[2] = 1;
3183 iorder[3] = 0;
3184 }
3185
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003186 /* On narrow builds we split characters outside the BMP into two
3187 codepoints => count how much extra space we need. */
3188#ifndef Py_UNICODE_WIDE
Serhiy Storchakadec798e2013-01-08 22:45:42 +02003189 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003190 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3191 pairs++;
3192#endif
3193
3194 /* This might be one to much, because of a BOM */
3195 unicode = _PyUnicode_New((size+3)/4+pairs);
3196 if (!unicode)
3197 return NULL;
3198 if (size == 0)
3199 return (PyObject *)unicode;
3200
3201 /* Unpack UTF-32 encoded data */
3202 p = unicode->str;
3203
Walter Dörwald41980ca2007-08-16 21:55:45 +00003204 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 Py_UCS4 ch;
3206 /* remaining bytes at the end? (size should be divisible by 4) */
3207 if (e-q<4) {
3208 if (consumed)
3209 break;
3210 errmsg = "truncated data";
3211 startinpos = ((const char *)q)-starts;
3212 endinpos = ((const char *)e)-starts;
3213 goto utf32Error;
3214 /* The remaining input chars are ignored if the callback
3215 chooses to skip the input */
3216 }
3217 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3218 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003219
Benjamin Peterson29060642009-01-31 22:14:21 +00003220 if (ch >= 0x110000)
3221 {
3222 errmsg = "codepoint not in range(0x110000)";
3223 startinpos = ((const char *)q)-starts;
3224 endinpos = startinpos+4;
3225 goto utf32Error;
3226 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003227#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 if (ch >= 0x10000)
3229 {
3230 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3231 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3232 }
3233 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003234#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 *p++ = ch;
3236 q += 4;
3237 continue;
3238 utf32Error:
3239 outpos = p-PyUnicode_AS_UNICODE(unicode);
3240 if (unicode_decode_call_errorhandler(
3241 errors, &errorHandler,
3242 "utf32", errmsg,
3243 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3244 &unicode, &outpos, &p))
3245 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003246 }
3247
3248 if (byteorder)
3249 *byteorder = bo;
3250
3251 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003252 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003253
3254 /* Adjust length */
3255 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3256 goto onError;
3257
3258 Py_XDECREF(errorHandler);
3259 Py_XDECREF(exc);
3260 return (PyObject *)unicode;
3261
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003263 Py_DECREF(unicode);
3264 Py_XDECREF(errorHandler);
3265 Py_XDECREF(exc);
3266 return NULL;
3267}
3268
3269PyObject *
3270PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003271 Py_ssize_t size,
3272 const char *errors,
3273 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003274{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003275 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003276 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003277 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003278#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003279 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003280#else
3281 const int pairs = 0;
3282#endif
3283 /* Offsets from p for storing byte pairs in the right order. */
3284#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3285 int iorder[] = {0, 1, 2, 3};
3286#else
3287 int iorder[] = {3, 2, 1, 0};
3288#endif
3289
Benjamin Peterson29060642009-01-31 22:14:21 +00003290#define STORECHAR(CH) \
3291 do { \
3292 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3293 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3294 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3295 p[iorder[0]] = (CH) & 0xff; \
3296 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297 } while(0)
3298
3299 /* In narrow builds we can output surrogate pairs as one codepoint,
3300 so we need less space. */
3301#ifndef Py_UNICODE_WIDE
3302 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3304 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3305 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003306#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003307 nsize = (size - pairs + (byteorder == 0));
3308 bytesize = nsize * 4;
3309 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003311 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003312 if (v == NULL)
3313 return NULL;
3314
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003315 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003316 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003317 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003318 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003319 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320
3321 if (byteorder == -1) {
3322 /* force LE */
3323 iorder[0] = 0;
3324 iorder[1] = 1;
3325 iorder[2] = 2;
3326 iorder[3] = 3;
3327 }
3328 else if (byteorder == 1) {
3329 /* force BE */
3330 iorder[0] = 3;
3331 iorder[1] = 2;
3332 iorder[2] = 1;
3333 iorder[3] = 0;
3334 }
3335
3336 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003338#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003339 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3340 Py_UCS4 ch2 = *s;
3341 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3342 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3343 s++;
3344 size--;
3345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003346 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003347#endif
3348 STORECHAR(ch);
3349 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003350
3351 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003352 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003353#undef STORECHAR
3354}
3355
3356PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3357{
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
3362 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 PyUnicode_GET_SIZE(unicode),
3364 NULL,
3365 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003366}
3367
/* --- UTF-16 Codec ------------------------------------------------------- */
3369
Tim Peters772747b2001-08-09 22:21:55 +00003370PyObject *
3371PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 Py_ssize_t size,
3373 const char *errors,
3374 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375{
Walter Dörwald69652032004-09-07 20:24:22 +00003376 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3377}
3378
Antoine Pitrouab868312009-01-10 15:40:25 +00003379/* Two masks for fast checking of whether a C 'long' may contain
3380 UTF16-encoded surrogate characters. This is an efficient heuristic,
3381 assuming that non-surrogate characters with a code point >= 0x8000 are
3382 rare in most input.
3383 FAST_CHAR_MASK is used when the input is in native byte ordering,
3384 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003385*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003386#if (SIZEOF_LONG == 8)
3387# define FAST_CHAR_MASK 0x8000800080008000L
3388# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3389#elif (SIZEOF_LONG == 4)
3390# define FAST_CHAR_MASK 0x80008000L
3391# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3392#else
3393# error C 'long' size should be either 4 or 8!
3394#endif
3395
Walter Dörwald69652032004-09-07 20:24:22 +00003396PyObject *
3397PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003398 Py_ssize_t size,
3399 const char *errors,
3400 int *byteorder,
3401 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003402{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003404 Py_ssize_t startinpos;
3405 Py_ssize_t endinpos;
3406 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 PyUnicodeObject *unicode;
3408 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003409 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003410 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003411 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003412 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003413 /* Offsets from q for retrieving byte pairs in the right order. */
3414#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3415 int ihi = 1, ilo = 0;
3416#else
3417 int ihi = 0, ilo = 1;
3418#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 PyObject *errorHandler = NULL;
3420 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421
3422 /* Note: size will always be longer than the resulting Unicode
3423 character count */
3424 unicode = _PyUnicode_New(size);
3425 if (!unicode)
3426 return NULL;
3427 if (size == 0)
3428 return (PyObject *)unicode;
3429
3430 /* Unpack UTF-16 encoded data */
3431 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003432 q = (unsigned char *)s;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003433 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434
3435 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003436 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003438 /* Check for BOM marks (U+FEFF) in the input and adjust current
3439 byte order setting accordingly. In native mode, the leading BOM
3440 mark is skipped, in all other modes, it is copied to the output
3441 stream as-is (giving a ZWNBSP character). */
3442 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003443 if (size >= 2) {
3444 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 if (bom == 0xFEFF) {
3447 q += 2;
3448 bo = -1;
3449 }
3450 else if (bom == 0xFFFE) {
3451 q += 2;
3452 bo = 1;
3453 }
Tim Petersced69f82003-09-16 20:30:58 +00003454#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 if (bom == 0xFEFF) {
3456 q += 2;
3457 bo = 1;
3458 }
3459 else if (bom == 0xFFFE) {
3460 q += 2;
3461 bo = -1;
3462 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003463#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466
Tim Peters772747b2001-08-09 22:21:55 +00003467 if (bo == -1) {
3468 /* force LE */
3469 ihi = 1;
3470 ilo = 0;
3471 }
3472 else if (bo == 1) {
3473 /* force BE */
3474 ihi = 0;
3475 ilo = 1;
3476 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003477#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3478 native_ordering = ilo < ihi;
3479#else
3480 native_ordering = ilo > ihi;
3481#endif
Tim Peters772747b2001-08-09 22:21:55 +00003482
Antoine Pitrouab868312009-01-10 15:40:25 +00003483 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003484 while (1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003485 Py_UNICODE ch;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003486 if (e - q < 2) {
3487 /* remaining byte at the end? (size should be even) */
3488 if (q == e || consumed)
3489 break;
3490 errmsg = "truncated data";
3491 startinpos = ((const char *)q) - starts;
3492 endinpos = ((const char *)e) - starts;
3493 outpos = p - PyUnicode_AS_UNICODE(unicode);
3494 goto utf16Error;
3495 /* The remaining input chars are ignored if the callback
3496 chooses to skip the input */
3497 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003498 /* First check for possible aligned read of a C 'long'. Unaligned
3499 reads are more expensive, better to defer to another iteration. */
3500 if (!((size_t) q & LONG_PTR_MASK)) {
3501 /* Fast path for runs of non-surrogate chars. */
3502 register const unsigned char *_q = q;
3503 Py_UNICODE *_p = p;
3504 if (native_ordering) {
3505 /* Native ordering is simple: as long as the input cannot
3506 possibly contain a surrogate char, do an unrolled copy
3507 of several 16-bit code points to the target object.
3508 The non-surrogate check is done on several input bytes
3509 at a time (as many as a C 'long' can contain). */
3510 while (_q < aligned_end) {
3511 unsigned long data = * (unsigned long *) _q;
3512 if (data & FAST_CHAR_MASK)
3513 break;
3514 _p[0] = ((unsigned short *) _q)[0];
3515 _p[1] = ((unsigned short *) _q)[1];
3516#if (SIZEOF_LONG == 8)
3517 _p[2] = ((unsigned short *) _q)[2];
3518 _p[3] = ((unsigned short *) _q)[3];
3519#endif
3520 _q += SIZEOF_LONG;
3521 _p += SIZEOF_LONG / 2;
3522 }
3523 }
3524 else {
3525 /* Byteswapped ordering is similar, but we must decompose
3526 the copy bytewise, and take care of zero'ing out the
3527 upper bytes if the target object is in 32-bit units
3528 (that is, in UCS-4 builds). */
3529 while (_q < aligned_end) {
3530 unsigned long data = * (unsigned long *) _q;
3531 if (data & SWAPPED_FAST_CHAR_MASK)
3532 break;
3533 /* Zero upper bytes in UCS-4 builds */
3534#if (Py_UNICODE_SIZE > 2)
3535 _p[0] = 0;
3536 _p[1] = 0;
3537#if (SIZEOF_LONG == 8)
3538 _p[2] = 0;
3539 _p[3] = 0;
3540#endif
3541#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003542 /* Issue #4916; UCS-4 builds on big endian machines must
3543 fill the two last bytes of each 4-byte unit. */
3544#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3545# define OFF 2
3546#else
3547# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003548#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003549 ((unsigned char *) _p)[OFF + 1] = _q[0];
3550 ((unsigned char *) _p)[OFF + 0] = _q[1];
3551 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3552 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3553#if (SIZEOF_LONG == 8)
3554 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3555 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3556 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3557 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3558#endif
3559#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003560 _q += SIZEOF_LONG;
3561 _p += SIZEOF_LONG / 2;
3562 }
3563 }
3564 p = _p;
3565 q = _q;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003566 if (e - q < 2)
3567 continue;
Antoine Pitrouab868312009-01-10 15:40:25 +00003568 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570
Benjamin Peterson14339b62009-01-31 16:36:08 +00003571 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003572
3573 if (ch < 0xD800 || ch > 0xDFFF) {
3574 *p++ = ch;
3575 continue;
3576 }
3577
3578 /* UTF-16 code pair: */
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003579 if (e - q < 2) {
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02003580 q -= 2;
3581 if (consumed)
3582 break;
Benjamin Peterson29060642009-01-31 22:14:21 +00003583 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02003584 startinpos = ((const char *)q) - starts;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003585 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00003586 goto utf16Error;
3587 }
3588 if (0xD800 <= ch && ch <= 0xDBFF) {
3589 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3590 q += 2;
3591 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003592#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003593 *p++ = ch;
3594 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003595#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003597#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 continue;
3599 }
3600 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003601 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 startinpos = (((const char *)q)-4)-starts;
3603 endinpos = startinpos+2;
3604 goto utf16Error;
3605 }
3606
Benjamin Peterson14339b62009-01-31 16:36:08 +00003607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003608 errmsg = "illegal encoding";
3609 startinpos = (((const char *)q)-2)-starts;
3610 endinpos = startinpos+2;
3611 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003612
Benjamin Peterson29060642009-01-31 22:14:21 +00003613 utf16Error:
3614 outpos = p - PyUnicode_AS_UNICODE(unicode);
3615 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003616 errors,
3617 &errorHandler,
3618 "utf16", errmsg,
3619 &starts,
3620 (const char **)&e,
3621 &startinpos,
3622 &endinpos,
3623 &exc,
3624 (const char **)&q,
3625 &unicode,
3626 &outpos,
3627 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 goto onError;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003629 /* Update data because unicode_decode_call_errorhandler might have
3630 changed the input object. */
3631 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Antoine Pitrouab868312009-01-10 15:40:25 +00003632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633
3634 if (byteorder)
3635 *byteorder = bo;
3636
Walter Dörwald69652032004-09-07 20:24:22 +00003637 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003639
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003641 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 goto onError;
3643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 Py_XDECREF(errorHandler);
3645 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 return (PyObject *)unicode;
3647
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 Py_XDECREF(errorHandler);
3651 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 return NULL;
3653}
3654
Antoine Pitrouab868312009-01-10 15:40:25 +00003655#undef FAST_CHAR_MASK
3656#undef SWAPPED_FAST_CHAR_MASK
3657
Tim Peters772747b2001-08-09 22:21:55 +00003658PyObject *
3659PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 Py_ssize_t size,
3661 const char *errors,
3662 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003664 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003665 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003666 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003667#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003668 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003669#else
3670 const int pairs = 0;
3671#endif
Tim Peters772747b2001-08-09 22:21:55 +00003672 /* Offsets from p for storing byte pairs in the right order. */
3673#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3674 int ihi = 1, ilo = 0;
3675#else
3676 int ihi = 0, ilo = 1;
3677#endif
3678
Benjamin Peterson29060642009-01-31 22:14:21 +00003679#define STORECHAR(CH) \
3680 do { \
3681 p[ihi] = ((CH) >> 8) & 0xff; \
3682 p[ilo] = (CH) & 0xff; \
3683 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003684 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003686#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003687 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 if (s[i] >= 0x10000)
3689 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003690#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003691 /* 2 * (size + pairs + (byteorder == 0)) */
3692 if (size > PY_SSIZE_T_MAX ||
3693 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003695 nsize = size + pairs + (byteorder == 0);
3696 bytesize = nsize * 2;
3697 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003699 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 if (v == NULL)
3701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003703 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003706 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003707 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003708
3709 if (byteorder == -1) {
3710 /* force LE */
3711 ihi = 1;
3712 ilo = 0;
3713 }
3714 else if (byteorder == 1) {
3715 /* force BE */
3716 ihi = 0;
3717 ilo = 1;
3718 }
3719
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003720 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 Py_UNICODE ch = *s++;
3722 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003723#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003724 if (ch >= 0x10000) {
3725 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3726 ch = 0xD800 | ((ch-0x10000) >> 10);
3727 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003728#endif
Tim Peters772747b2001-08-09 22:21:55 +00003729 STORECHAR(ch);
3730 if (ch2)
3731 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003732 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003733
3734 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003735 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003736#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737}
3738
3739PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3740{
3741 if (!PyUnicode_Check(unicode)) {
3742 PyErr_BadArgument();
3743 return NULL;
3744 }
3745 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 PyUnicode_GET_SIZE(unicode),
3747 NULL,
3748 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749}
3750
3751/* --- Unicode Escape Codec ----------------------------------------------- */
3752
Fredrik Lundh06d12682001-01-24 07:59:11 +00003753static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003754
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 Py_ssize_t size,
3757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003760 Py_ssize_t startinpos;
3761 Py_ssize_t endinpos;
3762 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003766 char* message;
3767 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 PyObject *errorHandler = NULL;
3769 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003770
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 /* Escaped strings will always be longer than the resulting
3772 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 length after conversion to the true value.
3774 (but if the error callback returns a long replacement string
3775 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 v = _PyUnicode_New(size);
3777 if (v == NULL)
3778 goto onError;
3779 if (size == 0)
3780 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003784
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 while (s < end) {
3786 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003787 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789
3790 /* Non-escape characters are interpreted as Unicode ordinals */
3791 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003792 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 continue;
3794 }
3795
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 /* \ - Escapes */
3798 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003799 c = *s++;
3800 if (s > end)
3801 c = '\0'; /* Invalid after \ */
3802 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 case '\n': break;
3806 case '\\': *p++ = '\\'; break;
3807 case '\'': *p++ = '\''; break;
3808 case '\"': *p++ = '\"'; break;
3809 case 'b': *p++ = '\b'; break;
3810 case 'f': *p++ = '\014'; break; /* FF */
3811 case 't': *p++ = '\t'; break;
3812 case 'n': *p++ = '\n'; break;
3813 case 'r': *p++ = '\r'; break;
3814 case 'v': *p++ = '\013'; break; /* VT */
3815 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3816
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 case '0': case '1': case '2': case '3':
3819 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003820 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003821 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003822 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003823 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003824 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003826 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 break;
3828
Benjamin Peterson29060642009-01-31 22:14:21 +00003829 /* hex escapes */
3830 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003832 digits = 2;
3833 message = "truncated \\xXX escape";
3834 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835
Benjamin Peterson29060642009-01-31 22:14:21 +00003836 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003838 digits = 4;
3839 message = "truncated \\uXXXX escape";
3840 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003843 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003844 digits = 8;
3845 message = "truncated \\UXXXXXXXX escape";
3846 hexescape:
3847 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02003848 if (end - s < digits) {
3849 /* count only hex digits */
3850 for (; s < end; ++s) {
3851 c = (unsigned char)*s;
3852 if (!Py_ISXDIGIT(c))
3853 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003854 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02003855 goto error;
3856 }
3857 for (; digits--; ++s) {
3858 c = (unsigned char)*s;
3859 if (!Py_ISXDIGIT(c))
3860 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003861 chr = (chr<<4) & ~0xF;
3862 if (c >= '0' && c <= '9')
3863 chr += c - '0';
3864 else if (c >= 'a' && c <= 'f')
3865 chr += 10 + c - 'a';
3866 else
3867 chr += 10 + c - 'A';
3868 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003869 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 /* _decoding_error will have already written into the
3871 target buffer. */
3872 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003873 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003874 /* when we get here, chr is a 32-bit unicode character */
3875 if (chr <= 0xffff)
3876 /* UCS-2 character */
3877 *p++ = (Py_UNICODE) chr;
3878 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003879 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003880 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003881#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003882 *p++ = chr;
3883#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003884 chr -= 0x10000L;
3885 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003886 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003887#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003888 } else {
Serhiy Storchakad6793772013-01-29 10:20:44 +02003889 message = "illegal Unicode character";
3890 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003891 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003892 break;
3893
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003895 case 'N':
3896 message = "malformed \\N character escape";
3897 if (ucnhash_CAPI == NULL) {
3898 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003899 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003900 if (ucnhash_CAPI == NULL)
3901 goto ucnhashError;
3902 }
3903 if (*s == '{') {
3904 const char *start = s+1;
3905 /* look for the closing brace */
3906 while (*s != '}' && s < end)
3907 s++;
3908 if (s > start && s < end && *s == '}') {
3909 /* found a name. look it up in the unicode database */
3910 message = "unknown Unicode character name";
3911 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02003912 if (s - start - 1 <= INT_MAX &&
3913 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003914 goto store;
3915 }
3916 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02003917 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003918
3919 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003920 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 message = "\\ at end of string";
3922 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02003923 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00003924 }
3925 else {
3926 *p++ = '\\';
3927 *p++ = (unsigned char)s[-1];
3928 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003929 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02003931 continue;
3932
3933 error:
3934 endinpos = s-starts;
3935 outpos = p-PyUnicode_AS_UNICODE(v);
3936 if (unicode_decode_call_errorhandler(
3937 errors, &errorHandler,
3938 "unicodeescape", message,
3939 &starts, &end, &startinpos, &endinpos, &exc, &s,
3940 &v, &outpos, &p))
3941 goto onError;
3942 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003944 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003946 Py_XDECREF(errorHandler);
3947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003949
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003951 PyErr_SetString(
3952 PyExc_UnicodeError,
3953 "\\N escapes not supported (can't load unicodedata module)"
3954 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003955 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 Py_XDECREF(errorHandler);
3957 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003958 return NULL;
3959
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 Py_XDECREF(errorHandler);
3963 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 return NULL;
3965}
3966
3967/* Return a Unicode-Escape string version of the Unicode object.
3968
3969 If quotes is true, the string is enclosed in u"" or u'' quotes as
3970 appropriate.
3971
3972*/
3973
Thomas Wouters477c8d52006-05-27 19:21:47 +00003974Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 Py_ssize_t size,
3976 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003977{
3978 /* like wcschr, but doesn't stop at NULL characters */
3979
3980 while (size-- > 0) {
3981 if (*s == ch)
3982 return s;
3983 s++;
3984 }
3985
3986 return NULL;
3987}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003988
Walter Dörwald79e913e2007-05-12 11:08:06 +00003989static const char *hexdigits = "0123456789abcdef";
3990
3991PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003994 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003997#ifdef Py_UNICODE_WIDE
3998 const Py_ssize_t expandsize = 10;
3999#else
4000 const Py_ssize_t expandsize = 6;
4001#endif
4002
Thomas Wouters89f507f2006-12-13 04:49:30 +00004003 /* XXX(nnorwitz): rather than over-allocating, it would be
4004 better to choose a different scheme. Perhaps scan the
4005 first N-chars of the string and allocate based on that size.
4006 */
4007 /* Initial allocation is based on the longest-possible unichr
4008 escape.
4009
4010 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4011 unichr, so in this case it's the longest unichr escape. In
4012 narrow (UTF-16) builds this is five chars per source unichr
4013 since there are two unichrs in the surrogate pair, so in narrow
4014 (UTF-16) builds it's not the longest unichr escape.
4015
4016 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4017 so in the narrow (UTF-16) build case it's the longest unichr
4018 escape.
4019 */
4020
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004021 if (size == 0)
4022 return PyBytes_FromStringAndSize(NULL, 0);
4023
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004024 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004026
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004027 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 2
4029 + expandsize*size
4030 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 if (repr == NULL)
4032 return NULL;
4033
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004034 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 while (size-- > 0) {
4037 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004038
Walter Dörwald79e913e2007-05-12 11:08:06 +00004039 /* Escape backslashes */
4040 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 *p++ = '\\';
4042 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004043 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004044 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004045
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004046#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004047 /* Map 21-bit characters to '\U00xxxxxx' */
4048 else if (ch >= 0x10000) {
4049 *p++ = '\\';
4050 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004051 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4052 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4053 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4054 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4055 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4056 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4057 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4058 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004060 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004061#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4063 else if (ch >= 0xD800 && ch < 0xDC00) {
4064 Py_UNICODE ch2;
4065 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004066
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 ch2 = *s++;
4068 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004069 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4071 *p++ = '\\';
4072 *p++ = 'U';
4073 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4074 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4075 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4076 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4077 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4078 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4079 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4080 *p++ = hexdigits[ucs & 0x0000000F];
4081 continue;
4082 }
4083 /* Fall through: isolated surrogates are copied as-is */
4084 s--;
4085 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004086 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004087#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004088
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004090 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 *p++ = '\\';
4092 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004093 *p++ = hexdigits[(ch >> 12) & 0x000F];
4094 *p++ = hexdigits[(ch >> 8) & 0x000F];
4095 *p++ = hexdigits[(ch >> 4) & 0x000F];
4096 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004098
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004099 /* Map special whitespace to '\t', \n', '\r' */
4100 else if (ch == '\t') {
4101 *p++ = '\\';
4102 *p++ = 't';
4103 }
4104 else if (ch == '\n') {
4105 *p++ = '\\';
4106 *p++ = 'n';
4107 }
4108 else if (ch == '\r') {
4109 *p++ = '\\';
4110 *p++ = 'r';
4111 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004112
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004113 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004114 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004116 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004117 *p++ = hexdigits[(ch >> 4) & 0x000F];
4118 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004119 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004120
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 /* Copy everything else as-is */
4122 else
4123 *p++ = (char) ch;
4124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004126 assert(p - PyBytes_AS_STRING(repr) > 0);
4127 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4128 return NULL;
4129 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130}
4131
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004132PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004134 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 if (!PyUnicode_Check(unicode)) {
4136 PyErr_BadArgument();
4137 return NULL;
4138 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004139 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4140 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004141 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142}
4143
4144/* --- Raw Unicode Escape Codec ------------------------------------------- */
4145
4146PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 Py_ssize_t size,
4148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004151 Py_ssize_t startinpos;
4152 Py_ssize_t endinpos;
4153 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 const char *end;
4157 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 PyObject *errorHandler = NULL;
4159 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004160
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161 /* Escaped strings will always be longer than the resulting
4162 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 length after conversion to the true value. (But decoding error
4164 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 v = _PyUnicode_New(size);
4166 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 end = s + size;
4172 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 unsigned char c;
4174 Py_UCS4 x;
4175 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004176 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 /* Non-escape characters are interpreted as Unicode ordinals */
4179 if (*s != '\\') {
4180 *p++ = (unsigned char)*s++;
4181 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004182 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 startinpos = s-starts;
4184
4185 /* \u-escapes are only interpreted iff the number of leading
4186 backslashes if odd */
4187 bs = s;
4188 for (;s < end;) {
4189 if (*s != '\\')
4190 break;
4191 *p++ = (unsigned char)*s++;
4192 }
4193 if (((s - bs) & 1) == 0 ||
4194 s >= end ||
4195 (*s != 'u' && *s != 'U')) {
4196 continue;
4197 }
4198 p--;
4199 count = *s=='u' ? 4 : 8;
4200 s++;
4201
4202 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4203 outpos = p-PyUnicode_AS_UNICODE(v);
4204 for (x = 0, i = 0; i < count; ++i, ++s) {
4205 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004206 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 endinpos = s-starts;
4208 if (unicode_decode_call_errorhandler(
4209 errors, &errorHandler,
4210 "rawunicodeescape", "truncated \\uXXXX",
4211 &starts, &end, &startinpos, &endinpos, &exc, &s,
4212 &v, &outpos, &p))
4213 goto onError;
4214 goto nextByte;
4215 }
4216 x = (x<<4) & ~0xF;
4217 if (c >= '0' && c <= '9')
4218 x += c - '0';
4219 else if (c >= 'a' && c <= 'f')
4220 x += 10 + c - 'a';
4221 else
4222 x += 10 + c - 'A';
4223 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004224 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 /* UCS-2 character */
4226 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004227 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 /* UCS-4 character. Either store directly, or as
4229 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004230#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004232#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 x -= 0x10000L;
4234 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4235 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004236#endif
4237 } else {
4238 endinpos = s-starts;
4239 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004240 if (unicode_decode_call_errorhandler(
4241 errors, &errorHandler,
4242 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 &starts, &end, &startinpos, &endinpos, &exc, &s,
4244 &v, &outpos, &p))
4245 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004246 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 nextByte:
4248 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004250 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 Py_XDECREF(errorHandler);
4253 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004255
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 Py_XDECREF(errorHandler);
4259 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 return NULL;
4261}
4262
4263PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004264 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004266 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267 char *p;
4268 char *q;
4269
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004270#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004271 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004272#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004273 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004274#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004275
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004276 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004278
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004279 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 if (repr == NULL)
4281 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004282 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004283 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004285 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 while (size-- > 0) {
4287 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004288#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 /* Map 32-bit characters to '\Uxxxxxxxx' */
4290 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004291 *p++ = '\\';
4292 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004293 *p++ = hexdigits[(ch >> 28) & 0xf];
4294 *p++ = hexdigits[(ch >> 24) & 0xf];
4295 *p++ = hexdigits[(ch >> 20) & 0xf];
4296 *p++ = hexdigits[(ch >> 16) & 0xf];
4297 *p++ = hexdigits[(ch >> 12) & 0xf];
4298 *p++ = hexdigits[(ch >> 8) & 0xf];
4299 *p++ = hexdigits[(ch >> 4) & 0xf];
4300 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004301 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004302 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004303#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4305 if (ch >= 0xD800 && ch < 0xDC00) {
4306 Py_UNICODE ch2;
4307 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004308
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 ch2 = *s++;
4310 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004311 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4313 *p++ = '\\';
4314 *p++ = 'U';
4315 *p++ = hexdigits[(ucs >> 28) & 0xf];
4316 *p++ = hexdigits[(ucs >> 24) & 0xf];
4317 *p++ = hexdigits[(ucs >> 20) & 0xf];
4318 *p++ = hexdigits[(ucs >> 16) & 0xf];
4319 *p++ = hexdigits[(ucs >> 12) & 0xf];
4320 *p++ = hexdigits[(ucs >> 8) & 0xf];
4321 *p++ = hexdigits[(ucs >> 4) & 0xf];
4322 *p++ = hexdigits[ucs & 0xf];
4323 continue;
4324 }
4325 /* Fall through: isolated surrogates are copied as-is */
4326 s--;
4327 size++;
4328 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004329#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004330 /* Map 16-bit characters to '\uxxxx' */
4331 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 *p++ = '\\';
4333 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004334 *p++ = hexdigits[(ch >> 12) & 0xf];
4335 *p++ = hexdigits[(ch >> 8) & 0xf];
4336 *p++ = hexdigits[(ch >> 4) & 0xf];
4337 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 /* Copy everything else as-is */
4340 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 *p++ = (char) ch;
4342 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004343 size = p - q;
4344
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004345 assert(size > 0);
4346 if (_PyBytes_Resize(&repr, size) < 0)
4347 return NULL;
4348 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349}
4350
4351PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4352{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004353 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004355 PyErr_BadArgument();
4356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004358 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4359 PyUnicode_GET_SIZE(unicode));
4360
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004361 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362}
4363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004364/* --- Unicode Internal Codec ------------------------------------------- */
4365
4366PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 Py_ssize_t size,
4368 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004369{
4370 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004371 Py_ssize_t startinpos;
4372 Py_ssize_t endinpos;
4373 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004374 PyUnicodeObject *v;
4375 Py_UNICODE *p;
4376 const char *end;
4377 const char *reason;
4378 PyObject *errorHandler = NULL;
4379 PyObject *exc = NULL;
4380
Neal Norwitzd43069c2006-01-08 01:12:10 +00004381#ifdef Py_UNICODE_WIDE
4382 Py_UNICODE unimax = PyUnicode_GetMax();
4383#endif
4384
Thomas Wouters89f507f2006-12-13 04:49:30 +00004385 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004386 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4387 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004389 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004391 p = PyUnicode_AS_UNICODE(v);
4392 end = s + size;
4393
4394 while (s < end) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02004395 if (end-s < Py_UNICODE_SIZE) {
4396 endinpos = end-starts;
4397 reason = "truncated input";
4398 goto error;
4399 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00004400 memcpy(p, s, sizeof(Py_UNICODE));
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02004401#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004402 /* We have to sanity check the raw data, otherwise doom looms for
4403 some malformed UCS-4 data. */
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02004404 if (*p > unimax || *p < 0) {
4405 endinpos = s - starts + Py_UNICODE_SIZE;
4406 reason = "illegal code point (> 0x10FFFF)";
4407 goto error;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004408 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02004409#endif
4410 p++;
4411 s += Py_UNICODE_SIZE;
4412 continue;
4413
4414 error:
4415 startinpos = s - starts;
4416 outpos = p - PyUnicode_AS_UNICODE(v);
4417 if (unicode_decode_call_errorhandler(
4418 errors, &errorHandler,
4419 "unicode_internal", reason,
4420 &starts, &end, &startinpos, &endinpos, &exc, &s,
4421 &v, &outpos, &p)) {
4422 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004423 }
4424 }
4425
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004426 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004427 goto onError;
4428 Py_XDECREF(errorHandler);
4429 Py_XDECREF(exc);
4430 return (PyObject *)v;
4431
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004433 Py_XDECREF(v);
4434 Py_XDECREF(errorHandler);
4435 Py_XDECREF(exc);
4436 return NULL;
4437}
4438
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439/* --- Latin-1 Codec ------------------------------------------------------ */
4440
4441PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 Py_ssize_t size,
4443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444{
4445 PyUnicodeObject *v;
4446 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004447 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004448
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004450 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 Py_UNICODE r = *(unsigned char*)s;
4452 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004453 }
4454
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 v = _PyUnicode_New(size);
4456 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004461 e = s + size;
4462 /* Unrolling the copy makes it much faster by reducing the looping
4463 overhead. This is similar to what many memcpy() implementations do. */
4464 unrolled_end = e - 4;
4465 while (s < unrolled_end) {
4466 p[0] = (unsigned char) s[0];
4467 p[1] = (unsigned char) s[1];
4468 p[2] = (unsigned char) s[2];
4469 p[3] = (unsigned char) s[3];
4470 s += 4;
4471 p += 4;
4472 }
4473 while (s < e)
4474 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004476
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 Py_XDECREF(v);
4479 return NULL;
4480}
4481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482/* create or adjust a UnicodeEncodeError */
4483static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004484 const char *encoding,
4485 const Py_UNICODE *unicode, Py_ssize_t size,
4486 Py_ssize_t startpos, Py_ssize_t endpos,
4487 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 *exceptionObject = PyUnicodeEncodeError_Create(
4491 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 }
4493 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4495 goto onError;
4496 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4497 goto onError;
4498 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4499 goto onError;
4500 return;
4501 onError:
4502 Py_DECREF(*exceptionObject);
4503 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 }
4505}
4506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507/* raises a UnicodeEncodeError */
4508static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 const char *encoding,
4510 const Py_UNICODE *unicode, Py_ssize_t size,
4511 Py_ssize_t startpos, Py_ssize_t endpos,
4512 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513{
4514 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518}
4519
4520/* error handling callback helper:
4521 build arguments, call the callback and check the arguments,
4522 put the result into newpos and return the replacement string, which
4523 has to be freed by the caller */
4524static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 PyObject **errorHandler,
4526 const char *encoding, const char *reason,
4527 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4528 Py_ssize_t startpos, Py_ssize_t endpos,
4529 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004531 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532
4533 PyObject *restuple;
4534 PyObject *resunicode;
4535
4536 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 }
4541
4542 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546
4547 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004552 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 Py_DECREF(restuple);
4554 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004556 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004557 &resunicode, newpos)) {
4558 Py_DECREF(restuple);
4559 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004561 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4562 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4563 Py_DECREF(restuple);
4564 return NULL;
4565 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004568 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4570 Py_DECREF(restuple);
4571 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004572 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 Py_INCREF(resunicode);
4574 Py_DECREF(restuple);
4575 return resunicode;
4576}
4577
4578static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 Py_ssize_t size,
4580 const char *errors,
4581 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582{
4583 /* output object */
4584 PyObject *res;
4585 /* pointers to the beginning and end+1 of input */
4586 const Py_UNICODE *startp = p;
4587 const Py_UNICODE *endp = p + size;
4588 /* pointer to the beginning of the unencodable characters */
4589 /* const Py_UNICODE *badp = NULL; */
4590 /* pointer into the output */
4591 char *str;
4592 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004593 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004594 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4595 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 PyObject *errorHandler = NULL;
4597 PyObject *exc = NULL;
4598 /* the following variable is used for caching string comparisons
4599 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4600 int known_errorHandler = -1;
4601
4602 /* allocate enough for a simple encoding without
4603 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004604 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004605 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004606 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004607 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004608 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004609 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 ressize = size;
4611
4612 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 /* can we encode this? */
4616 if (c<limit) {
4617 /* no overflow check, because we know that the space is enough */
4618 *str++ = (char)c;
4619 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004620 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 else {
4622 Py_ssize_t unicodepos = p-startp;
4623 Py_ssize_t requiredsize;
4624 PyObject *repunicode;
4625 Py_ssize_t repsize;
4626 Py_ssize_t newpos;
4627 Py_ssize_t respos;
4628 Py_UNICODE *uni2;
4629 /* startpos for collecting unencodable chars */
4630 const Py_UNICODE *collstart = p;
4631 const Py_UNICODE *collend = p;
4632 /* find all unecodable characters */
4633 while ((collend < endp) && ((*collend)>=limit))
4634 ++collend;
4635 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4636 if (known_errorHandler==-1) {
4637 if ((errors==NULL) || (!strcmp(errors, "strict")))
4638 known_errorHandler = 1;
4639 else if (!strcmp(errors, "replace"))
4640 known_errorHandler = 2;
4641 else if (!strcmp(errors, "ignore"))
4642 known_errorHandler = 3;
4643 else if (!strcmp(errors, "xmlcharrefreplace"))
4644 known_errorHandler = 4;
4645 else
4646 known_errorHandler = 0;
4647 }
4648 switch (known_errorHandler) {
4649 case 1: /* strict */
4650 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4651 goto onError;
4652 case 2: /* replace */
4653 while (collstart++<collend)
4654 *str++ = '?'; /* fall through */
4655 case 3: /* ignore */
4656 p = collend;
4657 break;
4658 case 4: /* xmlcharrefreplace */
4659 respos = str - PyBytes_AS_STRING(res);
4660 /* determine replacement size (temporarily (mis)uses p) */
4661 for (p = collstart, repsize = 0; p < collend; ++p) {
4662 if (*p<10)
4663 repsize += 2+1+1;
4664 else if (*p<100)
4665 repsize += 2+2+1;
4666 else if (*p<1000)
4667 repsize += 2+3+1;
4668 else if (*p<10000)
4669 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004670#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 else
4672 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004673#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 else if (*p<100000)
4675 repsize += 2+5+1;
4676 else if (*p<1000000)
4677 repsize += 2+6+1;
4678 else
4679 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004680#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 }
4682 requiredsize = respos+repsize+(endp-collend);
4683 if (requiredsize > ressize) {
4684 if (requiredsize<2*ressize)
4685 requiredsize = 2*ressize;
4686 if (_PyBytes_Resize(&res, requiredsize))
4687 goto onError;
4688 str = PyBytes_AS_STRING(res) + respos;
4689 ressize = requiredsize;
4690 }
4691 /* generate replacement (temporarily (mis)uses p) */
4692 for (p = collstart; p < collend; ++p) {
4693 str += sprintf(str, "&#%d;", (int)*p);
4694 }
4695 p = collend;
4696 break;
4697 default:
4698 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4699 encoding, reason, startp, size, &exc,
4700 collstart-startp, collend-startp, &newpos);
4701 if (repunicode == NULL)
4702 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004703 if (PyBytes_Check(repunicode)) {
4704 /* Directly copy bytes result to output. */
4705 repsize = PyBytes_Size(repunicode);
4706 if (repsize > 1) {
4707 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004708 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004709 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4710 Py_DECREF(repunicode);
4711 goto onError;
4712 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004713 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004714 ressize += repsize-1;
4715 }
4716 memcpy(str, PyBytes_AsString(repunicode), repsize);
4717 str += repsize;
4718 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004719 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004720 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004721 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 /* need more space? (at least enough for what we
4723 have+the replacement+the rest of the string, so
4724 we won't have to check space for encodable characters) */
4725 respos = str - PyBytes_AS_STRING(res);
4726 repsize = PyUnicode_GET_SIZE(repunicode);
4727 requiredsize = respos+repsize+(endp-collend);
4728 if (requiredsize > ressize) {
4729 if (requiredsize<2*ressize)
4730 requiredsize = 2*ressize;
4731 if (_PyBytes_Resize(&res, requiredsize)) {
4732 Py_DECREF(repunicode);
4733 goto onError;
4734 }
4735 str = PyBytes_AS_STRING(res) + respos;
4736 ressize = requiredsize;
4737 }
4738 /* check if there is anything unencodable in the replacement
4739 and copy it to the output */
4740 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4741 c = *uni2;
4742 if (c >= limit) {
4743 raise_encode_exception(&exc, encoding, startp, size,
4744 unicodepos, unicodepos+1, reason);
4745 Py_DECREF(repunicode);
4746 goto onError;
4747 }
4748 *str = (char)c;
4749 }
4750 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004751 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004752 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004753 }
4754 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004755 /* Resize if we allocated to much */
4756 size = str - PyBytes_AS_STRING(res);
4757 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004758 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004759 if (_PyBytes_Resize(&res, size) < 0)
4760 goto onError;
4761 }
4762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 Py_XDECREF(errorHandler);
4764 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004765 return res;
4766
4767 onError:
4768 Py_XDECREF(res);
4769 Py_XDECREF(errorHandler);
4770 Py_XDECREF(exc);
4771 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772}
4773
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 Py_ssize_t size,
4776 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004778 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779}
4780
4781PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4782{
4783 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 PyErr_BadArgument();
4785 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 }
4787 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 PyUnicode_GET_SIZE(unicode),
4789 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790}
4791
4792/* --- 7-bit ASCII Codec -------------------------------------------------- */
4793
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004795 Py_ssize_t size,
4796 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 PyUnicodeObject *v;
4800 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004801 Py_ssize_t startinpos;
4802 Py_ssize_t endinpos;
4803 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 const char *e;
4805 PyObject *errorHandler = NULL;
4806 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004809 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 Py_UNICODE r = *(unsigned char*)s;
4811 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004812 }
Tim Petersced69f82003-09-16 20:30:58 +00004813
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 v = _PyUnicode_New(size);
4815 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 e = s + size;
4821 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 register unsigned char c = (unsigned char)*s;
4823 if (c < 128) {
4824 *p++ = c;
4825 ++s;
4826 }
4827 else {
4828 startinpos = s-starts;
4829 endinpos = startinpos + 1;
4830 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4831 if (unicode_decode_call_errorhandler(
4832 errors, &errorHandler,
4833 "ascii", "ordinal not in range(128)",
4834 &starts, &e, &startinpos, &endinpos, &exc, &s,
4835 &v, &outpos, &p))
4836 goto onError;
4837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004839 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4841 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 Py_XDECREF(errorHandler);
4843 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004845
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 Py_XDECREF(errorHandler);
4849 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 return NULL;
4851}
4852
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 Py_ssize_t size,
4855 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858}
4859
4860PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4861{
4862 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004863 PyErr_BadArgument();
4864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 }
4866 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 PyUnicode_GET_SIZE(unicode),
4868 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869}
4870
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004871#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004872
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004873/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004874
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004875#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004876#define NEED_RETRY
4877#endif
4878
4879/* XXX This code is limited to "true" double-byte encodings, as
4880 a) it assumes an incomplete character consists of a single byte, and
4881 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004883
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when the character
   before it (as located by CharPrev()) shows it is not itself the second
   half of a double-byte character: there is no previous character, the
   previous character does not start with a lead byte, or the previous
   character is a complete two-byte sequence. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4894
4895/*
4896 * Decode MBCS string into unicode object. If 'final' is set, converts
4897 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4898 */
4899static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 const char *s, /* MBCS string */
4901 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004902 int final,
4903 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004904{
4905 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004906 Py_ssize_t n;
4907 DWORD usize;
4908 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004909
4910 assert(size >= 0);
4911
Victor Stinner554f3f02010-06-16 23:33:54 +00004912 /* check and handle 'errors' arg */
4913 if (errors==NULL || strcmp(errors, "strict")==0)
4914 flags = MB_ERR_INVALID_CHARS;
4915 else if (strcmp(errors, "ignore")==0)
4916 flags = 0;
4917 else {
4918 PyErr_Format(PyExc_ValueError,
4919 "mbcs encoding does not support errors='%s'",
4920 errors);
4921 return -1;
4922 }
4923
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004924 /* Skip trailing lead-byte unless 'final' is set */
4925 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004927
4928 /* First get the size of the result */
4929 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004930 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4931 if (usize==0)
4932 goto mbcs_decode_error;
4933 } else
4934 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004935
4936 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 /* Create unicode object */
4938 *v = _PyUnicode_New(usize);
4939 if (*v == NULL)
4940 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004941 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004942 }
4943 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 /* Extend unicode object */
4945 n = PyUnicode_GET_SIZE(*v);
4946 if (_PyUnicode_Resize(v, n + usize) < 0)
4947 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004948 }
4949
4950 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004951 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004953 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4954 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004956 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004957 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004958
4959mbcs_decode_error:
4960 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4961 we raise a UnicodeDecodeError - else it is a 'generic'
4962 windows error
4963 */
4964 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4965 /* Ideally, we should get reason from FormatMessage - this
4966 is the Windows 2000 English version of the message
4967 */
4968 PyObject *exc = NULL;
4969 const char *reason = "No mapping for the Unicode character exists "
4970 "in the target multi-byte code page.";
4971 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4972 if (exc != NULL) {
4973 PyCodec_StrictErrors(exc);
4974 Py_DECREF(exc);
4975 }
4976 } else {
4977 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4978 }
4979 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004980}
4981
4982PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 Py_ssize_t size,
4984 const char *errors,
4985 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004986{
4987 PyUnicodeObject *v = NULL;
4988 int done;
4989
4990 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004991 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004992
4993#ifdef NEED_RETRY
4994 retry:
4995 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004996 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004997 else
4998#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004999 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000
5001 if (done < 0) {
5002 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005004 }
5005
5006 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005008
5009#ifdef NEED_RETRY
5010 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 s += done;
5012 size -= done;
5013 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005014 }
5015#endif
5016
5017 return (PyObject *)v;
5018}
5019
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005020PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 Py_ssize_t size,
5022 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005023{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005024 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5025}
5026
5027/*
5028 * Convert unicode into string object (MBCS).
5029 * Returns 0 if succeed, -1 otherwise.
5030 */
5031static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005033 int size, /* size of unicode */
5034 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005035{
Victor Stinner554f3f02010-06-16 23:33:54 +00005036 BOOL usedDefaultChar = FALSE;
5037 BOOL *pusedDefaultChar;
5038 int mbcssize;
5039 Py_ssize_t n;
5040 PyObject *exc = NULL;
5041 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005042
5043 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005044
Victor Stinner554f3f02010-06-16 23:33:54 +00005045 /* check and handle 'errors' arg */
5046 if (errors==NULL || strcmp(errors, "strict")==0) {
5047 flags = WC_NO_BEST_FIT_CHARS;
5048 pusedDefaultChar = &usedDefaultChar;
5049 } else if (strcmp(errors, "replace")==0) {
5050 flags = 0;
5051 pusedDefaultChar = NULL;
5052 } else {
5053 PyErr_Format(PyExc_ValueError,
5054 "mbcs encoding does not support errors='%s'",
5055 errors);
5056 return -1;
5057 }
5058
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005059 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005060 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005061 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5062 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 if (mbcssize == 0) {
5064 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5065 return -1;
5066 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005067 /* If we used a default char, then we failed! */
5068 if (pusedDefaultChar && *pusedDefaultChar)
5069 goto mbcs_encode_error;
5070 } else {
5071 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005072 }
5073
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005074 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 /* Create string object */
5076 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5077 if (*repr == NULL)
5078 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005079 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005080 }
5081 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 /* Extend string object */
5083 n = PyBytes_Size(*repr);
5084 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5085 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005086 }
5087
5088 /* Do the conversion */
5089 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005091 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5092 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5094 return -1;
5095 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005096 if (pusedDefaultChar && *pusedDefaultChar)
5097 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005098 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005099 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005100
5101mbcs_encode_error:
5102 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5103 Py_XDECREF(exc);
5104 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005105}
5106
5107PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 Py_ssize_t size,
5109 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005110{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005111 PyObject *repr = NULL;
5112 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005113
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005114#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005116 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005117 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005118 else
5119#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005120 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005121
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005122 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 Py_XDECREF(repr);
5124 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005125 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005126
5127#ifdef NEED_RETRY
5128 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 p += INT_MAX;
5130 size -= INT_MAX;
5131 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005132 }
5133#endif
5134
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005135 return repr;
5136}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005137
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005138PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5139{
5140 if (!PyUnicode_Check(unicode)) {
5141 PyErr_BadArgument();
5142 return NULL;
5143 }
5144 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 PyUnicode_GET_SIZE(unicode),
5146 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005147}
5148
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005149#undef NEED_RETRY
5150
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005151#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005152
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153/* --- Character Mapping Codec -------------------------------------------- */
5154
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 Py_ssize_t size,
5157 PyObject *mapping,
5158 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005160 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005161 Py_ssize_t startinpos;
5162 Py_ssize_t endinpos;
5163 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005164 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 PyUnicodeObject *v;
5166 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005167 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168 PyObject *errorHandler = NULL;
5169 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005170 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005171 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005172
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173 /* Default to Latin-1 */
5174 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176
5177 v = _PyUnicode_New(size);
5178 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005183 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005184 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 mapstring = PyUnicode_AS_UNICODE(mapping);
5186 maplen = PyUnicode_GET_SIZE(mapping);
5187 while (s < e) {
5188 unsigned char ch = *s;
5189 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 if (ch < maplen)
5192 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 if (x == 0xfffe) {
5195 /* undefined mapping */
5196 outpos = p-PyUnicode_AS_UNICODE(v);
5197 startinpos = s-starts;
5198 endinpos = startinpos+1;
5199 if (unicode_decode_call_errorhandler(
5200 errors, &errorHandler,
5201 "charmap", "character maps to <undefined>",
5202 &starts, &e, &startinpos, &endinpos, &exc, &s,
5203 &v, &outpos, &p)) {
5204 goto onError;
5205 }
5206 continue;
5207 }
5208 *p++ = x;
5209 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005210 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005211 }
5212 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 while (s < e) {
5214 unsigned char ch = *s;
5215 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005216
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5218 w = PyLong_FromLong((long)ch);
5219 if (w == NULL)
5220 goto onError;
5221 x = PyObject_GetItem(mapping, w);
5222 Py_DECREF(w);
5223 if (x == NULL) {
5224 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5225 /* No mapping found means: mapping is undefined. */
5226 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005227 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 } else
5229 goto onError;
5230 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005231
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005233 if (x == Py_None)
5234 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 if (PyLong_Check(x)) {
5236 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005237 if (value == 0xFFFE)
5238 goto Undefined;
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005239 if (value < 0 || value > 0x10FFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 PyErr_SetString(PyExc_TypeError,
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005241 "character mapping must be in range(0x110000)");
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 Py_DECREF(x);
5243 goto onError;
5244 }
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005245
5246#ifndef Py_UNICODE_WIDE
5247 if (value > 0xFFFF) {
5248 /* see the code for 1-n mapping below */
5249 if (extrachars < 2) {
5250 /* resize first */
5251 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5252 Py_ssize_t needed = 10 - extrachars;
5253 extrachars += needed;
5254 /* XXX overflow detection missing */
5255 if (_PyUnicode_Resize(&v,
5256 PyUnicode_GET_SIZE(v) + needed) < 0) {
5257 Py_DECREF(x);
5258 goto onError;
5259 }
5260 p = PyUnicode_AS_UNICODE(v) + oldpos;
5261 }
5262 value -= 0x10000;
5263 *p++ = 0xD800 | (value >> 10);
5264 *p++ = 0xDC00 | (value & 0x3FF);
5265 extrachars -= 2;
5266 }
5267 else
5268#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 *p++ = (Py_UNICODE)value;
5270 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 else if (PyUnicode_Check(x)) {
5272 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005273
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005274 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 /* 1-1 mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005276 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
5277 if (value == 0xFFFE)
5278 goto Undefined;
5279 *p++ = value;
5280 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 else if (targetsize > 1) {
5282 /* 1-n mapping */
5283 if (targetsize > extrachars) {
5284 /* resize first */
5285 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5286 Py_ssize_t needed = (targetsize - extrachars) + \
5287 (targetsize << 2);
5288 extrachars += needed;
5289 /* XXX overflow detection missing */
5290 if (_PyUnicode_Resize(&v,
5291 PyUnicode_GET_SIZE(v) + needed) < 0) {
5292 Py_DECREF(x);
5293 goto onError;
5294 }
5295 p = PyUnicode_AS_UNICODE(v) + oldpos;
5296 }
5297 Py_UNICODE_COPY(p,
5298 PyUnicode_AS_UNICODE(x),
5299 targetsize);
5300 p += targetsize;
5301 extrachars -= targetsize;
5302 }
5303 /* 1-0 mapping: skip the character */
5304 }
5305 else {
5306 /* wrong return value */
5307 PyErr_SetString(PyExc_TypeError,
5308 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005309 Py_DECREF(x);
5310 goto onError;
5311 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 Py_DECREF(x);
5313 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005314 continue;
5315Undefined:
5316 /* undefined mapping */
5317 Py_XDECREF(x);
5318 outpos = p-PyUnicode_AS_UNICODE(v);
5319 startinpos = s-starts;
5320 endinpos = startinpos+1;
5321 if (unicode_decode_call_errorhandler(
5322 errors, &errorHandler,
5323 "charmap", "character maps to <undefined>",
5324 &starts, &e, &startinpos, &endinpos, &exc, &s,
5325 &v, &outpos, &p)) {
5326 goto onError;
5327 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 }
5330 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5332 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005333 Py_XDECREF(errorHandler);
5334 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005336
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338 Py_XDECREF(errorHandler);
5339 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 Py_XDECREF(v);
5341 return NULL;
5342}
5343
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005344/* Charmap encoding: the lookup table */
5345
5346struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 PyObject_HEAD
5348 unsigned char level1[32];
5349 int count2, count3;
5350 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005351};
5352
5353static PyObject*
5354encoding_map_size(PyObject *obj, PyObject* args)
5355{
5356 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005357 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005359}
5360
5361static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005362 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 PyDoc_STR("Return the size (in bytes) of this object") },
5364 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005365};
5366
5367static void
5368encoding_map_dealloc(PyObject* o)
5369{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005370 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005371}
5372
5373static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005374 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 "EncodingMap", /*tp_name*/
5376 sizeof(struct encoding_map), /*tp_basicsize*/
5377 0, /*tp_itemsize*/
5378 /* methods */
5379 encoding_map_dealloc, /*tp_dealloc*/
5380 0, /*tp_print*/
5381 0, /*tp_getattr*/
5382 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005383 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 0, /*tp_repr*/
5385 0, /*tp_as_number*/
5386 0, /*tp_as_sequence*/
5387 0, /*tp_as_mapping*/
5388 0, /*tp_hash*/
5389 0, /*tp_call*/
5390 0, /*tp_str*/
5391 0, /*tp_getattro*/
5392 0, /*tp_setattro*/
5393 0, /*tp_as_buffer*/
5394 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5395 0, /*tp_doc*/
5396 0, /*tp_traverse*/
5397 0, /*tp_clear*/
5398 0, /*tp_richcompare*/
5399 0, /*tp_weaklistoffset*/
5400 0, /*tp_iter*/
5401 0, /*tp_iternext*/
5402 encoding_map_methods, /*tp_methods*/
5403 0, /*tp_members*/
5404 0, /*tp_getset*/
5405 0, /*tp_base*/
5406 0, /*tp_dict*/
5407 0, /*tp_descr_get*/
5408 0, /*tp_descr_set*/
5409 0, /*tp_dictoffset*/
5410 0, /*tp_init*/
5411 0, /*tp_alloc*/
5412 0, /*tp_new*/
5413 0, /*tp_free*/
5414 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005415};
5416
5417PyObject*
5418PyUnicode_BuildEncodingMap(PyObject* string)
5419{
5420 Py_UNICODE *decode;
5421 PyObject *result;
5422 struct encoding_map *mresult;
5423 int i;
5424 int need_dict = 0;
5425 unsigned char level1[32];
5426 unsigned char level2[512];
5427 unsigned char *mlevel1, *mlevel2, *mlevel3;
5428 int count2 = 0, count3 = 0;
5429
5430 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5431 PyErr_BadArgument();
5432 return NULL;
5433 }
5434 decode = PyUnicode_AS_UNICODE(string);
5435 memset(level1, 0xFF, sizeof level1);
5436 memset(level2, 0xFF, sizeof level2);
5437
5438 /* If there isn't a one-to-one mapping of NULL to \0,
5439 or if there are non-BMP characters, we need to use
5440 a mapping dictionary. */
5441 if (decode[0] != 0)
5442 need_dict = 1;
5443 for (i = 1; i < 256; i++) {
5444 int l1, l2;
5445 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005446#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005447 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005448#endif
5449 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005450 need_dict = 1;
5451 break;
5452 }
5453 if (decode[i] == 0xFFFE)
5454 /* unmapped character */
5455 continue;
5456 l1 = decode[i] >> 11;
5457 l2 = decode[i] >> 7;
5458 if (level1[l1] == 0xFF)
5459 level1[l1] = count2++;
5460 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005461 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005462 }
5463
5464 if (count2 >= 0xFF || count3 >= 0xFF)
5465 need_dict = 1;
5466
5467 if (need_dict) {
5468 PyObject *result = PyDict_New();
5469 PyObject *key, *value;
5470 if (!result)
5471 return NULL;
5472 for (i = 0; i < 256; i++) {
5473 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005474 key = PyLong_FromLong(decode[i]);
5475 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005476 if (!key || !value)
5477 goto failed1;
5478 if (PyDict_SetItem(result, key, value) == -1)
5479 goto failed1;
5480 Py_DECREF(key);
5481 Py_DECREF(value);
5482 }
5483 return result;
5484 failed1:
5485 Py_XDECREF(key);
5486 Py_XDECREF(value);
5487 Py_DECREF(result);
5488 return NULL;
5489 }
5490
5491 /* Create a three-level trie */
5492 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5493 16*count2 + 128*count3 - 1);
5494 if (!result)
5495 return PyErr_NoMemory();
5496 PyObject_Init(result, &EncodingMapType);
5497 mresult = (struct encoding_map*)result;
5498 mresult->count2 = count2;
5499 mresult->count3 = count3;
5500 mlevel1 = mresult->level1;
5501 mlevel2 = mresult->level23;
5502 mlevel3 = mresult->level23 + 16*count2;
5503 memcpy(mlevel1, level1, 32);
5504 memset(mlevel2, 0xFF, 16*count2);
5505 memset(mlevel3, 0, 128*count3);
5506 count3 = 0;
5507 for (i = 1; i < 256; i++) {
5508 int o1, o2, o3, i2, i3;
5509 if (decode[i] == 0xFFFE)
5510 /* unmapped character */
5511 continue;
5512 o1 = decode[i]>>11;
5513 o2 = (decode[i]>>7) & 0xF;
5514 i2 = 16*mlevel1[o1] + o2;
5515 if (mlevel2[i2] == 0xFF)
5516 mlevel2[i2] = count3++;
5517 o3 = decode[i] & 0x7F;
5518 i3 = 128*mlevel2[i2] + o3;
5519 mlevel3[i3] = i;
5520 }
5521 return result;
5522}
5523
5524static int
5525encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5526{
5527 struct encoding_map *map = (struct encoding_map*)mapping;
5528 int l1 = c>>11;
5529 int l2 = (c>>7) & 0xF;
5530 int l3 = c & 0x7F;
5531 int i;
5532
5533#ifdef Py_UNICODE_WIDE
5534 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005536 }
5537#endif
5538 if (c == 0)
5539 return 0;
5540 /* level 1*/
5541 i = map->level1[l1];
5542 if (i == 0xFF) {
5543 return -1;
5544 }
5545 /* level 2*/
5546 i = map->level23[16*i+l2];
5547 if (i == 0xFF) {
5548 return -1;
5549 }
5550 /* level 3 */
5551 i = map->level23[16*map->count2 + 128*i + l3];
5552 if (i == 0) {
5553 return -1;
5554 }
5555 return i;
5556}
5557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558/* Lookup the character ch in the mapping. If the character
5559 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005560 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562{
Christian Heimes217cfd12007-12-02 14:31:20 +00005563 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564 PyObject *x;
5565
5566 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005568 x = PyObject_GetItem(mapping, w);
5569 Py_DECREF(w);
5570 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5572 /* No mapping found means: mapping is undefined. */
5573 PyErr_Clear();
5574 x = Py_None;
5575 Py_INCREF(x);
5576 return x;
5577 } else
5578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005580 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005582 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 long value = PyLong_AS_LONG(x);
5584 if (value < 0 || value > 255) {
5585 PyErr_SetString(PyExc_TypeError,
5586 "character mapping must be in range(256)");
5587 Py_DECREF(x);
5588 return NULL;
5589 }
5590 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005592 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 /* wrong return value */
5596 PyErr_Format(PyExc_TypeError,
5597 "character mapping must return integer, bytes or None, not %.400s",
5598 x->ob_type->tp_name);
5599 Py_DECREF(x);
5600 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 }
5602}
5603
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005604static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005605charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005606{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005607 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5608 /* exponentially overallocate to minimize reallocations */
5609 if (requiredsize < 2*outsize)
5610 requiredsize = 2*outsize;
5611 if (_PyBytes_Resize(outobj, requiredsize))
5612 return -1;
5613 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005614}
5615
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* the lookup or a resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005620 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005621 space is available. Return a new reference to the object that
5622 was put in the output buffer, or Py_None, if the mapping was undefined
5623 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005624 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005626charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005629 PyObject *rep;
5630 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005631 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632
Christian Heimes90aa7642007-12-19 02:45:37 +00005633 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005634 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005636 if (res == -1)
5637 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 if (outsize<requiredsize)
5639 if (charmapencode_resize(outobj, outpos, requiredsize))
5640 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005641 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 outstart[(*outpos)++] = (char)res;
5643 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005644 }
5645
5646 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005649 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 Py_DECREF(rep);
5651 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005652 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 if (PyLong_Check(rep)) {
5654 Py_ssize_t requiredsize = *outpos+1;
5655 if (outsize<requiredsize)
5656 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5657 Py_DECREF(rep);
5658 return enc_EXCEPTION;
5659 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005660 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005662 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 else {
5664 const char *repchars = PyBytes_AS_STRING(rep);
5665 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5666 Py_ssize_t requiredsize = *outpos+repsize;
5667 if (outsize<requiredsize)
5668 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5669 Py_DECREF(rep);
5670 return enc_EXCEPTION;
5671 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005672 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 memcpy(outstart + *outpos, repchars, repsize);
5674 *outpos += repsize;
5675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005677 Py_DECREF(rep);
5678 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679}
5680
5681/* handle an error in PyUnicode_EncodeCharmap
5682 Return 0 on success, -1 on error */
5683static
5684int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005685 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005687 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005688 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689{
5690 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005691 Py_ssize_t repsize;
5692 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 Py_UNICODE *uni2;
5694 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005695 Py_ssize_t collstartpos = *inpos;
5696 Py_ssize_t collendpos = *inpos+1;
5697 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 char *encoding = "charmap";
5699 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005700 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 /* find all unencodable characters */
5703 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005704 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005705 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 int res = encoding_map_lookup(p[collendpos], mapping);
5707 if (res != -1)
5708 break;
5709 ++collendpos;
5710 continue;
5711 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005712
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 rep = charmapencode_lookup(p[collendpos], mapping);
5714 if (rep==NULL)
5715 return -1;
5716 else if (rep!=Py_None) {
5717 Py_DECREF(rep);
5718 break;
5719 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005720 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 }
5723 /* cache callback name lookup
5724 * (if not done yet, i.e. it's the first error) */
5725 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 if ((errors==NULL) || (!strcmp(errors, "strict")))
5727 *known_errorHandler = 1;
5728 else if (!strcmp(errors, "replace"))
5729 *known_errorHandler = 2;
5730 else if (!strcmp(errors, "ignore"))
5731 *known_errorHandler = 3;
5732 else if (!strcmp(errors, "xmlcharrefreplace"))
5733 *known_errorHandler = 4;
5734 else
5735 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736 }
5737 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005738 case 1: /* strict */
5739 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5740 return -1;
5741 case 2: /* replace */
5742 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 x = charmapencode_output('?', mapping, res, respos);
5744 if (x==enc_EXCEPTION) {
5745 return -1;
5746 }
5747 else if (x==enc_FAILED) {
5748 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5749 return -1;
5750 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005751 }
5752 /* fall through */
5753 case 3: /* ignore */
5754 *inpos = collendpos;
5755 break;
5756 case 4: /* xmlcharrefreplace */
5757 /* generate replacement (temporarily (mis)uses p) */
5758 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 char buffer[2+29+1+1];
5760 char *cp;
5761 sprintf(buffer, "&#%d;", (int)p[collpos]);
5762 for (cp = buffer; *cp; ++cp) {
5763 x = charmapencode_output(*cp, mapping, res, respos);
5764 if (x==enc_EXCEPTION)
5765 return -1;
5766 else if (x==enc_FAILED) {
5767 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5768 return -1;
5769 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005770 }
5771 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005772 *inpos = collendpos;
5773 break;
5774 default:
5775 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 encoding, reason, p, size, exceptionObject,
5777 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005778 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005780 if (PyBytes_Check(repunicode)) {
5781 /* Directly copy bytes result to output. */
5782 Py_ssize_t outsize = PyBytes_Size(*res);
5783 Py_ssize_t requiredsize;
5784 repsize = PyBytes_Size(repunicode);
5785 requiredsize = *respos + repsize;
5786 if (requiredsize > outsize)
5787 /* Make room for all additional bytes. */
5788 if (charmapencode_resize(res, respos, requiredsize)) {
5789 Py_DECREF(repunicode);
5790 return -1;
5791 }
5792 memcpy(PyBytes_AsString(*res) + *respos,
5793 PyBytes_AsString(repunicode), repsize);
5794 *respos += repsize;
5795 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005796 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005797 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005798 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005799 /* generate replacement */
5800 repsize = PyUnicode_GET_SIZE(repunicode);
5801 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 x = charmapencode_output(*uni2, mapping, res, respos);
5803 if (x==enc_EXCEPTION) {
5804 return -1;
5805 }
5806 else if (x==enc_FAILED) {
5807 Py_DECREF(repunicode);
5808 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5809 return -1;
5810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005811 }
5812 *inpos = newpos;
5813 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005814 }
5815 return 0;
5816}
5817
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 Py_ssize_t size,
5820 PyObject *mapping,
5821 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 /* output object */
5824 PyObject *res = NULL;
5825 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005826 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005827 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005828 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 PyObject *errorHandler = NULL;
5830 PyObject *exc = NULL;
5831 /* the following variable is used for caching string comparisons
5832 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5833 * 3=ignore, 4=xmlcharrefreplace */
5834 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835
5836 /* Default to Latin-1 */
5837 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840 /* allocate enough for a simple encoding without
5841 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005842 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843 if (res == NULL)
5844 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005845 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 /* try to encode it */
5850 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5851 if (x==enc_EXCEPTION) /* error */
5852 goto onError;
5853 if (x==enc_FAILED) { /* unencodable character */
5854 if (charmap_encoding_error(p, size, &inpos, mapping,
5855 &exc,
5856 &known_errorHandler, &errorHandler, errors,
5857 &res, &respos)) {
5858 goto onError;
5859 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005860 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 else
5862 /* done with this character => adjust input position */
5863 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005867 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005868 if (_PyBytes_Resize(&res, respos) < 0)
5869 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 Py_XDECREF(exc);
5872 Py_XDECREF(errorHandler);
5873 return res;
5874
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 Py_XDECREF(res);
5877 Py_XDECREF(exc);
5878 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 return NULL;
5880}
5881
5882PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884{
5885 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 PyErr_BadArgument();
5887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 }
5889 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 PyUnicode_GET_SIZE(unicode),
5891 mapping,
5892 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893}
5894
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895/* create or adjust a UnicodeTranslateError */
5896static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 const Py_UNICODE *unicode, Py_ssize_t size,
5898 Py_ssize_t startpos, Py_ssize_t endpos,
5899 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005902 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
5905 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5907 goto onError;
5908 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5909 goto onError;
5910 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5911 goto onError;
5912 return;
5913 onError:
5914 Py_DECREF(*exceptionObject);
5915 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 }
5917}
5918
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919/* raises a UnicodeTranslateError */
5920static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 const Py_UNICODE *unicode, Py_ssize_t size,
5922 Py_ssize_t startpos, Py_ssize_t endpos,
5923 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924{
5925 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929}
5930
5931/* error handling callback helper:
5932 build arguments, call the callback and check the arguments,
5933 put the result into newpos and return the replacement string, which
5934 has to be freed by the caller */
5935static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 PyObject **errorHandler,
5937 const char *reason,
5938 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5939 Py_ssize_t startpos, Py_ssize_t endpos,
5940 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005942 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005944 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005945 PyObject *restuple;
5946 PyObject *resunicode;
5947
5948 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005950 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005952 }
5953
5954 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958
5959 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005964 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 Py_DECREF(restuple);
5966 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967 }
5968 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 &resunicode, &i_newpos)) {
5970 Py_DECREF(restuple);
5971 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005973 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005975 else
5976 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005977 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5979 Py_DECREF(restuple);
5980 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005981 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005982 Py_INCREF(resunicode);
5983 Py_DECREF(restuple);
5984 return resunicode;
5985}
5986
5987/* Lookup the character ch in the mapping and put the result in result,
5988 which must be decrefed by the caller.
5989 Return 0 on success, -1 on error */
5990static
5991int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5992{
Christian Heimes217cfd12007-12-02 14:31:20 +00005993 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 PyObject *x;
5995
5996 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 x = PyObject_GetItem(mapping, w);
5999 Py_DECREF(w);
6000 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6002 /* No mapping found means: use 1:1 mapping. */
6003 PyErr_Clear();
6004 *result = NULL;
6005 return 0;
6006 } else
6007 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 }
6009 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 *result = x;
6011 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006012 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006013 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 long value = PyLong_AS_LONG(x);
6015 long max = PyUnicode_GetMax();
6016 if (value < 0 || value > max) {
6017 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006018 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 Py_DECREF(x);
6020 return -1;
6021 }
6022 *result = x;
6023 return 0;
6024 }
6025 else if (PyUnicode_Check(x)) {
6026 *result = x;
6027 return 0;
6028 }
6029 else {
6030 /* wrong return value */
6031 PyErr_SetString(PyExc_TypeError,
6032 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006033 Py_DECREF(x);
6034 return -1;
6035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036}
6037/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 if not reallocate and adjust various state variables.
6039 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040static
Walter Dörwald4894c302003-10-24 14:25:28 +00006041int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006044 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006045 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 /* remember old output position */
6047 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6048 /* exponentially overallocate to minimize reallocations */
6049 if (requiredsize < 2 * oldsize)
6050 requiredsize = 2 * oldsize;
6051 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6052 return -1;
6053 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 }
6055 return 0;
6056}
6057/* lookup the character, put the result in the output string and adjust
6058 various state variables. Return a new reference to the object that
6059 was put in the output buffer in *result, or Py_None, if the mapping was
6060 undefined (in which case no character was written).
6061 The called must decref result.
6062 Return 0 on success, -1 on error. */
6063static
Walter Dörwald4894c302003-10-24 14:25:28 +00006064int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6066 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067{
Walter Dörwald4894c302003-10-24 14:25:28 +00006068 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 /* not found => default to 1:1 mapping */
6072 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006073 }
6074 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006076 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 /* no overflow check, because we know that the space is enough */
6078 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 }
6080 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6082 if (repsize==1) {
6083 /* no overflow check, because we know that the space is enough */
6084 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6085 }
6086 else if (repsize!=0) {
6087 /* more than one character */
6088 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6089 (insize - (curinp-startinp)) +
6090 repsize - 1;
6091 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6092 return -1;
6093 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6094 *outp += repsize;
6095 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006096 }
6097 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099 return 0;
6100}
6101
6102PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 Py_ssize_t size,
6104 PyObject *mapping,
6105 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107 /* output object */
6108 PyObject *res = NULL;
6109 /* pointers to the beginning and end+1 of input */
6110 const Py_UNICODE *startp = p;
6111 const Py_UNICODE *endp = p + size;
6112 /* pointer into the output */
6113 Py_UNICODE *str;
6114 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006115 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006116 char *reason = "character maps to <undefined>";
6117 PyObject *errorHandler = NULL;
6118 PyObject *exc = NULL;
6119 /* the following variable is used for caching string comparisons
6120 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6121 * 3=ignore, 4=xmlcharrefreplace */
6122 int known_errorHandler = -1;
6123
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 PyErr_BadArgument();
6126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128
6129 /* allocate enough for a simple 1:1 translation without
6130 replacements, if we need more, we'll resize */
6131 res = PyUnicode_FromUnicode(NULL, size);
6132 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 /* try to encode it */
6140 PyObject *x = NULL;
6141 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6142 Py_XDECREF(x);
6143 goto onError;
6144 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006145 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 if (x!=Py_None) /* it worked => adjust input pointer */
6147 ++p;
6148 else { /* untranslatable character */
6149 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6150 Py_ssize_t repsize;
6151 Py_ssize_t newpos;
6152 Py_UNICODE *uni2;
6153 /* startpos for collecting untranslatable chars */
6154 const Py_UNICODE *collstart = p;
6155 const Py_UNICODE *collend = p+1;
6156 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 /* find all untranslatable characters */
6159 while (collend < endp) {
6160 if (charmaptranslate_lookup(*collend, mapping, &x))
6161 goto onError;
6162 Py_XDECREF(x);
6163 if (x!=Py_None)
6164 break;
6165 ++collend;
6166 }
6167 /* cache callback name lookup
6168 * (if not done yet, i.e. it's the first error) */
6169 if (known_errorHandler==-1) {
6170 if ((errors==NULL) || (!strcmp(errors, "strict")))
6171 known_errorHandler = 1;
6172 else if (!strcmp(errors, "replace"))
6173 known_errorHandler = 2;
6174 else if (!strcmp(errors, "ignore"))
6175 known_errorHandler = 3;
6176 else if (!strcmp(errors, "xmlcharrefreplace"))
6177 known_errorHandler = 4;
6178 else
6179 known_errorHandler = 0;
6180 }
6181 switch (known_errorHandler) {
6182 case 1: /* strict */
6183 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006184 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 case 2: /* replace */
6186 /* No need to check for space, this is a 1:1 replacement */
6187 for (coll = collstart; coll<collend; ++coll)
6188 *str++ = '?';
6189 /* fall through */
6190 case 3: /* ignore */
6191 p = collend;
6192 break;
6193 case 4: /* xmlcharrefreplace */
6194 /* generate replacement (temporarily (mis)uses p) */
6195 for (p = collstart; p < collend; ++p) {
6196 char buffer[2+29+1+1];
6197 char *cp;
6198 sprintf(buffer, "&#%d;", (int)*p);
6199 if (charmaptranslate_makespace(&res, &str,
6200 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6201 goto onError;
6202 for (cp = buffer; *cp; ++cp)
6203 *str++ = *cp;
6204 }
6205 p = collend;
6206 break;
6207 default:
6208 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6209 reason, startp, size, &exc,
6210 collstart-startp, collend-startp, &newpos);
6211 if (repunicode == NULL)
6212 goto onError;
6213 /* generate replacement */
6214 repsize = PyUnicode_GET_SIZE(repunicode);
6215 if (charmaptranslate_makespace(&res, &str,
6216 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6217 Py_DECREF(repunicode);
6218 goto onError;
6219 }
6220 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6221 *str++ = *uni2;
6222 p = startp + newpos;
6223 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006224 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006225 }
6226 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006227 /* Resize if we allocated to much */
6228 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006229 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 if (PyUnicode_Resize(&res, respos) < 0)
6231 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006232 }
6233 Py_XDECREF(exc);
6234 Py_XDECREF(errorHandler);
6235 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238 Py_XDECREF(res);
6239 Py_XDECREF(exc);
6240 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 return NULL;
6242}
6243
6244PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 PyObject *mapping,
6246 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247{
6248 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006249
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 str = PyUnicode_FromObject(str);
6251 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 PyUnicode_GET_SIZE(str),
6255 mapping,
6256 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 Py_DECREF(str);
6258 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006259
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 Py_XDECREF(str);
6262 return NULL;
6263}
Tim Petersced69f82003-09-16 20:30:58 +00006264
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006265PyObject *
6266PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6267 Py_ssize_t length)
6268{
6269 PyObject *result;
6270 Py_UNICODE *p; /* write pointer into result */
6271 Py_ssize_t i;
6272 /* Copy to a new string */
6273 result = (PyObject *)_PyUnicode_New(length);
6274 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6275 if (result == NULL)
6276 return result;
6277 p = PyUnicode_AS_UNICODE(result);
6278 /* Iterate over code points */
6279 for (i = 0; i < length; i++) {
6280 Py_UNICODE ch =s[i];
6281 if (ch > 127) {
6282 int decimal = Py_UNICODE_TODECIMAL(ch);
6283 if (decimal >= 0)
6284 p[i] = '0' + decimal;
6285 }
6286 }
6287 return result;
6288}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006289/* --- Decimal Encoder ---------------------------------------------------- */
6290
6291int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 Py_ssize_t length,
6293 char *output,
6294 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006295{
6296 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006297 PyObject *errorHandler = NULL;
6298 PyObject *exc = NULL;
6299 const char *encoding = "decimal";
6300 const char *reason = "invalid decimal Unicode string";
6301 /* the following variable is used for caching string comparisons
6302 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6303 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006304
6305 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 PyErr_BadArgument();
6307 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006308 }
6309
6310 p = s;
6311 end = s + length;
6312 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 register Py_UNICODE ch = *p;
6314 int decimal;
6315 PyObject *repunicode;
6316 Py_ssize_t repsize;
6317 Py_ssize_t newpos;
6318 Py_UNICODE *uni2;
6319 Py_UNICODE *collstart;
6320 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006321
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006323 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 ++p;
6325 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006326 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 decimal = Py_UNICODE_TODECIMAL(ch);
6328 if (decimal >= 0) {
6329 *output++ = '0' + decimal;
6330 ++p;
6331 continue;
6332 }
6333 if (0 < ch && ch < 256) {
6334 *output++ = (char)ch;
6335 ++p;
6336 continue;
6337 }
6338 /* All other characters are considered unencodable */
6339 collstart = p;
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006340 for (collend = p+1; collend < end; collend++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 if ((0 < *collend && *collend < 256) ||
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006342 Py_UNICODE_ISSPACE(*collend) ||
6343 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 break;
6345 }
6346 /* cache callback name lookup
6347 * (if not done yet, i.e. it's the first error) */
6348 if (known_errorHandler==-1) {
6349 if ((errors==NULL) || (!strcmp(errors, "strict")))
6350 known_errorHandler = 1;
6351 else if (!strcmp(errors, "replace"))
6352 known_errorHandler = 2;
6353 else if (!strcmp(errors, "ignore"))
6354 known_errorHandler = 3;
6355 else if (!strcmp(errors, "xmlcharrefreplace"))
6356 known_errorHandler = 4;
6357 else
6358 known_errorHandler = 0;
6359 }
6360 switch (known_errorHandler) {
6361 case 1: /* strict */
6362 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6363 goto onError;
6364 case 2: /* replace */
6365 for (p = collstart; p < collend; ++p)
6366 *output++ = '?';
6367 /* fall through */
6368 case 3: /* ignore */
6369 p = collend;
6370 break;
6371 case 4: /* xmlcharrefreplace */
6372 /* generate replacement (temporarily (mis)uses p) */
6373 for (p = collstart; p < collend; ++p)
6374 output += sprintf(output, "&#%d;", (int)*p);
6375 p = collend;
6376 break;
6377 default:
6378 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6379 encoding, reason, s, length, &exc,
6380 collstart-s, collend-s, &newpos);
6381 if (repunicode == NULL)
6382 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006383 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006384 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006385 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6386 Py_DECREF(repunicode);
6387 goto onError;
6388 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 /* generate replacement */
6390 repsize = PyUnicode_GET_SIZE(repunicode);
6391 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6392 Py_UNICODE ch = *uni2;
6393 if (Py_UNICODE_ISSPACE(ch))
6394 *output++ = ' ';
6395 else {
6396 decimal = Py_UNICODE_TODECIMAL(ch);
6397 if (decimal >= 0)
6398 *output++ = '0' + decimal;
6399 else if (0 < ch && ch < 256)
6400 *output++ = (char)ch;
6401 else {
6402 Py_DECREF(repunicode);
6403 raise_encode_exception(&exc, encoding,
6404 s, length, collstart-s, collend-s, reason);
6405 goto onError;
6406 }
6407 }
6408 }
6409 p = s + newpos;
6410 Py_DECREF(repunicode);
6411 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006412 }
6413 /* 0-terminate the output string */
6414 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 Py_XDECREF(exc);
6416 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006417 return 0;
6418
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 Py_XDECREF(exc);
6421 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006422 return -1;
6423}
6424
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425/* --- Helpers ------------------------------------------------------------ */
6426
Eric Smith8c663262007-08-25 02:26:07 +00006427#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006428#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006429
Thomas Wouters477c8d52006-05-27 19:21:47 +00006430#include "stringlib/count.h"
6431#include "stringlib/find.h"
6432#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006433#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006434
Eric Smith5807c412008-05-11 21:00:57 +00006435#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006436#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006437#include "stringlib/localeutil.h"
6438
Thomas Wouters477c8d52006-05-27 19:21:47 +00006439/* Helper macro to fix up start/end slice values: `end` is clamped to at
   most `len`; a negative `start` or `end` has `len` added to it and is
   then floored at 0.  The macro assigns to `start` and `end`, so both
   must be modifiable lvalues, and every argument is evaluated more than
   once.  The expansion is a bare sequence of if-statements (it is not
   wrapped in do { } while (0)), so use it only as a standalone
   statement. */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006440#define ADJUST_INDICES(start, end, len) \
6441 if (end > len) \
6442 end = len; \
6443 else if (end < 0) { \
6444 end += len; \
6445 if (end < 0) \
6446 end = 0; \
6447 } \
6448 if (start < 0) { \
6449 start += len; \
6450 if (start < 0) \
6451 start = 0; \
6452 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006453
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * to by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged: a pair is only
 * combined when both of its halves lie before 'end'.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.  On narrow builds the returned char is a Py_UCS4.
 *
 * Note: the macro advances ptr to the next char, so it might have
 * side-effects (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
/* The bounds test comes first so that (ptr)[1] is only examined when it
   lies inside [ptr, end). */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((((ptr) + 1) < (end)) &&                                         \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
Martin v. Löwis18e16552006-02-15 17:27:45 +00006485Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006486 PyObject *substr,
6487 Py_ssize_t start,
6488 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006490 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006491 PyUnicodeObject* str_obj;
6492 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006493
Thomas Wouters477c8d52006-05-27 19:21:47 +00006494 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6495 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006497 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6498 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 Py_DECREF(str_obj);
6500 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 }
Tim Petersced69f82003-09-16 20:30:58 +00006502
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006503 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006504 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006505 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6506 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006507 );
6508
6509 Py_DECREF(sub_obj);
6510 Py_DECREF(str_obj);
6511
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 return result;
6513}
6514
Martin v. Löwis18e16552006-02-15 17:27:45 +00006515Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516 PyObject *sub,
6517 Py_ssize_t start,
6518 Py_ssize_t end,
6519 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006521 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006522
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006524 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006526 sub = PyUnicode_FromObject(sub);
6527 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 Py_DECREF(str);
6529 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
Tim Petersced69f82003-09-16 20:30:58 +00006531
Thomas Wouters477c8d52006-05-27 19:21:47 +00006532 if (direction > 0)
6533 result = stringlib_find_slice(
6534 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6535 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6536 start, end
6537 );
6538 else
6539 result = stringlib_rfind_slice(
6540 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6541 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6542 start, end
6543 );
6544
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006546 Py_DECREF(sub);
6547
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return result;
6549}
6550
Tim Petersced69f82003-09-16 20:30:58 +00006551static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 PyUnicodeObject *substring,
6554 Py_ssize_t start,
6555 Py_ssize_t end,
6556 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 if (substring->length == 0)
6559 return 1;
6560
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006561 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 end -= substring->length;
6563 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565
6566 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 if (Py_UNICODE_MATCH(self, end, substring))
6568 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 } else {
6570 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 }
6573
6574 return 0;
6575}
6576
Martin v. Löwis18e16552006-02-15 17:27:45 +00006577Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 PyObject *substr,
6579 Py_ssize_t start,
6580 Py_ssize_t end,
6581 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006583 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006584
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 str = PyUnicode_FromObject(str);
6586 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 substr = PyUnicode_FromObject(substr);
6589 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 Py_DECREF(str);
6591 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 }
Tim Petersced69f82003-09-16 20:30:58 +00006593
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 (PyUnicodeObject *)substr,
6596 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 Py_DECREF(str);
6598 Py_DECREF(substr);
6599 return result;
6600}
6601
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602/* Apply fixfct filter to the Unicode object self and return a
6603 reference to the modified object */
6604
Tim Petersced69f82003-09-16 20:30:58 +00006605static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608{
6609
6610 PyUnicodeObject *u;
6611
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006612 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006615
6616 Py_UNICODE_COPY(u->str, self->str, self->length);
6617
Tim Peters7a29bd52001-09-12 03:03:31 +00006618 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 /* fixfct should return TRUE if it modified the buffer. If
6620 FALSE, return a reference to the original buffer instead
6621 (to save space, not time) */
6622 Py_INCREF(self);
6623 Py_DECREF(u);
6624 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 }
6626 return (PyObject*) u;
6627}
6628
Tim Petersced69f82003-09-16 20:30:58 +00006629static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630int fixupper(PyUnicodeObject *self)
6631{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006632 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 Py_UNICODE *s = self->str;
6634 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006635
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006638
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 ch = Py_UNICODE_TOUPPER(*s);
6640 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 *s = ch;
6643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 s++;
6645 }
6646
6647 return status;
6648}
6649
Tim Petersced69f82003-09-16 20:30:58 +00006650static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651int fixlower(PyUnicodeObject *self)
6652{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006653 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 Py_UNICODE *s = self->str;
6655 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006656
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006659
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 ch = Py_UNICODE_TOLOWER(*s);
6661 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 *s = ch;
6664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 s++;
6666 }
6667
6668 return status;
6669}
6670
Tim Petersced69f82003-09-16 20:30:58 +00006671static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672int fixswapcase(PyUnicodeObject *self)
6673{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006674 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 Py_UNICODE *s = self->str;
6676 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006677
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 while (len-- > 0) {
6679 if (Py_UNICODE_ISUPPER(*s)) {
6680 *s = Py_UNICODE_TOLOWER(*s);
6681 status = 1;
6682 } else if (Py_UNICODE_ISLOWER(*s)) {
6683 *s = Py_UNICODE_TOUPPER(*s);
6684 status = 1;
6685 }
6686 s++;
6687 }
6688
6689 return status;
6690}
6691
Tim Petersced69f82003-09-16 20:30:58 +00006692static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693int fixcapitalize(PyUnicodeObject *self)
6694{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006695 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006696 Py_UNICODE *s = self->str;
6697 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006698
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006699 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 return 0;
Ezio Melottiee8d9982011-08-15 09:09:57 +03006701 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 *s = Py_UNICODE_TOUPPER(*s);
6703 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006705 s++;
6706 while (--len > 0) {
Ezio Melottiee8d9982011-08-15 09:09:57 +03006707 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006708 *s = Py_UNICODE_TOLOWER(*s);
6709 status = 1;
6710 }
6711 s++;
6712 }
6713 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
6716static
6717int fixtitle(PyUnicodeObject *self)
6718{
6719 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6720 register Py_UNICODE *e;
6721 int previous_is_cased;
6722
6723 /* Shortcut for single character strings */
6724 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6726 if (*p != ch) {
6727 *p = ch;
6728 return 1;
6729 }
6730 else
6731 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 }
Tim Petersced69f82003-09-16 20:30:58 +00006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 e = p + PyUnicode_GET_SIZE(self);
6735 previous_is_cased = 0;
6736 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006738
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 if (previous_is_cased)
6740 *p = Py_UNICODE_TOLOWER(ch);
6741 else
6742 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006743
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 if (Py_UNICODE_ISLOWER(ch) ||
6745 Py_UNICODE_ISUPPER(ch) ||
6746 Py_UNICODE_ISTITLE(ch))
6747 previous_is_cased = 1;
6748 else
6749 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 }
6751 return 1;
6752}
6753
Tim Peters8ce9f162004-08-27 01:49:32 +00006754PyObject *
6755PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
Skip Montanaro6543b452004-09-16 03:28:13 +00006757 const Py_UNICODE blank = ' ';
6758 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006759 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006760 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006761 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6762 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006763 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6764 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006765 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006766 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767
Tim Peters05eba1f2004-08-27 21:32:02 +00006768 fseq = PySequence_Fast(seq, "");
6769 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006770 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006771 }
6772
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006773 /* NOTE: the following code can't call back into Python code,
6774 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006775 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006776
Tim Peters05eba1f2004-08-27 21:32:02 +00006777 seqlen = PySequence_Fast_GET_SIZE(fseq);
6778 /* If empty sequence, return u"". */
6779 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006780 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6781 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006782 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006783 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006784 /* If singleton sequence with an exact Unicode, return that. */
6785 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 item = items[0];
6787 if (PyUnicode_CheckExact(item)) {
6788 Py_INCREF(item);
6789 res = (PyUnicodeObject *)item;
6790 goto Done;
6791 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006792 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006793 else {
6794 /* Set up sep and seplen */
6795 if (separator == NULL) {
6796 sep = &blank;
6797 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006798 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006799 else {
6800 if (!PyUnicode_Check(separator)) {
6801 PyErr_Format(PyExc_TypeError,
6802 "separator: expected str instance,"
6803 " %.80s found",
6804 Py_TYPE(separator)->tp_name);
6805 goto onError;
6806 }
6807 sep = PyUnicode_AS_UNICODE(separator);
6808 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006809 }
6810 }
6811
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006812 /* There are at least two things to join, or else we have a subclass
6813 * of str in the sequence.
6814 * Do a pre-pass to figure out the total amount of space we'll
6815 * need (sz), and see whether all argument are strings.
6816 */
6817 sz = 0;
6818 for (i = 0; i < seqlen; i++) {
6819 const Py_ssize_t old_sz = sz;
6820 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 if (!PyUnicode_Check(item)) {
6822 PyErr_Format(PyExc_TypeError,
6823 "sequence item %zd: expected str instance,"
6824 " %.80s found",
6825 i, Py_TYPE(item)->tp_name);
6826 goto onError;
6827 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006828 sz += PyUnicode_GET_SIZE(item);
6829 if (i != 0)
6830 sz += seplen;
6831 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6832 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006834 goto onError;
6835 }
6836 }
Tim Petersced69f82003-09-16 20:30:58 +00006837
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006838 res = _PyUnicode_New(sz);
6839 if (res == NULL)
6840 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006841
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006842 /* Catenate everything. */
6843 res_p = PyUnicode_AS_UNICODE(res);
6844 for (i = 0; i < seqlen; ++i) {
6845 Py_ssize_t itemlen;
6846 item = items[i];
6847 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 /* Copy item, and maybe the separator. */
6849 if (i) {
6850 Py_UNICODE_COPY(res_p, sep, seplen);
6851 res_p += seplen;
6852 }
6853 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6854 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006855 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006856
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006858 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 return (PyObject *)res;
6860
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006862 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006863 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 return NULL;
6865}
6866
Tim Petersced69f82003-09-16 20:30:58 +00006867static
6868PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 Py_ssize_t left,
6870 Py_ssize_t right,
6871 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872{
6873 PyUnicodeObject *u;
6874
6875 if (left < 0)
6876 left = 0;
6877 if (right < 0)
6878 right = 0;
6879
Tim Peters7a29bd52001-09-12 03:03:31 +00006880 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 Py_INCREF(self);
6882 return self;
6883 }
6884
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006885 if (left > PY_SSIZE_T_MAX - self->length ||
6886 right > PY_SSIZE_T_MAX - (left + self->length)) {
6887 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6888 return NULL;
6889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 u = _PyUnicode_New(left + self->length + right);
6891 if (u) {
6892 if (left)
6893 Py_UNICODE_FILL(u->str, fill, left);
6894 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6895 if (right)
6896 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6897 }
6898
6899 return u;
6900}
6901
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006902PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905
6906 string = PyUnicode_FromObject(string);
6907 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006910 list = stringlib_splitlines(
6911 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6912 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913
6914 Py_DECREF(string);
6915 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916}
6917
Tim Petersced69f82003-09-16 20:30:58 +00006918static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 PyUnicodeObject *substring,
6921 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006924 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006927 return stringlib_split_whitespace(
6928 (PyObject*) self, self->str, self->length, maxcount
6929 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006931 return stringlib_split(
6932 (PyObject*) self, self->str, self->length,
6933 substring->str, substring->length,
6934 maxcount
6935 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
Tim Petersced69f82003-09-16 20:30:58 +00006938static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006939PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 PyUnicodeObject *substring,
6941 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006942{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006943 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006944 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006945
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006946 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006947 return stringlib_rsplit_whitespace(
6948 (PyObject*) self, self->str, self->length, maxcount
6949 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006950
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006951 return stringlib_rsplit(
6952 (PyObject*) self, self->str, self->length,
6953 substring->str, substring->length,
6954 maxcount
6955 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006956}
6957
6958static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006960 PyUnicodeObject *str1,
6961 PyUnicodeObject *str2,
6962 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963{
6964 PyUnicodeObject *u;
6965
6966 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006968 else if (maxcount == 0 || self->length == 0)
6969 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
Thomas Wouters477c8d52006-05-27 19:21:47 +00006971 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006972 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006973 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006974 if (str1->length == 0)
6975 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006976 if (str1->length == 1) {
6977 /* replace characters */
6978 Py_UNICODE u1, u2;
6979 if (!findchar(self->str, self->length, str1->str[0]))
6980 goto nothing;
6981 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6982 if (!u)
6983 return NULL;
6984 Py_UNICODE_COPY(u->str, self->str, self->length);
6985 u1 = str1->str[0];
6986 u2 = str2->str[0];
6987 for (i = 0; i < u->length; i++)
6988 if (u->str[i] == u1) {
6989 if (--maxcount < 0)
6990 break;
6991 u->str[i] = u2;
6992 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006994 i = stringlib_find(
6995 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006997 if (i < 0)
6998 goto nothing;
6999 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7000 if (!u)
7001 return NULL;
7002 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007003
7004 /* change everything in-place, starting with this one */
7005 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7006 i += str1->length;
7007
7008 while ( --maxcount > 0) {
7009 i = stringlib_find(self->str+i, self->length-i,
7010 str1->str, str1->length,
7011 i);
7012 if (i == -1)
7013 break;
7014 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7015 i += str1->length;
7016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007019
Victor Stinnerab1d16b2011-11-22 01:45:37 +01007020 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007021 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 Py_UNICODE *p;
7023
7024 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007025 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7026 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007027 if (n == 0)
7028 goto nothing;
7029 /* new_size = self->length + n * (str2->length - str1->length)); */
7030 delta = (str2->length - str1->length);
7031 if (delta == 0) {
7032 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007034 product = n * (str2->length - str1->length);
7035 if ((product / (str2->length - str1->length)) != n) {
7036 PyErr_SetString(PyExc_OverflowError,
7037 "replace string is too long");
7038 return NULL;
7039 }
7040 new_size = self->length + product;
7041 if (new_size < 0) {
7042 PyErr_SetString(PyExc_OverflowError,
7043 "replace string is too long");
7044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 }
7046 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007047 u = _PyUnicode_New(new_size);
7048 if (!u)
7049 return NULL;
7050 i = 0;
7051 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007052 if (str1->length > 0) {
7053 while (n-- > 0) {
7054 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007055 j = stringlib_find(self->str+i, self->length-i,
7056 str1->str, str1->length,
7057 i);
7058 if (j == -1)
7059 break;
7060 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007061 /* copy unchanged part [i:j] */
7062 Py_UNICODE_COPY(p, self->str+i, j-i);
7063 p += j - i;
7064 }
7065 /* copy substitution string */
7066 if (str2->length > 0) {
7067 Py_UNICODE_COPY(p, str2->str, str2->length);
7068 p += str2->length;
7069 }
7070 i = j + str1->length;
7071 }
7072 if (i < self->length)
7073 /* copy tail [i:] */
7074 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7075 } else {
7076 /* interleave */
7077 while (n > 0) {
7078 Py_UNICODE_COPY(p, str2->str, str2->length);
7079 p += str2->length;
7080 if (--n <= 0)
7081 break;
7082 *p++ = self->str[i++];
7083 }
7084 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7085 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007088
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007090 /* nothing to replace; return original string (when possible) */
7091 if (PyUnicode_CheckExact(self)) {
7092 Py_INCREF(self);
7093 return (PyObject *) self;
7094 }
7095 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096}
7097
7098/* --- Unicode Object Methods --------------------------------------------- */
7099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007100PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102\n\
7103Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105
7106static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007107unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 return fixup(self, fixtitle);
7110}
7111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007112PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114\n\
7115Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007116have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117
7118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007119unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121 return fixup(self, fixcapitalize);
7122}
7123
#if 0
/* Disabled: this method is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
7161
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007162/* Argument converter. Coerces to a single unicode character */
7163
7164static int
7165convert_uc(PyObject *obj, void *addr)
7166{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007167 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7168 PyObject *uniobj;
7169 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007170
Benjamin Peterson14339b62009-01-31 16:36:08 +00007171 uniobj = PyUnicode_FromObject(obj);
7172 if (uniobj == NULL) {
7173 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007175 return 0;
7176 }
7177 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7178 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007180 Py_DECREF(uniobj);
7181 return 0;
7182 }
7183 unistr = PyUnicode_AS_UNICODE(uniobj);
7184 *fillcharloc = unistr[0];
7185 Py_DECREF(uniobj);
7186 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007187}
7188
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007189PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007192Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007193done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194
7195static PyObject *
7196unicode_center(PyUnicodeObject *self, PyObject *args)
7197{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007198 Py_ssize_t marg, left;
7199 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007200 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201
Thomas Woutersde017742006-02-16 19:34:37 +00007202 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 return NULL;
7204
Tim Peters7a29bd52001-09-12 03:03:31 +00007205 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 Py_INCREF(self);
7207 return (PyObject*) self;
7208 }
7209
7210 marg = width - self->length;
7211 left = marg / 2 + (marg & width & 1);
7212
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007213 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214}
7215
Marc-André Lemburge5034372000-08-08 08:04:29 +00007216#if 0
7217
7218/* This code should go into some future Unicode collation support
7219 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007220 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007221
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007222/* speedy UTF-16 code point order comparison */
7223/* gleaned from: */
7224/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7225
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007226static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007227{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007228 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007229 0, 0, 0, 0, 0, 0, 0, 0,
7230 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007231 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007232};
7233
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234static int
7235unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7236{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007237 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007238
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 Py_UNICODE *s1 = str1->str;
7240 Py_UNICODE *s2 = str2->str;
7241
7242 len1 = str1->length;
7243 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007244
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007246 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007247
7248 c1 = *s1++;
7249 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007250
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 if (c1 > (1<<11) * 26)
7252 c1 += utf16Fixup[c1>>11];
7253 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007254 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007255 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007256
7257 if (c1 != c2)
7258 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007259
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007260 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 }
7262
7263 return (len1 < len2) ? -1 : (len1 != len2);
7264}
7265
Marc-André Lemburge5034372000-08-08 08:04:29 +00007266#else
7267
7268static int
7269unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7270{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007271 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007272
7273 Py_UNICODE *s1 = str1->str;
7274 Py_UNICODE *s2 = str2->str;
7275
7276 len1 = str1->length;
7277 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007278
Marc-André Lemburge5034372000-08-08 08:04:29 +00007279 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007280 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007281
Fredrik Lundh45714e92001-06-26 16:39:36 +00007282 c1 = *s1++;
7283 c2 = *s2++;
7284
7285 if (c1 != c2)
7286 return (c1 < c2) ? -1 : 1;
7287
Marc-André Lemburge5034372000-08-08 08:04:29 +00007288 len1--; len2--;
7289 }
7290
7291 return (len1 < len2) ? -1 : (len1 != len2);
7292}
7293
7294#endif
7295
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007299 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7300 return unicode_compare((PyUnicodeObject *)left,
7301 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007302 PyErr_Format(PyExc_TypeError,
7303 "Can't compare %.100s and %.100s",
7304 left->ob_type->tp_name,
7305 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306 return -1;
7307}
7308
Martin v. Löwis5b222132007-06-10 09:51:05 +00007309int
7310PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7311{
7312 int i;
7313 Py_UNICODE *id;
7314 assert(PyUnicode_Check(uni));
7315 id = PyUnicode_AS_UNICODE(uni);
7316 /* Compare Unicode string and source character set string */
7317 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 if (id[i] != str[i])
7319 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007320 /* This check keeps Python strings that end in '\0' from comparing equal
7321 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007322 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007324 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007326 return 0;
7327}
7328
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007329
Benjamin Peterson29060642009-01-31 22:14:21 +00007330#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007331 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007332
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007333PyObject *PyUnicode_RichCompare(PyObject *left,
7334 PyObject *right,
7335 int op)
7336{
7337 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007338
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007339 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7340 PyObject *v;
7341 if (((PyUnicodeObject *) left)->length !=
7342 ((PyUnicodeObject *) right)->length) {
7343 if (op == Py_EQ) {
7344 Py_INCREF(Py_False);
7345 return Py_False;
7346 }
7347 if (op == Py_NE) {
7348 Py_INCREF(Py_True);
7349 return Py_True;
7350 }
7351 }
7352 if (left == right)
7353 result = 0;
7354 else
7355 result = unicode_compare((PyUnicodeObject *)left,
7356 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007357
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007358 /* Convert the return value to a Boolean */
7359 switch (op) {
7360 case Py_EQ:
7361 v = TEST_COND(result == 0);
7362 break;
7363 case Py_NE:
7364 v = TEST_COND(result != 0);
7365 break;
7366 case Py_LE:
7367 v = TEST_COND(result <= 0);
7368 break;
7369 case Py_GE:
7370 v = TEST_COND(result >= 0);
7371 break;
7372 case Py_LT:
7373 v = TEST_COND(result == -1);
7374 break;
7375 case Py_GT:
7376 v = TEST_COND(result == 1);
7377 break;
7378 default:
7379 PyErr_BadArgument();
7380 return NULL;
7381 }
7382 Py_INCREF(v);
7383 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007385
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007386 Py_INCREF(Py_NotImplemented);
7387 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007388}
7389
Guido van Rossum403d68b2000-03-13 15:55:09 +00007390int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007392{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007393 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007394 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007395
7396 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007397 sub = PyUnicode_FromObject(element);
7398 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 PyErr_Format(PyExc_TypeError,
7400 "'in <string>' requires string as left operand, not %s",
7401 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007402 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007403 }
7404
Thomas Wouters477c8d52006-05-27 19:21:47 +00007405 str = PyUnicode_FromObject(container);
7406 if (!str) {
7407 Py_DECREF(sub);
7408 return -1;
7409 }
7410
7411 result = stringlib_contains_obj(str, sub);
7412
7413 Py_DECREF(str);
7414 Py_DECREF(sub);
7415
Guido van Rossum403d68b2000-03-13 15:55:09 +00007416 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007417}
7418
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419/* Concat to string or Unicode object giving a new Unicode object. */
7420
7421PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423{
7424 PyUnicodeObject *u = NULL, *v = NULL, *w;
7425
7426 /* Coerce the two arguments */
7427 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7428 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7431 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433
7434 /* Shortcuts */
7435 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 Py_DECREF(v);
7437 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 }
7439 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 Py_DECREF(u);
7441 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442 }
7443
7444 /* Concat the two Unicode strings */
7445 w = _PyUnicode_New(u->length + v->length);
7446 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 Py_UNICODE_COPY(w->str, u->str, u->length);
7449 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7450
7451 Py_DECREF(u);
7452 Py_DECREF(v);
7453 return (PyObject *)w;
7454
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 Py_XDECREF(u);
7457 Py_XDECREF(v);
7458 return NULL;
7459}
7460
Walter Dörwald1ab83302007-05-18 17:15:44 +00007461void
7462PyUnicode_Append(PyObject **pleft, PyObject *right)
7463{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007464 PyObject *new;
7465 if (*pleft == NULL)
7466 return;
7467 if (right == NULL || !PyUnicode_Check(*pleft)) {
7468 Py_DECREF(*pleft);
7469 *pleft = NULL;
7470 return;
7471 }
7472 new = PyUnicode_Concat(*pleft, right);
7473 Py_DECREF(*pleft);
7474 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007475}
7476
7477void
7478PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7479{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007480 PyUnicode_Append(pleft, right);
7481 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007482}
7483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007484PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007487Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007488string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007489interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490
7491static PyObject *
7492unicode_count(PyUnicodeObject *self, PyObject *args)
7493{
7494 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007495 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007496 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497 PyObject *result;
7498
Jesus Ceaac451502011-04-20 17:09:23 +02007499 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7500 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007502
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007503 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007504 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007505 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007506 substring->str, substring->length,
7507 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007508 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509
7510 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007511
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 return result;
7513}
7514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007515PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007516 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007518Encode S using the codec registered for encoding. Default encoding\n\
7519is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007520handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007521a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7522'xmlcharrefreplace' as well as any other name registered with\n\
7523codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
7525static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007526unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007528 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 char *encoding = NULL;
7530 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007531
Benjamin Peterson308d6372009-09-18 21:42:35 +00007532 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7533 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007535 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007536}
7537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007538PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540\n\
7541Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007542If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
7544static PyObject*
7545unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7546{
7547 Py_UNICODE *e;
7548 Py_UNICODE *p;
7549 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007550 Py_UNICODE *qe;
7551 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 PyUnicodeObject *u;
7553 int tabsize = 8;
7554
7555 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557
Thomas Wouters7e474022000-07-16 12:04:32 +00007558 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007559 i = 0; /* chars up to and including most recent \n or \r */
7560 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7561 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 for (p = self->str; p < e; p++)
7563 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 if (tabsize > 0) {
7565 incr = tabsize - (j % tabsize); /* cannot overflow */
7566 if (j > PY_SSIZE_T_MAX - incr)
7567 goto overflow1;
7568 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007569 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 if (j > PY_SSIZE_T_MAX - 1)
7573 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 j++;
7575 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 if (i > PY_SSIZE_T_MAX - j)
7577 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007579 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 }
7581 }
7582
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007583 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007585
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 /* Second pass: create output string and fill it */
7587 u = _PyUnicode_New(i + j);
7588 if (!u)
7589 return NULL;
7590
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007591 j = 0; /* same as in first pass */
7592 q = u->str; /* next output char */
7593 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594
7595 for (p = self->str; p < e; p++)
7596 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 if (tabsize > 0) {
7598 i = tabsize - (j % tabsize);
7599 j += i;
7600 while (i--) {
7601 if (q >= qe)
7602 goto overflow2;
7603 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007604 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 else {
7608 if (q >= qe)
7609 goto overflow2;
7610 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007611 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 if (*p == '\n' || *p == '\r')
7613 j = 0;
7614 }
7615
7616 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007617
7618 overflow2:
7619 Py_DECREF(u);
7620 overflow1:
7621 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623}
7624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007625PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627\n\
7628Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08007629such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630arguments start and end are interpreted as in slice notation.\n\
7631\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007632Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
7634static PyObject *
7635unicode_find(PyUnicodeObject *self, PyObject *args)
7636{
Jesus Ceaac451502011-04-20 17:09:23 +02007637 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007638 Py_ssize_t start;
7639 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007640 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
Jesus Ceaac451502011-04-20 17:09:23 +02007642 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7643 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645
Thomas Wouters477c8d52006-05-27 19:21:47 +00007646 result = stringlib_find_slice(
7647 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7648 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7649 start, end
7650 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651
7652 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007653
Christian Heimes217cfd12007-12-02 14:31:20 +00007654 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655}
7656
7657static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007658unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659{
7660 if (index < 0 || index >= self->length) {
7661 PyErr_SetString(PyExc_IndexError, "string index out of range");
7662 return NULL;
7663 }
7664
7665 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7666}
7667
Guido van Rossumc2504932007-09-18 19:42:40 +00007668/* Believe it or not, this produces the same value for ASCII strings
7669 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007670static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007671unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672{
Guido van Rossumc2504932007-09-18 19:42:40 +00007673 Py_ssize_t len;
7674 Py_UNICODE *p;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -08007675 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +00007676
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007677#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -05007678 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007679#endif
Guido van Rossumc2504932007-09-18 19:42:40 +00007680 if (self->hash != -1)
7681 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007682 len = Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007683 /*
7684 We make the hash of the empty string be 0, rather than using
7685 (prefix ^ suffix), since this slightly obfuscates the hash secret
7686 */
7687 if (len == 0) {
7688 self->hash = 0;
7689 return 0;
7690 }
Guido van Rossumc2504932007-09-18 19:42:40 +00007691 p = self->str;
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007692 x = _Py_HashSecret.prefix;
7693 x ^= *p << 7;
Guido van Rossumc2504932007-09-18 19:42:40 +00007694 while (--len >= 0)
Gregory P. Smith63e6c322012-01-14 15:31:34 -08007695 x = (_PyHASH_MULTIPLIER*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007696 x ^= Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007697 x ^= _Py_HashSecret.suffix;
Guido van Rossumc2504932007-09-18 19:42:40 +00007698 if (x == -1)
7699 x = -2;
7700 self->hash = x;
7701 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702}
7703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007704PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007707Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708
7709static PyObject *
7710unicode_index(PyUnicodeObject *self, PyObject *args)
7711{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007712 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007713 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007714 Py_ssize_t start;
7715 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716
Jesus Ceaac451502011-04-20 17:09:23 +02007717 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7718 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720
Thomas Wouters477c8d52006-05-27 19:21:47 +00007721 result = stringlib_find_slice(
7722 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7723 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7724 start, end
7725 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
7727 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007728
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 if (result < 0) {
7730 PyErr_SetString(PyExc_ValueError, "substring not found");
7731 return NULL;
7732 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007733
Christian Heimes217cfd12007-12-02 14:31:20 +00007734 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735}
7736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007737PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007740Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742
7743static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007744unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745{
7746 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7747 register const Py_UNICODE *e;
7748 int cased;
7749
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 /* Shortcut for single character strings */
7751 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007754 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007755 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007757
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 e = p + PyUnicode_GET_SIZE(self);
7759 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007760 while (p < e) {
7761 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007762
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7764 return PyBool_FromLong(0);
7765 else if (!cased && Py_UNICODE_ISLOWER(ch))
7766 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007768 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769}
7770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007771PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007774Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007775at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
7777static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007778unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779{
7780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7781 register const Py_UNICODE *e;
7782 int cased;
7783
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 /* Shortcut for single character strings */
7785 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007788 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007789 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007791
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 e = p + PyUnicode_GET_SIZE(self);
7793 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007794 while (p < e) {
7795 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007796
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7798 return PyBool_FromLong(0);
7799 else if (!cased && Py_UNICODE_ISUPPER(ch))
7800 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007802 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803}
7804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007805PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007808Return True if S is a titlecased string and there is at least one\n\
7809character in S, i.e. upper- and titlecase characters may only\n\
7810follow uncased characters and lowercase characters only cased ones.\n\
7811Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
7813static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007814unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815{
7816 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7817 register const Py_UNICODE *e;
7818 int cased, previous_is_cased;
7819
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820 /* Shortcut for single character strings */
7821 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7823 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007825 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007826 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007828
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829 e = p + PyUnicode_GET_SIZE(self);
7830 cased = 0;
7831 previous_is_cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007832 while (p < e) {
7833 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007834
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7836 if (previous_is_cased)
7837 return PyBool_FromLong(0);
7838 previous_is_cased = 1;
7839 cased = 1;
7840 }
7841 else if (Py_UNICODE_ISLOWER(ch)) {
7842 if (!previous_is_cased)
7843 return PyBool_FromLong(0);
7844 previous_is_cased = 1;
7845 cased = 1;
7846 }
7847 else
7848 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007850 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851}
7852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007853PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007856Return True if all characters in S are whitespace\n\
7857and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858
7859static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007860unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861{
7862 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7863 register const Py_UNICODE *e;
7864
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 /* Shortcut for single character strings */
7866 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 Py_UNICODE_ISSPACE(*p))
7868 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007870 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007871 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007873
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007875 while (p < e) {
7876 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7877 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007880 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881}
7882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007883PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007885\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007886Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007887and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007888
7889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007890unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007891{
7892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7893 register const Py_UNICODE *e;
7894
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007895 /* Shortcut for single character strings */
7896 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 Py_UNICODE_ISALPHA(*p))
7898 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007899
7900 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007901 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007903
7904 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007905 while (p < e) {
7906 if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007908 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007909 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007910}
7911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007912PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007914\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007915Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007916and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007917
7918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007919unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007920{
7921 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7922 register const Py_UNICODE *e;
7923
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007924 /* Shortcut for single character strings */
7925 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 Py_UNICODE_ISALNUM(*p))
7927 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007928
7929 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007930 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007932
7933 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007934 while (p < e) {
7935 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7936 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007938 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007939 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007940}
7941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007942PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007945Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007946False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947
7948static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007949unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950{
7951 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7952 register const Py_UNICODE *e;
7953
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954 /* Shortcut for single character strings */
7955 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 Py_UNICODE_ISDECIMAL(*p))
7957 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007959 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007960 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007962
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007964 while (p < e) {
7965 if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007968 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969}
7970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007971PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007974Return True if all characters in S are digits\n\
7975and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976
7977static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007978unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979{
7980 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7981 register const Py_UNICODE *e;
7982
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 /* Shortcut for single character strings */
7984 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 Py_UNICODE_ISDIGIT(*p))
7986 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007988 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007989 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007991
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007993 while (p < e) {
7994 if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007997 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998}
7999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008000PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008003Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008004False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005
8006static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008007unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008{
8009 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8010 register const Py_UNICODE *e;
8011
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 /* Shortcut for single character strings */
8013 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 Py_UNICODE_ISNUMERIC(*p))
8015 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008017 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008018 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008020
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008022 while (p < e) {
8023 if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008026 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027}
8028
Martin v. Löwis47383402007-08-15 07:32:56 +00008029int
8030PyUnicode_IsIdentifier(PyObject *self)
8031{
Benjamin Petersonf413b802011-08-12 22:17:18 -05008032 const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008033 const Py_UNICODE *e;
8034 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +00008035
8036 /* Special case for empty strings */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008037 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008039
8040 /* PEP 3131 says that the first character must be in
8041 XID_Start and subsequent characters in XID_Continue,
8042 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008043 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008044 letters, digits, underscore). However, given the current
8045 definition of XID_Start and XID_Continue, it is sufficient
8046 to check just for these, except that _ must be allowed
8047 as starting an identifier. */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008048 e = p + PyUnicode_GET_SIZE(self);
8049 first = _Py_UNICODE_NEXT(p, e);
Benjamin Petersonf413b802011-08-12 22:17:18 -05008050 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +00008051 return 0;
8052
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008053 while (p < e)
8054 if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008056 return 1;
8057}
8058
8059PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008061\n\
8062Return True if S is a valid identifier according\n\
8063to the language definition.");
8064
8065static PyObject*
8066unicode_isidentifier(PyObject *self)
8067{
8068 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8069}
8070
Georg Brandl559e5d72008-06-11 18:37:52 +00008071PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008073\n\
8074Return True if all characters in S are considered\n\
8075printable in repr() or S is empty, False otherwise.");
8076
8077static PyObject*
8078unicode_isprintable(PyObject *self)
8079{
8080 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8081 register const Py_UNICODE *e;
8082
8083 /* Shortcut for single character strings */
8084 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8085 Py_RETURN_TRUE;
8086 }
8087
8088 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008089 while (p < e) {
8090 if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
Georg Brandl559e5d72008-06-11 18:37:52 +00008091 Py_RETURN_FALSE;
8092 }
8093 }
8094 Py_RETURN_TRUE;
8095}
8096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008097PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008098 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099\n\
8100Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008101iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102
8103static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008104unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008106 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107}
8108
Martin v. Löwis18e16552006-02-15 17:27:45 +00008109static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110unicode_length(PyUnicodeObject *self)
8111{
8112 return self->length;
8113}
8114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008115PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008118Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008119done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
8121static PyObject *
8122unicode_ljust(PyUnicodeObject *self, PyObject *args)
8123{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008124 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008125 Py_UNICODE fillchar = ' ';
8126
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008127 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 return NULL;
8129
Tim Peters7a29bd52001-09-12 03:03:31 +00008130 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 Py_INCREF(self);
8132 return (PyObject*) self;
8133 }
8134
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008135 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136}
8137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008138PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008141Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142
8143static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008144unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 return fixup(self, fixlower);
8147}
8148
/* Selectors for the strip helpers.  The values double as indices into
   stripformat[], so keep both in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each selector, indexed by the above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for selector i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
8157
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008158/* externally visible for str.strip(unicode) */
8159PyObject *
8160_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8161{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008162 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8163 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8164 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8165 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8166 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008167
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008169
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 i = 0;
8171 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8173 i++;
8174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008176
Benjamin Peterson14339b62009-01-31 16:36:08 +00008177 j = len;
8178 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 do {
8180 j--;
8181 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8182 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008184
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 Py_INCREF(self);
8187 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 }
8189 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008191}
8192
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193
8194static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008195do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008197 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8198 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008199
Benjamin Peterson14339b62009-01-31 16:36:08 +00008200 i = 0;
8201 if (striptype != RIGHTSTRIP) {
8202 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8203 i++;
8204 }
8205 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008206
Benjamin Peterson14339b62009-01-31 16:36:08 +00008207 j = len;
8208 if (striptype != LEFTSTRIP) {
8209 do {
8210 j--;
8211 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8212 j++;
8213 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008214
Benjamin Peterson14339b62009-01-31 16:36:08 +00008215 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8216 Py_INCREF(self);
8217 return (PyObject*)self;
8218 }
8219 else
8220 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221}
8222
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008223
8224static PyObject *
8225do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8226{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008227 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008228
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8230 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008231
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 if (sep != NULL && sep != Py_None) {
8233 if (PyUnicode_Check(sep))
8234 return _PyUnicode_XStrip(self, striptype, sep);
8235 else {
8236 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 "%s arg must be None or str",
8238 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008239 return NULL;
8240 }
8241 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008242
Benjamin Peterson14339b62009-01-31 16:36:08 +00008243 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008244}
8245
8246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008247PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008249\n\
8250Return a copy of the string S with leading and trailing\n\
8251whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008252If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008253
8254static PyObject *
8255unicode_strip(PyUnicodeObject *self, PyObject *args)
8256{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008257 if (PyTuple_GET_SIZE(args) == 0)
8258 return do_strip(self, BOTHSTRIP); /* Common case */
8259 else
8260 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008261}
8262
8263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008264PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008266\n\
8267Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008268If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008269
8270static PyObject *
8271unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8272{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008273 if (PyTuple_GET_SIZE(args) == 0)
8274 return do_strip(self, LEFTSTRIP); /* Common case */
8275 else
8276 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008277}
8278
8279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008280PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008282\n\
8283Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008284If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008285
8286static PyObject *
8287unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8288{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008289 if (PyTuple_GET_SIZE(args) == 0)
8290 return do_strip(self, RIGHTSTRIP); /* Common case */
8291 else
8292 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008293}
8294
8295
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008297unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298{
8299 PyUnicodeObject *u;
8300 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008301 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008302 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303
Serhiy Storchaka05997252013-01-26 12:14:02 +02008304 if (len < 1)
8305 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
Tim Peters7a29bd52001-09-12 03:03:31 +00008307 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 /* no repeat, return original string */
8309 Py_INCREF(str);
8310 return (PyObject*) str;
8311 }
Tim Peters8f422462000-09-09 06:13:41 +00008312
8313 /* ensure # of chars needed doesn't overflow int and # of bytes
8314 * needed doesn't overflow size_t
8315 */
8316 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008317 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008318 PyErr_SetString(PyExc_OverflowError,
8319 "repeated string is too long");
8320 return NULL;
8321 }
8322 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8323 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8324 PyErr_SetString(PyExc_OverflowError,
8325 "repeated string is too long");
8326 return NULL;
8327 }
8328 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 if (!u)
8330 return NULL;
8331
8332 p = u->str;
8333
Georg Brandl222de0f2009-04-12 12:01:50 +00008334 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008335 Py_UNICODE_FILL(p, str->str[0], len);
8336 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008337 Py_ssize_t done = str->length; /* number of characters copied this far */
8338 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008340 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008341 Py_UNICODE_COPY(p+done, p, n);
8342 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 }
8345
8346 return (PyObject*) u;
8347}
8348
8349PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 PyObject *subobj,
8351 PyObject *replobj,
8352 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353{
8354 PyObject *self;
8355 PyObject *str1;
8356 PyObject *str2;
8357 PyObject *result;
8358
8359 self = PyUnicode_FromObject(obj);
8360 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 str1 = PyUnicode_FromObject(subobj);
8363 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 Py_DECREF(self);
8365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 }
8367 str2 = PyUnicode_FromObject(replobj);
8368 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 Py_DECREF(self);
8370 Py_DECREF(str1);
8371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 }
Tim Petersced69f82003-09-16 20:30:58 +00008373 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 (PyUnicodeObject *)str1,
8375 (PyUnicodeObject *)str2,
8376 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 Py_DECREF(self);
8378 Py_DECREF(str1);
8379 Py_DECREF(str2);
8380 return result;
8381}
8382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008383PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008384 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385\n\
8386Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008387old replaced by new. If the optional argument count is\n\
8388given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389
8390static PyObject*
8391unicode_replace(PyUnicodeObject *self, PyObject *args)
8392{
8393 PyUnicodeObject *str1;
8394 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008395 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 PyObject *result;
8397
Martin v. Löwis18e16552006-02-15 17:27:45 +00008398 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 return NULL;
8400 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8401 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008404 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 Py_DECREF(str1);
8406 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408
8409 result = replace(self, str1, str2, maxcount);
8410
8411 Py_DECREF(str1);
8412 Py_DECREF(str2);
8413 return result;
8414}
8415
8416static
8417PyObject *unicode_repr(PyObject *unicode)
8418{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008419 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008420 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008421 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8422 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8423
8424 /* XXX(nnorwitz): rather than over-allocating, it would be
8425 better to choose a different scheme. Perhaps scan the
8426 first N-chars of the string and allocate based on that size.
8427 */
8428 /* Initial allocation is based on the longest-possible unichr
8429 escape.
8430
8431 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8432 unichr, so in this case it's the longest unichr escape. In
8433 narrow (UTF-16) builds this is five chars per source unichr
8434 since there are two unichrs in the surrogate pair, so in narrow
8435 (UTF-16) builds it's not the longest unichr escape.
8436
8437 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8438 so in the narrow (UTF-16) build case it's the longest unichr
8439 escape.
8440 */
8441
Walter Dörwald1ab83302007-05-18 17:15:44 +00008442 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008444#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008446#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008448#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008450 if (repr == NULL)
8451 return NULL;
8452
Walter Dörwald1ab83302007-05-18 17:15:44 +00008453 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008454
8455 /* Add quote */
8456 *p++ = (findchar(s, size, '\'') &&
8457 !findchar(s, size, '"')) ? '"' : '\'';
8458 while (size-- > 0) {
8459 Py_UNICODE ch = *s++;
8460
8461 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008462 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008463 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008464 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008465 continue;
8466 }
8467
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008469 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008470 *p++ = '\\';
8471 *p++ = 't';
8472 }
8473 else if (ch == '\n') {
8474 *p++ = '\\';
8475 *p++ = 'n';
8476 }
8477 else if (ch == '\r') {
8478 *p++ = '\\';
8479 *p++ = 'r';
8480 }
8481
8482 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008483 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008484 *p++ = '\\';
8485 *p++ = 'x';
8486 *p++ = hexdigits[(ch >> 4) & 0x000F];
8487 *p++ = hexdigits[ch & 0x000F];
8488 }
8489
Georg Brandl559e5d72008-06-11 18:37:52 +00008490 /* Copy ASCII characters as-is */
8491 else if (ch < 0x7F) {
8492 *p++ = ch;
8493 }
8494
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008496 else {
8497 Py_UCS4 ucs = ch;
8498
8499#ifndef Py_UNICODE_WIDE
8500 Py_UNICODE ch2 = 0;
8501 /* Get code point from surrogate pair */
8502 if (size > 0) {
8503 ch2 = *s;
8504 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008508 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008509 size--;
8510 }
8511 }
8512#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008514 (categories Z* and C* except ASCII space)
8515 */
8516 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8517 /* Map 8-bit characters to '\xhh' */
8518 if (ucs <= 0xff) {
8519 *p++ = '\\';
8520 *p++ = 'x';
8521 *p++ = hexdigits[(ch >> 4) & 0x000F];
8522 *p++ = hexdigits[ch & 0x000F];
8523 }
8524 /* Map 21-bit characters to '\U00xxxxxx' */
8525 else if (ucs >= 0x10000) {
8526 *p++ = '\\';
8527 *p++ = 'U';
8528 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8529 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8530 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8531 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8532 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8533 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8534 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8535 *p++ = hexdigits[ucs & 0x0000000F];
8536 }
8537 /* Map 16-bit characters to '\uxxxx' */
8538 else {
8539 *p++ = '\\';
8540 *p++ = 'u';
8541 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8542 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8543 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8544 *p++ = hexdigits[ucs & 0x000F];
8545 }
8546 }
8547 /* Copy characters as-is */
8548 else {
8549 *p++ = ch;
8550#ifndef Py_UNICODE_WIDE
8551 if (ucs >= 0x10000)
8552 *p++ = ch2;
8553#endif
8554 }
8555 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008556 }
8557 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008558 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008559
8560 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008561 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008562 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563}
8564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008565PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567\n\
8568Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08008569such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570arguments start and end are interpreted as in slice notation.\n\
8571\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008572Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573
8574static PyObject *
8575unicode_rfind(PyUnicodeObject *self, PyObject *args)
8576{
Jesus Ceaac451502011-04-20 17:09:23 +02008577 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008578 Py_ssize_t start;
8579 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008580 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581
Jesus Ceaac451502011-04-20 17:09:23 +02008582 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8583 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
Thomas Wouters477c8d52006-05-27 19:21:47 +00008586 result = stringlib_rfind_slice(
8587 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8588 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8589 start, end
8590 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
8592 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008593
Christian Heimes217cfd12007-12-02 14:31:20 +00008594 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595}
8596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008597PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008600Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
8602static PyObject *
8603unicode_rindex(PyUnicodeObject *self, PyObject *args)
8604{
Jesus Ceaac451502011-04-20 17:09:23 +02008605 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008606 Py_ssize_t start;
8607 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008608 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
Jesus Ceaac451502011-04-20 17:09:23 +02008610 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8611 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613
Thomas Wouters477c8d52006-05-27 19:21:47 +00008614 result = stringlib_rfind_slice(
8615 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8616 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8617 start, end
8618 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619
8620 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008621
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 if (result < 0) {
8623 PyErr_SetString(PyExc_ValueError, "substring not found");
8624 return NULL;
8625 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008626 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627}
8628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008629PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008632Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008633done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634
8635static PyObject *
8636unicode_rjust(PyUnicodeObject *self, PyObject *args)
8637{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008638 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008639 Py_UNICODE fillchar = ' ';
8640
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008641 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 return NULL;
8643
Tim Peters7a29bd52001-09-12 03:03:31 +00008644 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 Py_INCREF(self);
8646 return (PyObject*) self;
8647 }
8648
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008649 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650}
8651
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 PyObject *sep,
8654 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655{
8656 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008657
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 s = PyUnicode_FromObject(s);
8659 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008660 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 if (sep != NULL) {
8662 sep = PyUnicode_FromObject(sep);
8663 if (sep == NULL) {
8664 Py_DECREF(s);
8665 return NULL;
8666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667 }
8668
8669 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8670
8671 Py_DECREF(s);
8672 Py_XDECREF(sep);
8673 return result;
8674}
8675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008676PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678\n\
8679Return a list of the words in S, using sep as the\n\
8680delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008681splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008682whitespace string is a separator and empty strings are\n\
8683removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684
8685static PyObject*
8686unicode_split(PyUnicodeObject *self, PyObject *args)
8687{
8688 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008689 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690
Martin v. Löwis18e16552006-02-15 17:27:45 +00008691 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 return NULL;
8693
8694 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700}
8701
Thomas Wouters477c8d52006-05-27 19:21:47 +00008702PyObject *
8703PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8704{
8705 PyObject* str_obj;
8706 PyObject* sep_obj;
8707 PyObject* out;
8708
8709 str_obj = PyUnicode_FromObject(str_in);
8710 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008712 sep_obj = PyUnicode_FromObject(sep_in);
8713 if (!sep_obj) {
8714 Py_DECREF(str_obj);
8715 return NULL;
8716 }
8717
8718 out = stringlib_partition(
8719 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8720 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8721 );
8722
8723 Py_DECREF(sep_obj);
8724 Py_DECREF(str_obj);
8725
8726 return out;
8727}
8728
8729
8730PyObject *
8731PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8732{
8733 PyObject* str_obj;
8734 PyObject* sep_obj;
8735 PyObject* out;
8736
8737 str_obj = PyUnicode_FromObject(str_in);
8738 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008740 sep_obj = PyUnicode_FromObject(sep_in);
8741 if (!sep_obj) {
8742 Py_DECREF(str_obj);
8743 return NULL;
8744 }
8745
8746 out = stringlib_rpartition(
8747 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8748 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8749 );
8750
8751 Py_DECREF(sep_obj);
8752 Py_DECREF(str_obj);
8753
8754 return out;
8755}
8756
8757PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008759\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008760Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008761the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008762found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008763
8764static PyObject*
8765unicode_partition(PyUnicodeObject *self, PyObject *separator)
8766{
8767 return PyUnicode_Partition((PyObject *)self, separator);
8768}
8769
8770PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008771 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008772\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008773Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008774the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008775separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008776
8777static PyObject*
8778unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8779{
8780 return PyUnicode_RPartition((PyObject *)self, separator);
8781}
8782
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008783PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 PyObject *sep,
8785 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008786{
8787 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008788
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008789 s = PyUnicode_FromObject(s);
8790 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008791 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 if (sep != NULL) {
8793 sep = PyUnicode_FromObject(sep);
8794 if (sep == NULL) {
8795 Py_DECREF(s);
8796 return NULL;
8797 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008798 }
8799
8800 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8801
8802 Py_DECREF(s);
8803 Py_XDECREF(sep);
8804 return result;
8805}
8806
8807PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008809\n\
8810Return a list of the words in S, using sep as the\n\
8811delimiter string, starting at the end of the string and\n\
8812working to the front. If maxsplit is given, at most maxsplit\n\
8813splits are done. If sep is not specified, any whitespace string\n\
8814is a separator.");
8815
8816static PyObject*
8817unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8818{
8819 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008820 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008821
Martin v. Löwis18e16552006-02-15 17:27:45 +00008822 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008823 return NULL;
8824
8825 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008827 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008829 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008831}
8832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008833PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835\n\
8836Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008837Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008838is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839
8840static PyObject*
8841unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8842{
Guido van Rossum86662912000-04-11 15:38:46 +00008843 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844
Guido van Rossum86662912000-04-11 15:38:46 +00008845 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 return NULL;
8847
Guido van Rossum86662912000-04-11 15:38:46 +00008848 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849}
8850
8851static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008852PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853{
Walter Dörwald346737f2007-05-31 10:44:43 +00008854 if (PyUnicode_CheckExact(self)) {
8855 Py_INCREF(self);
8856 return self;
8857 } else
8858 /* Subtype -- return genuine unicode string with the same value. */
8859 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8860 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861}
8862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008863PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865\n\
8866Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008867and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868
8869static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008870unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 return fixup(self, fixswapcase);
8873}
8874
Georg Brandlceee0772007-11-27 23:48:05 +00008875PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008877\n\
8878Return a translation table usable for str.translate().\n\
8879If there is only one argument, it must be a dictionary mapping Unicode\n\
8880ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008881Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008882If there are two arguments, they must be strings of equal length, and\n\
8883in the resulting dictionary, each character in x will be mapped to the\n\
8884character at the same position in y. If there is a third argument, it\n\
8885must be a string, whose characters will be mapped to None in the result.");
8886
8887static PyObject*
8888unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8889{
8890 PyObject *x, *y = NULL, *z = NULL;
8891 PyObject *new = NULL, *key, *value;
8892 Py_ssize_t i = 0;
8893 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008894
Georg Brandlceee0772007-11-27 23:48:05 +00008895 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8896 return NULL;
8897 new = PyDict_New();
8898 if (!new)
8899 return NULL;
8900 if (y != NULL) {
8901 /* x must be a string too, of equal length */
8902 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8903 if (!PyUnicode_Check(x)) {
8904 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8905 "be a string if there is a second argument");
8906 goto err;
8907 }
8908 if (PyUnicode_GET_SIZE(x) != ylen) {
8909 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8910 "arguments must have equal length");
8911 goto err;
8912 }
8913 /* create entries for translating chars in x to those in y */
8914 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008915 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008916 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +00008917 goto err;
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008918 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8919 if (!value) {
8920 Py_DECREF(key);
8921 goto err;
8922 }
Georg Brandlceee0772007-11-27 23:48:05 +00008923 res = PyDict_SetItem(new, key, value);
8924 Py_DECREF(key);
8925 Py_DECREF(value);
8926 if (res < 0)
8927 goto err;
8928 }
8929 /* create entries for deleting chars in z */
8930 if (z != NULL) {
8931 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008932 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008933 if (!key)
8934 goto err;
8935 res = PyDict_SetItem(new, key, Py_None);
8936 Py_DECREF(key);
8937 if (res < 0)
8938 goto err;
8939 }
8940 }
8941 } else {
8942 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008943 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008944 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8945 "to maketrans it must be a dict");
8946 goto err;
8947 }
8948 /* copy entries into the new dict, converting string keys to int keys */
8949 while (PyDict_Next(x, &i, &key, &value)) {
8950 if (PyUnicode_Check(key)) {
8951 /* convert string keys to integer keys */
8952 PyObject *newkey;
8953 if (PyUnicode_GET_SIZE(key) != 1) {
8954 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8955 "table must be of length 1");
8956 goto err;
8957 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008958 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008959 if (!newkey)
8960 goto err;
8961 res = PyDict_SetItem(new, newkey, value);
8962 Py_DECREF(newkey);
8963 if (res < 0)
8964 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008965 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008966 /* just keep integer keys */
8967 if (PyDict_SetItem(new, key, value) < 0)
8968 goto err;
8969 } else {
8970 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8971 "be strings or integers");
8972 goto err;
8973 }
8974 }
8975 }
8976 return new;
8977 err:
8978 Py_DECREF(new);
8979 return NULL;
8980}
8981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008982PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984\n\
8985Return a copy of the string S, where all characters have been mapped\n\
8986through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008987Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008988Unmapped characters are left untouched. Characters mapped to None\n\
8989are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990
8991static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008992unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993{
Georg Brandlceee0772007-11-27 23:48:05 +00008994 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995}
8996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008997PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009000Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001
9002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009003unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 return fixup(self, fixupper);
9006}
9007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009008PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009011Pad a numeric string S with zeros on the left, to fill a field\n\
9012of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013
9014static PyObject *
9015unicode_zfill(PyUnicodeObject *self, PyObject *args)
9016{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009017 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 PyUnicodeObject *u;
9019
Martin v. Löwis18e16552006-02-15 17:27:45 +00009020 Py_ssize_t width;
9021 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022 return NULL;
9023
9024 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009025 if (PyUnicode_CheckExact(self)) {
9026 Py_INCREF(self);
9027 return (PyObject*) self;
9028 }
9029 else
9030 return PyUnicode_FromUnicode(
9031 PyUnicode_AS_UNICODE(self),
9032 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 }
9035
9036 fill = width - self->length;
9037
9038 u = pad(self, fill, 0, '0');
9039
Walter Dörwald068325e2002-04-15 13:36:47 +00009040 if (u == NULL)
9041 return NULL;
9042
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 if (u->str[fill] == '+' || u->str[fill] == '-') {
9044 /* move sign to beginning of string */
9045 u->str[0] = u->str[fill];
9046 u->str[fill] = '0';
9047 }
9048
9049 return (PyObject*) u;
9050}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051
#if 0
/* Compiled out.  Reports the value of `numfree` as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}

/* Compiled out.  Passes self's buffer and length to
   PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(
        PyUnicode_AS_UNICODE(self),
        PyUnicode_GET_SIZE(self));
}
#endif
9066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009067PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009070Return True if S starts with the specified prefix, False otherwise.\n\
9071With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009072With optional end, stop comparing S at that position.\n\
9073prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074
9075static PyObject *
9076unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009079 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009081 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009082 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009083 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084
Jesus Ceaac451502011-04-20 17:09:23 +02009085 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009087 if (PyTuple_Check(subobj)) {
9088 Py_ssize_t i;
9089 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9090 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009092 if (substring == NULL)
9093 return NULL;
9094 result = tailmatch(self, substring, start, end, -1);
9095 Py_DECREF(substring);
9096 if (result) {
9097 Py_RETURN_TRUE;
9098 }
9099 }
9100 /* nothing matched */
9101 Py_RETURN_FALSE;
9102 }
9103 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009104 if (substring == NULL) {
9105 if (PyErr_ExceptionMatches(PyExc_TypeError))
9106 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9107 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009109 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009110 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009112 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113}
9114
9115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009116PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009119Return True if S ends with the specified suffix, False otherwise.\n\
9120With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009121With optional end, stop comparing S at that position.\n\
9122suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123
9124static PyObject *
9125unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009128 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009130 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009131 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009132 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133
Jesus Ceaac451502011-04-20 17:09:23 +02009134 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009136 if (PyTuple_Check(subobj)) {
9137 Py_ssize_t i;
9138 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9139 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009141 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009143 result = tailmatch(self, substring, start, end, +1);
9144 Py_DECREF(substring);
9145 if (result) {
9146 Py_RETURN_TRUE;
9147 }
9148 }
9149 Py_RETURN_FALSE;
9150 }
9151 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009152 if (substring == NULL) {
9153 if (PyErr_ExceptionMatches(PyExc_TypeError))
9154 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9155 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009157 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009158 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009160 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161}
9162
Eric Smith8c663262007-08-25 02:26:07 +00009163#include "stringlib/string_format.h"
9164
9165PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009167\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009168Return a formatted version of S, using substitutions from args and kwargs.\n\
9169The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009170
Eric Smith27bbca62010-11-04 17:06:58 +00009171PyDoc_STRVAR(format_map__doc__,
9172 "S.format_map(mapping) -> str\n\
9173\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009174Return a formatted version of S, using substitutions from mapping.\n\
9175The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009176
Eric Smith4a7d76d2008-05-30 18:10:19 +00009177static PyObject *
9178unicode__format__(PyObject* self, PyObject* args)
9179{
9180 PyObject *format_spec;
9181
9182 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9183 return NULL;
9184
9185 return _PyUnicode_FormatAdvanced(self,
9186 PyUnicode_AS_UNICODE(format_spec),
9187 PyUnicode_GET_SIZE(format_spec));
9188}
9189
Eric Smith8c663262007-08-25 02:26:07 +00009190PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009192\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009193Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009194
9195static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009196unicode__sizeof__(PyUnicodeObject *v)
9197{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009198 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9199 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009200}
9201
9202PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009204
9205static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009206unicode_getnewargs(PyUnicodeObject *v)
9207{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009208 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009209}
9210
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009212 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009213 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9214 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009215 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009216 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9217 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9218 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9219 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9220 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9221 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9222 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009223 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009224 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9225 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9226 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009227 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009228 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9229 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9230 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009231 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009232 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009233 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009234 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009235 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9236 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9237 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9238 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9239 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9240 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9241 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9242 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9243 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9244 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9245 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9246 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9247 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9248 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009249 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009250 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009251 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009252 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009253 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009254 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009255 {"maketrans", (PyCFunction) unicode_maketrans,
9256 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009257 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009258#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009259 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260#endif
9261
9262#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009263 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009264 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009265 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266#endif
9267
Benjamin Peterson14339b62009-01-31 16:36:08 +00009268 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269 {NULL, NULL}
9270};
9271
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009272static PyObject *
9273unicode_mod(PyObject *v, PyObject *w)
9274{
Benjamin Peterson29060642009-01-31 22:14:21 +00009275 if (!PyUnicode_Check(v)) {
9276 Py_INCREF(Py_NotImplemented);
9277 return Py_NotImplemented;
9278 }
9279 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009280}
9281
9282static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009283 0, /*nb_add*/
9284 0, /*nb_subtract*/
9285 0, /*nb_multiply*/
9286 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009287};
9288
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009290 (lenfunc) unicode_length, /* sq_length */
9291 PyUnicode_Concat, /* sq_concat */
9292 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9293 (ssizeargfunc) unicode_getitem, /* sq_item */
9294 0, /* sq_slice */
9295 0, /* sq_ass_item */
9296 0, /* sq_ass_slice */
9297 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298};
9299
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009300static PyObject*
9301unicode_subscript(PyUnicodeObject* self, PyObject* item)
9302{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009303 if (PyIndex_Check(item)) {
9304 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009305 if (i == -1 && PyErr_Occurred())
9306 return NULL;
9307 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009308 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009309 return unicode_getitem(self, i);
9310 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009311 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009312 Py_UNICODE* source_buf;
9313 Py_UNICODE* result_buf;
9314 PyObject* result;
9315
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009316 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009317 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009318 return NULL;
9319 }
9320
9321 if (slicelength <= 0) {
9322 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009323 } else if (start == 0 && step == 1 && slicelength == self->length &&
9324 PyUnicode_CheckExact(self)) {
9325 Py_INCREF(self);
9326 return (PyObject *)self;
9327 } else if (step == 1) {
9328 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009329 } else {
9330 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009331 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9332 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009333
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 if (result_buf == NULL)
9335 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009336
9337 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9338 result_buf[i] = source_buf[cur];
9339 }
Tim Petersced69f82003-09-16 20:30:58 +00009340
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009341 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009342 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009343 return result;
9344 }
9345 } else {
9346 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9347 return NULL;
9348 }
9349}
9350
9351static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009352 (lenfunc)unicode_length, /* mp_length */
9353 (binaryfunc)unicode_subscript, /* mp_subscript */
9354 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009355};
9356
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358/* Helpers for PyUnicode_Format() */
9359
9360static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009361getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009363 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009365 (*p_argidx)++;
9366 if (arglen < 0)
9367 return args;
9368 else
9369 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 }
9371 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 return NULL;
9374}
9375
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009376/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009378static PyObject *
9379formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009381 char *p;
9382 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009384
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385 x = PyFloat_AsDouble(v);
9386 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009387 return NULL;
9388
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009391
Eric Smith0923d1d2009-04-16 20:16:10 +00009392 p = PyOS_double_to_string(x, type, prec,
9393 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009394 if (p == NULL)
9395 return NULL;
9396 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009397 PyMem_Free(p);
9398 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399}
9400
Tim Peters38fd5b62000-09-21 05:43:11 +00009401static PyObject*
9402formatlong(PyObject *val, int flags, int prec, int type)
9403{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009404 char *buf;
9405 int len;
9406 PyObject *str; /* temporary string object. */
9407 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009408
Benjamin Peterson14339b62009-01-31 16:36:08 +00009409 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9410 if (!str)
9411 return NULL;
9412 result = PyUnicode_FromStringAndSize(buf, len);
9413 Py_DECREF(str);
9414 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009415}
9416
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417static int
9418formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009419 size_t buflen,
9420 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009422 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009423 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009424 if (PyUnicode_GET_SIZE(v) == 1) {
9425 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9426 buf[1] = '\0';
9427 return 1;
9428 }
9429#ifndef Py_UNICODE_WIDE
9430 if (PyUnicode_GET_SIZE(v) == 2) {
9431 /* Decode a valid surrogate pair */
9432 int c0 = PyUnicode_AS_UNICODE(v)[0];
9433 int c1 = PyUnicode_AS_UNICODE(v)[1];
9434 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9435 0xDC00 <= c1 && c1 <= 0xDFFF) {
9436 buf[0] = c0;
9437 buf[1] = c1;
9438 buf[2] = '\0';
9439 return 2;
9440 }
9441 }
9442#endif
9443 goto onError;
9444 }
9445 else {
9446 /* Integer input truncated to a character */
9447 long x;
9448 x = PyLong_AsLong(v);
9449 if (x == -1 && PyErr_Occurred())
9450 goto onError;
9451
9452 if (x < 0 || x > 0x10ffff) {
9453 PyErr_SetString(PyExc_OverflowError,
9454 "%c arg not in range(0x110000)");
9455 return -1;
9456 }
9457
9458#ifndef Py_UNICODE_WIDE
9459 if (x > 0xffff) {
9460 x -= 0x10000;
9461 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9462 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9463 return 2;
9464 }
9465#endif
9466 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009467 buf[1] = '\0';
9468 return 1;
9469 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009470
Benjamin Peterson29060642009-01-31 22:14:21 +00009471 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009472 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009474 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009477/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009478 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009479*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009480#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009481
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484{
9485 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009486 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487 int args_owned = 0;
9488 PyUnicodeObject *result = NULL;
9489 PyObject *dict = NULL;
9490 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009491
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 PyErr_BadInternalCall();
9494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 }
9496 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009497 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499 fmt = PyUnicode_AS_UNICODE(uformat);
9500 fmtcnt = PyUnicode_GET_SIZE(uformat);
9501
9502 reslen = rescnt = fmtcnt + 100;
9503 result = _PyUnicode_New(reslen);
9504 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 res = PyUnicode_AS_UNICODE(result);
9507
9508 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 arglen = PyTuple_Size(args);
9510 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 }
9512 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 arglen = -1;
9514 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -04009516 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518
9519 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009520 if (*fmt != '%') {
9521 if (--rescnt < 0) {
9522 rescnt = fmtcnt + 100;
9523 reslen += rescnt;
9524 if (_PyUnicode_Resize(&result, reslen) < 0)
9525 goto onError;
9526 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9527 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009530 }
9531 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 /* Got a format specifier */
9533 int flags = 0;
9534 Py_ssize_t width = -1;
9535 int prec = -1;
9536 Py_UNICODE c = '\0';
9537 Py_UNICODE fill;
9538 int isnumok;
9539 PyObject *v = NULL;
9540 PyObject *temp = NULL;
9541 Py_UNICODE *pbuf;
9542 Py_UNICODE sign;
9543 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009544 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 fmt++;
9547 if (*fmt == '(') {
9548 Py_UNICODE *keystart;
9549 Py_ssize_t keylen;
9550 PyObject *key;
9551 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009552
Benjamin Peterson29060642009-01-31 22:14:21 +00009553 if (dict == NULL) {
9554 PyErr_SetString(PyExc_TypeError,
9555 "format requires a mapping");
9556 goto onError;
9557 }
9558 ++fmt;
9559 --fmtcnt;
9560 keystart = fmt;
9561 /* Skip over balanced parentheses */
9562 while (pcount > 0 && --fmtcnt >= 0) {
9563 if (*fmt == ')')
9564 --pcount;
9565 else if (*fmt == '(')
9566 ++pcount;
9567 fmt++;
9568 }
9569 keylen = fmt - keystart - 1;
9570 if (fmtcnt < 0 || pcount > 0) {
9571 PyErr_SetString(PyExc_ValueError,
9572 "incomplete format key");
9573 goto onError;
9574 }
9575#if 0
9576 /* keys are converted to strings using UTF-8 and
9577 then looked up since Python uses strings to hold
9578 variables names etc. in its namespaces and we
9579 wouldn't want to break common idioms. */
9580 key = PyUnicode_EncodeUTF8(keystart,
9581 keylen,
9582 NULL);
9583#else
9584 key = PyUnicode_FromUnicode(keystart, keylen);
9585#endif
9586 if (key == NULL)
9587 goto onError;
9588 if (args_owned) {
9589 Py_DECREF(args);
9590 args_owned = 0;
9591 }
9592 args = PyObject_GetItem(dict, key);
9593 Py_DECREF(key);
9594 if (args == NULL) {
9595 goto onError;
9596 }
9597 args_owned = 1;
9598 arglen = -1;
9599 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009601 while (--fmtcnt >= 0) {
9602 switch (c = *fmt++) {
9603 case '-': flags |= F_LJUST; continue;
9604 case '+': flags |= F_SIGN; continue;
9605 case ' ': flags |= F_BLANK; continue;
9606 case '#': flags |= F_ALT; continue;
9607 case '0': flags |= F_ZERO; continue;
9608 }
9609 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009610 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 if (c == '*') {
9612 v = getnextarg(args, arglen, &argidx);
9613 if (v == NULL)
9614 goto onError;
9615 if (!PyLong_Check(v)) {
9616 PyErr_SetString(PyExc_TypeError,
9617 "* wants int");
9618 goto onError;
9619 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +02009620 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 if (width == -1 && PyErr_Occurred())
9622 goto onError;
9623 if (width < 0) {
9624 flags |= F_LJUST;
9625 width = -width;
9626 }
9627 if (--fmtcnt >= 0)
9628 c = *fmt++;
9629 }
9630 else if (c >= '0' && c <= '9') {
9631 width = c - '0';
9632 while (--fmtcnt >= 0) {
9633 c = *fmt++;
9634 if (c < '0' || c > '9')
9635 break;
Mark Dickinsonfb90c092012-10-28 10:18:03 +00009636 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009637 PyErr_SetString(PyExc_ValueError,
9638 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009639 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009640 }
9641 width = width*10 + (c - '0');
9642 }
9643 }
9644 if (c == '.') {
9645 prec = 0;
9646 if (--fmtcnt >= 0)
9647 c = *fmt++;
9648 if (c == '*') {
9649 v = getnextarg(args, arglen, &argidx);
9650 if (v == NULL)
9651 goto onError;
9652 if (!PyLong_Check(v)) {
9653 PyErr_SetString(PyExc_TypeError,
9654 "* wants int");
9655 goto onError;
9656 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +02009657 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00009658 if (prec == -1 && PyErr_Occurred())
9659 goto onError;
9660 if (prec < 0)
9661 prec = 0;
9662 if (--fmtcnt >= 0)
9663 c = *fmt++;
9664 }
9665 else if (c >= '0' && c <= '9') {
9666 prec = c - '0';
9667 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009668 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009669 if (c < '0' || c > '9')
9670 break;
Mark Dickinsonfb90c092012-10-28 10:18:03 +00009671 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009672 PyErr_SetString(PyExc_ValueError,
9673 "prec too big");
9674 goto onError;
9675 }
9676 prec = prec*10 + (c - '0');
9677 }
9678 }
9679 } /* prec */
9680 if (fmtcnt >= 0) {
9681 if (c == 'h' || c == 'l' || c == 'L') {
9682 if (--fmtcnt >= 0)
9683 c = *fmt++;
9684 }
9685 }
9686 if (fmtcnt < 0) {
9687 PyErr_SetString(PyExc_ValueError,
9688 "incomplete format");
9689 goto onError;
9690 }
9691 if (c != '%') {
9692 v = getnextarg(args, arglen, &argidx);
9693 if (v == NULL)
9694 goto onError;
9695 }
9696 sign = 0;
9697 fill = ' ';
9698 switch (c) {
9699
9700 case '%':
9701 pbuf = formatbuf;
9702 /* presume that buffer length is at least 1 */
9703 pbuf[0] = '%';
9704 len = 1;
9705 break;
9706
9707 case 's':
9708 case 'r':
9709 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009710 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 temp = v;
9712 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009713 }
9714 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 if (c == 's')
9716 temp = PyObject_Str(v);
9717 else if (c == 'r')
9718 temp = PyObject_Repr(v);
9719 else
9720 temp = PyObject_ASCII(v);
9721 if (temp == NULL)
9722 goto onError;
9723 if (PyUnicode_Check(temp))
9724 /* nothing to do */;
9725 else {
9726 Py_DECREF(temp);
9727 PyErr_SetString(PyExc_TypeError,
9728 "%s argument has non-string str()");
9729 goto onError;
9730 }
9731 }
9732 pbuf = PyUnicode_AS_UNICODE(temp);
9733 len = PyUnicode_GET_SIZE(temp);
9734 if (prec >= 0 && len > prec)
9735 len = prec;
9736 break;
9737
9738 case 'i':
9739 case 'd':
9740 case 'u':
9741 case 'o':
9742 case 'x':
9743 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +00009744 isnumok = 0;
9745 if (PyNumber_Check(v)) {
9746 PyObject *iobj=NULL;
9747
9748 if (PyLong_Check(v)) {
9749 iobj = v;
9750 Py_INCREF(iobj);
9751 }
9752 else {
9753 iobj = PyNumber_Long(v);
9754 }
9755 if (iobj!=NULL) {
9756 if (PyLong_Check(iobj)) {
9757 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -07009758 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +00009759 Py_DECREF(iobj);
9760 if (!temp)
9761 goto onError;
9762 pbuf = PyUnicode_AS_UNICODE(temp);
9763 len = PyUnicode_GET_SIZE(temp);
9764 sign = 1;
9765 }
9766 else {
9767 Py_DECREF(iobj);
9768 }
9769 }
9770 }
9771 if (!isnumok) {
9772 PyErr_Format(PyExc_TypeError,
9773 "%%%c format: a number is required, "
9774 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9775 goto onError;
9776 }
9777 if (flags & F_ZERO)
9778 fill = '0';
9779 break;
9780
9781 case 'e':
9782 case 'E':
9783 case 'f':
9784 case 'F':
9785 case 'g':
9786 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009787 temp = formatfloat(v, flags, prec, c);
9788 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009789 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009790 pbuf = PyUnicode_AS_UNICODE(temp);
9791 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009792 sign = 1;
9793 if (flags & F_ZERO)
9794 fill = '0';
9795 break;
9796
9797 case 'c':
9798 pbuf = formatbuf;
9799 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9800 if (len < 0)
9801 goto onError;
9802 break;
9803
9804 default:
9805 PyErr_Format(PyExc_ValueError,
9806 "unsupported format character '%c' (0x%x) "
9807 "at index %zd",
9808 (31<=c && c<=126) ? (char)c : '?',
9809 (int)c,
9810 (Py_ssize_t)(fmt - 1 -
9811 PyUnicode_AS_UNICODE(uformat)));
9812 goto onError;
9813 }
9814 if (sign) {
9815 if (*pbuf == '-' || *pbuf == '+') {
9816 sign = *pbuf++;
9817 len--;
9818 }
9819 else if (flags & F_SIGN)
9820 sign = '+';
9821 else if (flags & F_BLANK)
9822 sign = ' ';
9823 else
9824 sign = 0;
9825 }
9826 if (width < len)
9827 width = len;
9828 if (rescnt - (sign != 0) < width) {
9829 reslen -= rescnt;
9830 rescnt = width + fmtcnt + 100;
9831 reslen += rescnt;
9832 if (reslen < 0) {
9833 Py_XDECREF(temp);
9834 PyErr_NoMemory();
9835 goto onError;
9836 }
9837 if (_PyUnicode_Resize(&result, reslen) < 0) {
9838 Py_XDECREF(temp);
9839 goto onError;
9840 }
9841 res = PyUnicode_AS_UNICODE(result)
9842 + reslen - rescnt;
9843 }
9844 if (sign) {
9845 if (fill != ' ')
9846 *res++ = sign;
9847 rescnt--;
9848 if (width > len)
9849 width--;
9850 }
9851 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9852 assert(pbuf[0] == '0');
9853 assert(pbuf[1] == c);
9854 if (fill != ' ') {
9855 *res++ = *pbuf++;
9856 *res++ = *pbuf++;
9857 }
9858 rescnt -= 2;
9859 width -= 2;
9860 if (width < 0)
9861 width = 0;
9862 len -= 2;
9863 }
9864 if (width > len && !(flags & F_LJUST)) {
9865 do {
9866 --rescnt;
9867 *res++ = fill;
9868 } while (--width > len);
9869 }
9870 if (fill == ' ') {
9871 if (sign)
9872 *res++ = sign;
9873 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9874 assert(pbuf[0] == '0');
9875 assert(pbuf[1] == c);
9876 *res++ = *pbuf++;
9877 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009878 }
9879 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009880 Py_UNICODE_COPY(res, pbuf, len);
9881 res += len;
9882 rescnt -= len;
9883 while (--width >= len) {
9884 --rescnt;
9885 *res++ = ' ';
9886 }
9887 if (dict && (argidx < arglen) && c != '%') {
9888 PyErr_SetString(PyExc_TypeError,
9889 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009890 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009891 goto onError;
9892 }
9893 Py_XDECREF(temp);
9894 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895 } /* until end */
9896 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009897 PyErr_SetString(PyExc_TypeError,
9898 "not all arguments converted during string formatting");
9899 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900 }
9901
Thomas Woutersa96affe2006-03-12 00:29:36 +00009902 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009903 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906 }
9907 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908 return (PyObject *)result;
9909
Benjamin Peterson29060642009-01-31 22:14:21 +00009910 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911 Py_XDECREF(result);
9912 Py_DECREF(uformat);
9913 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009914 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915 }
9916 return NULL;
9917}
9918
Jeremy Hylton938ace62002-07-17 16:30:39 +00009919static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009920unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9921
Tim Peters6d6c1a32001-08-02 04:15:00 +00009922static PyObject *
9923unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9924{
Benjamin Peterson29060642009-01-31 22:14:21 +00009925 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009926 static char *kwlist[] = {"object", "encoding", "errors", 0};
9927 char *encoding = NULL;
9928 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009929
Benjamin Peterson14339b62009-01-31 16:36:08 +00009930 if (type != &PyUnicode_Type)
9931 return unicode_subtype_new(type, args, kwds);
9932 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009933 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009934 return NULL;
9935 if (x == NULL)
9936 return (PyObject *)_PyUnicode_New(0);
9937 if (encoding == NULL && errors == NULL)
9938 return PyObject_Str(x);
9939 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009940 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009941}
9942
Guido van Rossume023fe02001-08-30 03:12:59 +00009943static PyObject *
9944unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9945{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009946 PyUnicodeObject *tmp, *pnew;
9947 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009948
Benjamin Peterson14339b62009-01-31 16:36:08 +00009949 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9950 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9951 if (tmp == NULL)
9952 return NULL;
9953 assert(PyUnicode_Check(tmp));
9954 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9955 if (pnew == NULL) {
9956 Py_DECREF(tmp);
9957 return NULL;
9958 }
9959 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9960 if (pnew->str == NULL) {
9961 _Py_ForgetReference((PyObject *)pnew);
9962 PyObject_Del(pnew);
9963 Py_DECREF(tmp);
9964 return PyErr_NoMemory();
9965 }
9966 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9967 pnew->length = n;
9968 pnew->hash = tmp->hash;
9969 Py_DECREF(tmp);
9970 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009971}
9972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009973PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -07009974"str(object='') -> str\n\
9975str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009976\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +10009977Create a new string object from the given object. If encoding or\n\
9978errors is specified, then the object must expose a data buffer\n\
9979that will be decoded using the given encoding and error handler.\n\
9980Otherwise, returns the result of object.__str__() (if defined)\n\
9981or repr(object).\n\
9982encoding defaults to sys.getdefaultencoding().\n\
9983errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009984
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009985static PyObject *unicode_iter(PyObject *seq);
9986
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009988 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009989 "str", /* tp_name */
9990 sizeof(PyUnicodeObject), /* tp_size */
9991 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009993 (destructor)unicode_dealloc, /* tp_dealloc */
9994 0, /* tp_print */
9995 0, /* tp_getattr */
9996 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009997 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009998 unicode_repr, /* tp_repr */
9999 &unicode_as_number, /* tp_as_number */
10000 &unicode_as_sequence, /* tp_as_sequence */
10001 &unicode_as_mapping, /* tp_as_mapping */
10002 (hashfunc) unicode_hash, /* tp_hash*/
10003 0, /* tp_call*/
10004 (reprfunc) unicode_str, /* tp_str */
10005 PyObject_GenericGetAttr, /* tp_getattro */
10006 0, /* tp_setattro */
10007 0, /* tp_as_buffer */
10008 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010009 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010010 unicode_doc, /* tp_doc */
10011 0, /* tp_traverse */
10012 0, /* tp_clear */
10013 PyUnicode_RichCompare, /* tp_richcompare */
10014 0, /* tp_weaklistoffset */
10015 unicode_iter, /* tp_iter */
10016 0, /* tp_iternext */
10017 unicode_methods, /* tp_methods */
10018 0, /* tp_members */
10019 0, /* tp_getset */
10020 &PyBaseObject_Type, /* tp_base */
10021 0, /* tp_dict */
10022 0, /* tp_descr_get */
10023 0, /* tp_descr_set */
10024 0, /* tp_dictoffset */
10025 0, /* tp_init */
10026 0, /* tp_alloc */
10027 unicode_new, /* tp_new */
10028 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029};
10030
10031/* Initialize the Unicode implementation */
10032
Thomas Wouters78890102000-07-22 19:25:51 +000010033void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010035 /* XXX - move this array to unicodectype.c ? */
10036 Py_UNICODE linebreak[] = {
10037 0x000A, /* LINE FEED */
10038 0x000D, /* CARRIAGE RETURN */
10039 0x001C, /* FILE SEPARATOR */
10040 0x001D, /* GROUP SEPARATOR */
10041 0x001E, /* RECORD SEPARATOR */
10042 0x0085, /* NEXT LINE */
10043 0x2028, /* LINE SEPARATOR */
10044 0x2029, /* PARAGRAPH SEPARATOR */
10045 };
10046
Fred Drakee4315f52000-05-09 19:53:39 +000010047 /* Init the implementation */
Serhiy Storchaka05997252013-01-26 12:14:02 +020010048 if (!unicode_empty) {
10049 unicode_empty = _PyUnicode_New(0);
10050 if (!unicode_empty)
10051 return;
10052 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010053
Guido van Rossumcacfc072002-05-24 19:01:59 +000010054 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010055 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010056
10057 /* initialize the linebreak bloom filter */
10058 bloom_linebreak = make_bloom_mask(
10059 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10060 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010061
10062 PyType_Ready(&EncodingMapType);
Benjamin Petersonc4311282012-10-30 23:21:10 -040010063
10064 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
10065 Py_FatalError("Can't initialize field name iterator type");
10066
10067 if (PyType_Ready(&PyFormatterIter_Type) < 0)
10068 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069}
10070
10071/* Finalize the Unicode implementation */
10072
Christian Heimesa156e092008-02-16 07:38:31 +000010073int
10074PyUnicode_ClearFreeList(void)
10075{
10076 int freelist_size = numfree;
10077 PyUnicodeObject *u;
10078
10079 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010080 PyUnicodeObject *v = u;
10081 u = *(PyUnicodeObject **)u;
10082 if (v->str)
10083 PyObject_DEL(v->str);
10084 Py_XDECREF(v->defenc);
10085 PyObject_Del(v);
10086 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010087 }
10088 free_list = NULL;
10089 assert(numfree == 0);
10090 return freelist_size;
10091}
10092
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093void
Thomas Wouters78890102000-07-22 19:25:51 +000010094_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010096 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097
Serhiy Storchaka05997252013-01-26 12:14:02 +020010098 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010099
Serhiy Storchaka05997252013-01-26 12:14:02 +020010100 for (i = 0; i < 256; i++)
10101 Py_CLEAR(unicode_latin1[i]);
10102
Christian Heimesa156e092008-02-16 07:38:31 +000010103 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010105
Walter Dörwald16807132007-05-25 13:52:07 +000010106void
10107PyUnicode_InternInPlace(PyObject **p)
10108{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010109 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10110 PyObject *t;
10111 if (s == NULL || !PyUnicode_Check(s))
10112 Py_FatalError(
10113 "PyUnicode_InternInPlace: unicode strings only please!");
10114 /* If it's a subclass, we don't really know what putting
10115 it in the interned dict might do. */
10116 if (!PyUnicode_CheckExact(s))
10117 return;
10118 if (PyUnicode_CHECK_INTERNED(s))
10119 return;
10120 if (interned == NULL) {
10121 interned = PyDict_New();
10122 if (interned == NULL) {
10123 PyErr_Clear(); /* Don't leave an exception */
10124 return;
10125 }
10126 }
10127 /* It might be that the GetItem call fails even
10128 though the key is present in the dictionary,
10129 namely when this happens during a stack overflow. */
10130 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010131 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010132 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010133
Benjamin Peterson29060642009-01-31 22:14:21 +000010134 if (t) {
10135 Py_INCREF(t);
10136 Py_DECREF(*p);
10137 *p = t;
10138 return;
10139 }
Walter Dörwald16807132007-05-25 13:52:07 +000010140
Benjamin Peterson14339b62009-01-31 16:36:08 +000010141 PyThreadState_GET()->recursion_critical = 1;
10142 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10143 PyErr_Clear();
10144 PyThreadState_GET()->recursion_critical = 0;
10145 return;
10146 }
10147 PyThreadState_GET()->recursion_critical = 0;
10148 /* The two references in interned are not counted by refcnt.
10149 The deallocator will take care of this */
10150 Py_REFCNT(s) -= 2;
10151 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010152}
10153
10154void
10155PyUnicode_InternImmortal(PyObject **p)
10156{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010157 PyUnicode_InternInPlace(p);
10158 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10159 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10160 Py_INCREF(*p);
10161 }
Walter Dörwald16807132007-05-25 13:52:07 +000010162}
10163
10164PyObject *
10165PyUnicode_InternFromString(const char *cp)
10166{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010167 PyObject *s = PyUnicode_FromString(cp);
10168 if (s == NULL)
10169 return NULL;
10170 PyUnicode_InternInPlace(&s);
10171 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010172}
10173
10174void _Py_ReleaseInternedUnicodeStrings(void)
10175{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010176 PyObject *keys;
10177 PyUnicodeObject *s;
10178 Py_ssize_t i, n;
10179 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010180
Benjamin Peterson14339b62009-01-31 16:36:08 +000010181 if (interned == NULL || !PyDict_Check(interned))
10182 return;
10183 keys = PyDict_Keys(interned);
10184 if (keys == NULL || !PyList_Check(keys)) {
10185 PyErr_Clear();
10186 return;
10187 }
Walter Dörwald16807132007-05-25 13:52:07 +000010188
Benjamin Peterson14339b62009-01-31 16:36:08 +000010189 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10190 detector, interned unicode strings are not forcibly deallocated;
10191 rather, we give them their stolen references back, and then clear
10192 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010193
Benjamin Peterson14339b62009-01-31 16:36:08 +000010194 n = PyList_GET_SIZE(keys);
10195 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010196 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010197 for (i = 0; i < n; i++) {
10198 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10199 switch (s->state) {
10200 case SSTATE_NOT_INTERNED:
10201 /* XXX Shouldn't happen */
10202 break;
10203 case SSTATE_INTERNED_IMMORTAL:
10204 Py_REFCNT(s) += 1;
10205 immortal_size += s->length;
10206 break;
10207 case SSTATE_INTERNED_MORTAL:
10208 Py_REFCNT(s) += 2;
10209 mortal_size += s->length;
10210 break;
10211 default:
10212 Py_FatalError("Inconsistent interned string state.");
10213 }
10214 s->state = SSTATE_NOT_INTERNED;
10215 }
10216 fprintf(stderr, "total size of all interned strings: "
10217 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10218 "mortal/immortal\n", mortal_size, immortal_size);
10219 Py_DECREF(keys);
10220 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020010221 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000010222}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010223
10224
10225/********************* Unicode Iterator **************************/
10226
10227typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010228 PyObject_HEAD
10229 Py_ssize_t it_index;
10230 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010231} unicodeiterobject;
10232
10233static void
10234unicodeiter_dealloc(unicodeiterobject *it)
10235{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010236 _PyObject_GC_UNTRACK(it);
10237 Py_XDECREF(it->it_seq);
10238 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010239}
10240
10241static int
10242unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10243{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010244 Py_VISIT(it->it_seq);
10245 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010246}
10247
10248static PyObject *
10249unicodeiter_next(unicodeiterobject *it)
10250{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010251 PyUnicodeObject *seq;
10252 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010253
Benjamin Peterson14339b62009-01-31 16:36:08 +000010254 assert(it != NULL);
10255 seq = it->it_seq;
10256 if (seq == NULL)
10257 return NULL;
10258 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010259
Benjamin Peterson14339b62009-01-31 16:36:08 +000010260 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10261 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010263 if (item != NULL)
10264 ++it->it_index;
10265 return item;
10266 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010267
Benjamin Peterson14339b62009-01-31 16:36:08 +000010268 Py_DECREF(seq);
10269 it->it_seq = NULL;
10270 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010271}
10272
10273static PyObject *
10274unicodeiter_len(unicodeiterobject *it)
10275{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010276 Py_ssize_t len = 0;
10277 if (it->it_seq)
10278 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10279 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010280}
10281
10282PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10283
10284static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010285 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010286 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010287 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010288};
10289
10290PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010291 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10292 "str_iterator", /* tp_name */
10293 sizeof(unicodeiterobject), /* tp_basicsize */
10294 0, /* tp_itemsize */
10295 /* methods */
10296 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10297 0, /* tp_print */
10298 0, /* tp_getattr */
10299 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010300 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010301 0, /* tp_repr */
10302 0, /* tp_as_number */
10303 0, /* tp_as_sequence */
10304 0, /* tp_as_mapping */
10305 0, /* tp_hash */
10306 0, /* tp_call */
10307 0, /* tp_str */
10308 PyObject_GenericGetAttr, /* tp_getattro */
10309 0, /* tp_setattro */
10310 0, /* tp_as_buffer */
10311 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10312 0, /* tp_doc */
10313 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10314 0, /* tp_clear */
10315 0, /* tp_richcompare */
10316 0, /* tp_weaklistoffset */
10317 PyObject_SelfIter, /* tp_iter */
10318 (iternextfunc)unicodeiter_next, /* tp_iternext */
10319 unicodeiter_methods, /* tp_methods */
10320 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010321};
10322
10323static PyObject *
10324unicode_iter(PyObject *seq)
10325{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010326 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010327
Benjamin Peterson14339b62009-01-31 16:36:08 +000010328 if (!PyUnicode_Check(seq)) {
10329 PyErr_BadInternalCall();
10330 return NULL;
10331 }
10332 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10333 if (it == NULL)
10334 return NULL;
10335 it->it_index = 0;
10336 Py_INCREF(seq);
10337 it->it_seq = (PyUnicodeObject *)seq;
10338 _PyObject_GC_TRACK(it);
10339 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010340}
10341
Martin v. Löwis5b222132007-06-10 09:51:05 +000010342size_t
10343Py_UNICODE_strlen(const Py_UNICODE *u)
10344{
10345 int res = 0;
10346 while(*u++)
10347 res++;
10348 return res;
10349}
10350
10351Py_UNICODE*
10352Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10353{
10354 Py_UNICODE *u = s1;
10355 while ((*u++ = *s2++));
10356 return s1;
10357}
10358
10359Py_UNICODE*
10360Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10361{
10362 Py_UNICODE *u = s1;
10363 while ((*u++ = *s2++))
10364 if (n-- == 0)
10365 break;
10366 return s1;
10367}
10368
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010369Py_UNICODE*
10370Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10371{
10372 Py_UNICODE *u1 = s1;
10373 u1 += Py_UNICODE_strlen(u1);
10374 Py_UNICODE_strcpy(u1, s2);
10375 return s1;
10376}
10377
Martin v. Löwis5b222132007-06-10 09:51:05 +000010378int
10379Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10380{
10381 while (*s1 && *s2 && *s1 == *s2)
10382 s1++, s2++;
10383 if (*s1 && *s2)
10384 return (*s1 < *s2) ? -1 : +1;
10385 if (*s1)
10386 return 1;
10387 if (*s2)
10388 return -1;
10389 return 0;
10390}
10391
Victor Stinneref8d95c2010-08-16 22:03:11 +000010392int
10393Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10394{
10395 register Py_UNICODE u1, u2;
10396 for (; n != 0; n--) {
10397 u1 = *s1;
10398 u2 = *s2;
10399 if (u1 != u2)
10400 return (u1 < u2) ? -1 : +1;
10401 if (u1 == '\0')
10402 return 0;
10403 s1++;
10404 s2++;
10405 }
10406 return 0;
10407}
10408
Martin v. Löwis5b222132007-06-10 09:51:05 +000010409Py_UNICODE*
10410Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10411{
10412 const Py_UNICODE *p;
10413 for (p = s; *p; p++)
10414 if (*p == c)
10415 return (Py_UNICODE*)p;
10416 return NULL;
10417}
10418
Victor Stinner331ea922010-08-10 16:37:20 +000010419Py_UNICODE*
10420Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10421{
10422 const Py_UNICODE *p;
10423 p = s + Py_UNICODE_strlen(s);
10424 while (p != s) {
10425 p--;
10426 if (*p == c)
10427 return (Py_UNICODE*)p;
10428 }
10429 return NULL;
10430}
10431
Victor Stinner71133ff2010-09-01 23:43:53 +000010432Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010433PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010434{
10435 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10436 Py_UNICODE *copy;
10437 Py_ssize_t size;
10438
10439 /* Ensure we won't overflow the size. */
10440 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10441 PyErr_NoMemory();
10442 return NULL;
10443 }
10444 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10445 size *= sizeof(Py_UNICODE);
10446 copy = PyMem_Malloc(size);
10447 if (copy == NULL) {
10448 PyErr_NoMemory();
10449 return NULL;
10450 }
10451 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10452 return copy;
10453}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010454
Georg Brandl66c221e2010-10-14 07:04:07 +000010455/* A _string module, to export formatter_parser and formatter_field_name_split
10456 to the string.Formatter class implemented in Python. */
10457
10458static PyMethodDef _string_methods[] = {
10459 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10460 METH_O, PyDoc_STR("split the argument as a field name")},
10461 {"formatter_parser", (PyCFunction) formatter_parser,
10462 METH_O, PyDoc_STR("parse the argument as a format string")},
10463 {NULL, NULL}
10464};
10465
10466static struct PyModuleDef _string_module = {
10467 PyModuleDef_HEAD_INIT,
10468 "_string",
10469 PyDoc_STR("string helper module"),
10470 0,
10471 _string_methods,
10472 NULL,
10473 NULL,
10474 NULL,
10475 NULL
10476};
10477
10478PyMODINIT_FUNC
10479PyInit__string(void)
10480{
10481 return PyModule_Create(&_string_module);
10482}
10483
10484
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010485#ifdef __cplusplus
10486}
10487#endif