blob: 3a288d845b8eaa385580ee6f53b84350caca1afa [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
Serhiy Storchaka05997252013-01-26 12:14:02 +020083NOTE: In the interpreter's initialization phase, some globals are currently
84 initialized dynamically as needed. In the process Unicode objects may
85 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086
87*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
90#ifdef __cplusplus
91extern "C" {
92#endif
93
Walter Dörwald16807132007-05-25 13:52:07 +000094/* This dictionary holds all interned unicode strings. Note that references
95 to strings in this dictionary are *not* counted in the string's ob_refcnt.
96 When the interned string reaches a refcnt of 0 the string deallocation
97 function will delete the reference from this dictionary.
98
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000101*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200102static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000103
Guido van Rossumd57fd912000-03-10 22:53:23 +0000104/* Free list for Unicode objects */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200105static PyUnicodeObject *free_list = NULL;
106static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000108/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200109static PyUnicodeObject *unicode_empty = NULL;
110
/* Return a new reference to the shared empty string from the enclosing
   function.  The singleton is created on first use; if that creation
   fails, unicode_empty stays NULL and the enclosing function returns
   NULL. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL)                      \
            unicode_empty = _PyUnicode_New(0);          \
        Py_XINCREF(unicode_empty);                      \
        return (PyObject *)unicode_empty;               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000122
123/* Single character Unicode strings in the Latin-1 range are being
124 shared as well. */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200125static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126
Christian Heimes190d79e2008-01-30 11:58:22 +0000127/* Fast detection of the most frequent whitespace characters */
/* Lookup table for the 128 ASCII code points: entry c is 1 when code
   point c is one of the whitespace characters listed below, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
157
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000158static PyObject *unicode_encode_call_errorhandler(const char *errors,
159 PyObject **errorHandler,const char *encoding, const char *reason,
160 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
161 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
162
Victor Stinner31be90b2010-04-22 19:38:16 +0000163static void raise_encode_exception(PyObject **exceptionObject,
164 const char *encoding,
165 const Py_UNICODE *unicode, Py_ssize_t size,
166 Py_ssize_t startpos, Py_ssize_t endpos,
167 const char *reason);
168
/* Fast detection of line break characters in the ASCII range: entry c is
   1 when code point c (0 <= c < 128) is a line break, 0 otherwise.
   The table is only ever read (see BLOOM_LINEBREAK), so it is const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
196
197
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000199PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000200{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000201#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000202 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000203#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000204 /* This is actually an illegal character, so it should
205 not be passed to unichr. */
206 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000207#endif
208}
209
Thomas Wouters477c8d52006-05-27 19:21:47 +0000210/* --- Bloom Filters ----------------------------------------------------- */
211
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
215
216/* the linebreak mask is set up by Unicode_Init below */
217
Antoine Pitrouf068f942010-01-13 14:19:12 +0000218#if LONG_BIT >= 128
219#define BLOOM_WIDTH 128
220#elif LONG_BIT >= 64
221#define BLOOM_WIDTH 64
222#elif LONG_BIT >= 32
223#define BLOOM_WIDTH 32
224#else
225#error "LONG_BIT is smaller than 32"
226#endif
227
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228#define BLOOM_MASK unsigned long
229
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231
Antoine Pitrouf068f942010-01-13 14:19:12 +0000232#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
233#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234
Benjamin Peterson29060642009-01-31 22:14:21 +0000235#define BLOOM_LINEBREAK(ch) \
236 ((ch) < 128U ? ascii_linebreak[(ch)] : \
237 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000238
239Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
240{
241 /* calculate simple bloom-style bitmask for a given unicode string */
242
Antoine Pitrouf068f942010-01-13 14:19:12 +0000243 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000244 Py_ssize_t i;
245
246 mask = 0;
247 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000248 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000249
250 return mask;
251}
252
253Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
254{
255 Py_ssize_t i;
256
257 for (i = 0; i < setlen; i++)
258 if (set[i] == chr)
259 return 1;
260
261 return 0;
262}
263
/* True if chr is a member of set[0..setlen-1].  The bloom mask is
   consulted first so that unicode_member() only runs when the mask bit
   for chr is set.  The whole expansion is parenthesized so the macro
   can be combined safely with !, || and other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
266
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267/* --- Unicode Object ----------------------------------------------------- */
268
269static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272{
273 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000274
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000275 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000277 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279 /* Resizing shared object (unicode_empty or single character
280 objects) in-place is not allowed. Use PyUnicode_Resize()
281 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000284 (unicode->length == 1 &&
285 unicode->str[0] < 256U &&
286 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000288 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 return -1;
290 }
291
Thomas Wouters477c8d52006-05-27 19:21:47 +0000292 /* We allocate one more byte to make sure the string is Ux0000 terminated.
293 The overallocation is also used by fastsearch, which assumes that it's
294 safe to look at str[length] (without making any assumptions about what
295 it contains). */
296
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000298 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000301 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 PyErr_NoMemory();
303 return -1;
304 }
305 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307
Benjamin Peterson29060642009-01-31 22:14:21 +0000308 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000311 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
318/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000319 Ux0000 terminated; some code (e.g. new_identifier)
320 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000430 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000431 }
432 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000433 *(PyUnicodeObject **)unicode = free_list;
434 free_list = unicode;
435 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436 }
437 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyObject_DEL(unicode->str);
439 Py_XDECREF(unicode->defenc);
440 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441 }
442}
443
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444static
445int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446{
447 register PyUnicodeObject *v;
448
449 /* Argument checks */
450 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 PyErr_BadInternalCall();
452 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000454 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000455 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000456 PyErr_BadInternalCall();
457 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000458 }
459
460 /* Resizing unicode_empty and single character objects is not
461 possible since these are being shared. We simply return a fresh
462 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000463 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000464 (v == unicode_empty || v->length == 1)) {
465 PyUnicodeObject *w = _PyUnicode_New(length);
466 if (w == NULL)
467 return -1;
468 Py_UNICODE_COPY(w->str, v->str,
469 length < v->length ? length : v->length);
470 Py_DECREF(*unicode);
471 *unicode = w;
472 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000473 }
474
475 /* Note that we don't have to modify *unicode for unshared Unicode
476 objects, since we can modify them in-place. */
477 return unicode_resize(v, length);
478}
479
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000480int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
481{
482 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
483}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000487{
488 PyUnicodeObject *unicode;
489
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000490 /* If the Unicode data is known at construction time, we can apply
491 some optimizations which share commonly used objects. */
492 if (u != NULL) {
493
Benjamin Peterson29060642009-01-31 22:14:21 +0000494 /* Optimization for empty strings */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200495 if (size == 0)
496 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +0000497
498 /* Single character Unicode objects in the Latin-1 range are
499 shared when using this constructor */
500 if (size == 1 && *u < 256) {
501 unicode = unicode_latin1[*u];
502 if (!unicode) {
503 unicode = _PyUnicode_New(1);
504 if (!unicode)
505 return NULL;
506 unicode->str[0] = *u;
507 unicode_latin1[*u] = unicode;
508 }
509 Py_INCREF(unicode);
510 return (PyObject *)unicode;
511 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode = _PyUnicode_New(size);
515 if (!unicode)
516 return NULL;
517
518 /* Copy the Unicode data into the new object */
519 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521
522 return (PyObject *)unicode;
523}
524
Walter Dörwaldd2034312007-05-18 16:29:38 +0000525PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000526{
527 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000528
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 if (size < 0) {
530 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000531 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 return NULL;
533 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000534
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000535 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000536 some optimizations which share commonly used objects.
537 Also, this means the input must be UTF-8, so fall back to the
538 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000539 if (u != NULL) {
540
Benjamin Peterson29060642009-01-31 22:14:21 +0000541 /* Optimization for empty strings */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200542 if (size == 0)
543 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +0000544
545 /* Single characters are shared when using this constructor.
546 Restrict to ASCII, since the input must be UTF-8. */
547 if (size == 1 && Py_CHARMASK(*u) < 128) {
548 unicode = unicode_latin1[Py_CHARMASK(*u)];
549 if (!unicode) {
550 unicode = _PyUnicode_New(1);
551 if (!unicode)
552 return NULL;
553 unicode->str[0] = Py_CHARMASK(*u);
554 unicode_latin1[Py_CHARMASK(*u)] = unicode;
555 }
556 Py_INCREF(unicode);
557 return (PyObject *)unicode;
558 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000559
560 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 }
562
Walter Dörwald55507312007-05-18 13:12:10 +0000563 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 if (!unicode)
565 return NULL;
566
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000567 return (PyObject *)unicode;
568}
569
Walter Dörwaldd2034312007-05-18 16:29:38 +0000570PyObject *PyUnicode_FromString(const char *u)
571{
572 size_t size = strlen(u);
573 if (size > PY_SSIZE_T_MAX) {
574 PyErr_SetString(PyExc_OverflowError, "input too long");
575 return NULL;
576 }
577
578 return PyUnicode_FromStringAndSize(u, size);
579}
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson081dfee2009-03-18 14:47:41 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 if (size == 0)
602 return PyUnicode_FromStringAndSize(NULL, 0);
603 PyErr_BadInternalCall();
604 return NULL;
605 }
606
607 if (size == -1) {
608 size = wcslen(w);
609 }
610
611 alloc = size;
612 orig_w = w;
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF)
615 alloc++;
616 w++;
617 }
618 w = orig_w;
619 unicode = _PyUnicode_New(alloc);
620 if (!unicode)
621 return NULL;
622
623 /* Copy the wchar_t data into the new object */
624 {
625 register Py_UNICODE *u;
626 u = PyUnicode_AS_UNICODE(unicode);
627 for (i = size; i > 0; i--) {
628 if (*w > 0xFFFF) {
629 wchar_t ordinal = *w++;
630 ordinal -= 0x10000;
631 *u++ = 0xD800 | (ordinal >> 10);
632 *u++ = 0xDC00 | (ordinal & 0x3FF);
633 }
634 else
635 *u++ = *w++;
636 }
637 }
638 return (PyObject *)unicode;
639}
640
641#else
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000644 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645{
646 PyUnicodeObject *unicode;
647
648 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == 0)
650 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_BadInternalCall();
652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 }
654
Martin v. Löwis790465f2008-04-05 20:41:37 +0000655 if (size == -1) {
656 size = wcslen(w);
657 }
658
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 unicode = _PyUnicode_New(size);
660 if (!unicode)
661 return NULL;
662
663 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000664#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000666#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000668 register Py_UNICODE *u;
669 register Py_ssize_t i;
670 u = PyUnicode_AS_UNICODE(unicode);
671 for (i = size; i > 0; i--)
672 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000673 }
674#endif
675
676 return (PyObject *)unicode;
677}
678
Mark Dickinson081dfee2009-03-18 14:47:41 +0000679#endif /* CONVERT_WCHAR_TO_SURROGATES */
680
681#undef CONVERT_WCHAR_TO_SURROGATES
682
Walter Dörwald346737f2007-05-31 10:44:43 +0000683static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000684makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
685 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000686{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000687 *fmt++ = '%';
688 if (width) {
689 if (zeropad)
690 *fmt++ = '0';
691 fmt += sprintf(fmt, "%d", width);
692 }
693 if (precision)
694 fmt += sprintf(fmt, ".%d", precision);
695 if (longflag)
696 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000697 else if (longlongflag) {
698 /* longlongflag should only ever be nonzero on machines with
699 HAVE_LONG_LONG defined */
700#ifdef HAVE_LONG_LONG
701 char *f = PY_FORMAT_LONG_LONG;
702 while (*f)
703 *fmt++ = *f++;
704#else
705 /* we shouldn't ever get here */
706 assert(0);
707 *fmt++ = 'l';
708#endif
709 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000710 else if (size_tflag) {
711 char *f = PY_FORMAT_SIZE_T;
712 while (*f)
713 *fmt++ = *f++;
714 }
715 *fmt++ = c;
716 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000717}
718
/* Append the NUL-terminated char string `string` at the output cursor.
   The expanding scope must provide two variables: `copy` (a pointer used to
   walk the source) and `s` (the output cursor, advanced past the appended
   characters).  Wrapped in do/while(0) so that "appendstring(x);" behaves as
   a single statement, including inside an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
730
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731PyObject *
732PyUnicode_FromFormatV(const char *format, va_list vargs)
733{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000734 va_list count;
735 Py_ssize_t callcount = 0;
736 PyObject **callresults = NULL;
737 PyObject **callresult = NULL;
738 Py_ssize_t n = 0;
739 int width = 0;
740 int precision = 0;
741 int zeropad;
742 const char* f;
743 Py_UNICODE *s;
744 PyObject *string;
745 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 /* use abuffer instead of buffer, if we need more space
748 * (which can happen if there's a format specifier with width). */
749 char *abuffer = NULL;
750 char *realbuffer;
751 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000752 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000753 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000755 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000756 /* step 1: count the number of %S/%R/%A/%s format specifications
757 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
758 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
759 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000760 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000761 if (*f == '%') {
762 if (*(f+1)=='%')
763 continue;
Victor Stinner2b574a22011-03-01 22:48:49 +0000764 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000765 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000766 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000767 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000768 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 ;
770 if (*f == 's')
771 ++callcount;
772 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 else if (128 <= (unsigned char)*f) {
774 PyErr_Format(PyExc_ValueError,
775 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000776 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000777 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000778 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000780 }
781 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000782 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000783 if (callcount) {
784 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
785 if (!callresults) {
786 PyErr_NoMemory();
787 return NULL;
788 }
789 callresult = callresults;
790 }
791 /* step 3: figure out how large a buffer we need */
792 for (f = format; *f; f++) {
793 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000794#ifdef HAVE_LONG_LONG
795 int longlongflag = 0;
796#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 const char* p = f;
798 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000799 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000801 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000802 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000803
Benjamin Peterson14339b62009-01-31 16:36:08 +0000804 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
805 * they don't affect the amount of space we reserve.
806 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000807 if (*f == 'l') {
808 if (f[1] == 'd' || f[1] == 'u') {
809 ++f;
810 }
811#ifdef HAVE_LONG_LONG
812 else if (f[1] == 'l' &&
813 (f[2] == 'd' || f[2] == 'u')) {
814 longlongflag = 1;
815 f += 2;
816 }
817#endif
818 }
819 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000820 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000821 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000822
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 switch (*f) {
824 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +0000825 {
826#ifndef Py_UNICODE_WIDE
827 int ordinal = va_arg(count, int);
828 if (ordinal > 0xffff)
829 n += 2;
830 else
831 n++;
832#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000833 (void)va_arg(count, int);
Victor Stinner659eb842011-02-23 12:14:22 +0000834 n++;
835#endif
836 break;
837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000838 case '%':
839 n++;
840 break;
841 case 'd': case 'u': case 'i': case 'x':
842 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000843#ifdef HAVE_LONG_LONG
844 if (longlongflag) {
845 if (width < MAX_LONG_LONG_CHARS)
846 width = MAX_LONG_LONG_CHARS;
847 }
848 else
849#endif
850 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
851 including sign. Decimal takes the most space. This
852 isn't enough for octal. If a width is specified we
853 need more (which we allocate later). */
854 if (width < MAX_LONG_CHARS)
855 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000857 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000858 if (abuffersize < width)
859 abuffersize = width;
860 break;
861 case 's':
862 {
863 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000864 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000865 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
866 if (!str)
867 goto fail;
868 n += PyUnicode_GET_SIZE(str);
869 /* Remember the str and switch to the next slot */
870 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000871 break;
872 }
873 case 'U':
874 {
875 PyObject *obj = va_arg(count, PyObject *);
876 assert(obj && PyUnicode_Check(obj));
877 n += PyUnicode_GET_SIZE(obj);
878 break;
879 }
880 case 'V':
881 {
882 PyObject *obj = va_arg(count, PyObject *);
883 const char *str = va_arg(count, const char *);
Victor Stinner2b574a22011-03-01 22:48:49 +0000884 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000885 assert(obj || str);
886 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2b574a22011-03-01 22:48:49 +0000887 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000888 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2b574a22011-03-01 22:48:49 +0000889 *callresult++ = NULL;
890 }
891 else {
892 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
893 if (!str_obj)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str_obj);
896 *callresult++ = str_obj;
897 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000898 break;
899 }
900 case 'S':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *str;
904 assert(obj);
905 str = PyObject_Str(obj);
906 if (!str)
907 goto fail;
908 n += PyUnicode_GET_SIZE(str);
909 /* Remember the str and switch to the next slot */
910 *callresult++ = str;
911 break;
912 }
913 case 'R':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *repr;
917 assert(obj);
918 repr = PyObject_Repr(obj);
919 if (!repr)
920 goto fail;
921 n += PyUnicode_GET_SIZE(repr);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = repr;
924 break;
925 }
926 case 'A':
927 {
928 PyObject *obj = va_arg(count, PyObject *);
929 PyObject *ascii;
930 assert(obj);
931 ascii = PyObject_ASCII(obj);
932 if (!ascii)
933 goto fail;
934 n += PyUnicode_GET_SIZE(ascii);
935 /* Remember the repr and switch to the next slot */
936 *callresult++ = ascii;
937 break;
938 }
939 case 'p':
940 (void) va_arg(count, int);
941 /* maximum 64-bit pointer representation:
942 * 0xffffffffffffffff
943 * so 19 characters is enough.
944 * XXX I count 18 -- what's the extra for?
945 */
946 n += 19;
947 break;
948 default:
949 /* if we stumble upon an unknown
950 formatting code, copy the rest of
951 the format string to the output
952 string. (we cannot just skip the
953 code, since there's no way to know
954 what's in the argument list) */
955 n += strlen(p);
956 goto expand;
957 }
958 } else
959 n++;
960 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000961 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000962 if (abuffersize > ITEM_BUFFER_LEN) {
963 /* add 1 for sprintf's trailing null byte */
964 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000965 if (!abuffer) {
966 PyErr_NoMemory();
967 goto fail;
968 }
969 realbuffer = abuffer;
970 }
971 else
972 realbuffer = buffer;
973 /* step 4: fill the buffer */
974 /* Since we've analyzed how much space we need for the worst case,
975 we don't have to resize the string.
976 There can be no errors beyond this point. */
977 string = PyUnicode_FromUnicode(NULL, n);
978 if (!string)
979 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000980
Benjamin Peterson14339b62009-01-31 16:36:08 +0000981 s = PyUnicode_AS_UNICODE(string);
982 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000983
Benjamin Peterson14339b62009-01-31 16:36:08 +0000984 for (f = format; *f; f++) {
985 if (*f == '%') {
986 const char* p = f++;
987 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 int size_tflag = 0;
990 zeropad = (*f == '0');
991 /* parse the width.precision part */
992 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000993 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000994 width = (width*10) + *f++ - '0';
995 precision = 0;
996 if (*f == '.') {
997 f++;
David Malcolm96960882010-11-05 17:23:41 +0000998 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000999 precision = (precision*10) + *f++ - '0';
1000 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 /* Handle %ld, %lu, %lld and %llu. */
1002 if (*f == 'l') {
1003 if (f[1] == 'd' || f[1] == 'u') {
1004 longflag = 1;
1005 ++f;
1006 }
1007#ifdef HAVE_LONG_LONG
1008 else if (f[1] == 'l' &&
1009 (f[2] == 'd' || f[2] == 'u')) {
1010 longlongflag = 1;
1011 f += 2;
1012 }
1013#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001014 }
1015 /* handle the size_t flag. */
1016 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1017 size_tflag = 1;
1018 ++f;
1019 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 switch (*f) {
1022 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +00001023 {
1024 int ordinal = va_arg(vargs, int);
1025#ifndef Py_UNICODE_WIDE
1026 if (ordinal > 0xffff) {
1027 ordinal -= 0x10000;
1028 *s++ = 0xD800 | (ordinal >> 10);
1029 *s++ = 0xDC00 | (ordinal & 0x3FF);
1030 } else
1031#endif
1032 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 break;
Victor Stinner659eb842011-02-23 12:14:22 +00001034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001036 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1037 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 if (longflag)
1039 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001040#ifdef HAVE_LONG_LONG
1041 else if (longlongflag)
1042 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1043#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 else if (size_tflag)
1045 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1046 else
1047 sprintf(realbuffer, fmt, va_arg(vargs, int));
1048 appendstring(realbuffer);
1049 break;
1050 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001051 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1052 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001053 if (longflag)
1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001055#ifdef HAVE_LONG_LONG
1056 else if (longlongflag)
1057 sprintf(realbuffer, fmt, va_arg(vargs,
1058 unsigned PY_LONG_LONG));
1059#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001060 else if (size_tflag)
1061 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1062 else
1063 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1064 appendstring(realbuffer);
1065 break;
1066 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001067 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 sprintf(realbuffer, fmt, va_arg(vargs, int));
1069 appendstring(realbuffer);
1070 break;
1071 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001072 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 sprintf(realbuffer, fmt, va_arg(vargs, int));
1074 appendstring(realbuffer);
1075 break;
1076 case 's':
1077 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001078 /* unused, since we already have the result */
1079 (void) va_arg(vargs, char *);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1081 PyUnicode_GET_SIZE(*callresult));
1082 s += PyUnicode_GET_SIZE(*callresult);
1083 /* We're done with the unicode()/repr() => forget it */
1084 Py_DECREF(*callresult);
1085 /* switch to next unicode()/repr() result */
1086 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001087 break;
1088 }
1089 case 'U':
1090 {
1091 PyObject *obj = va_arg(vargs, PyObject *);
1092 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1093 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1094 s += size;
1095 break;
1096 }
1097 case 'V':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2b574a22011-03-01 22:48:49 +00001100 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001101 if (obj) {
1102 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1103 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1104 s += size;
1105 } else {
Victor Stinner2b574a22011-03-01 22:48:49 +00001106 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1107 PyUnicode_GET_SIZE(*callresult));
1108 s += PyUnicode_GET_SIZE(*callresult);
1109 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001110 }
Victor Stinner2b574a22011-03-01 22:48:49 +00001111 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001112 break;
1113 }
1114 case 'S':
1115 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001116 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001117 {
1118 Py_UNICODE *ucopy;
1119 Py_ssize_t usize;
1120 Py_ssize_t upos;
1121 /* unused, since we already have the result */
1122 (void) va_arg(vargs, PyObject *);
1123 ucopy = PyUnicode_AS_UNICODE(*callresult);
1124 usize = PyUnicode_GET_SIZE(*callresult);
1125 for (upos = 0; upos<usize;)
1126 *s++ = ucopy[upos++];
1127 /* We're done with the unicode()/repr() => forget it */
1128 Py_DECREF(*callresult);
1129 /* switch to next unicode()/repr() result */
1130 ++callresult;
1131 break;
1132 }
1133 case 'p':
1134 sprintf(buffer, "%p", va_arg(vargs, void*));
1135 /* %p is ill-defined: ensure leading 0x. */
1136 if (buffer[1] == 'X')
1137 buffer[1] = 'x';
1138 else if (buffer[1] != 'x') {
1139 memmove(buffer+2, buffer, strlen(buffer)+1);
1140 buffer[0] = '0';
1141 buffer[1] = 'x';
1142 }
1143 appendstring(buffer);
1144 break;
1145 case '%':
1146 *s++ = '%';
1147 break;
1148 default:
1149 appendstring(p);
1150 goto end;
1151 }
Victor Stinner1205f272010-09-11 00:54:47 +00001152 }
Victor Stinner1205f272010-09-11 00:54:47 +00001153 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 *s++ = *f;
1155 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
Benjamin Peterson29060642009-01-31 22:14:21 +00001157 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 if (callresults)
1159 PyObject_Free(callresults);
1160 if (abuffer)
1161 PyObject_Free(abuffer);
1162 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1163 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001164 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001165 if (callresults) {
1166 PyObject **callresult2 = callresults;
1167 while (callresult2 < callresult) {
Victor Stinner2b574a22011-03-01 22:48:49 +00001168 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001169 ++callresult2;
1170 }
1171 PyObject_Free(callresults);
1172 }
1173 if (abuffer)
1174 PyObject_Free(abuffer);
1175 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001176}
1177
1178#undef appendstring
1179
1180PyObject *
1181PyUnicode_FromFormat(const char *format, ...)
1182{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001183 PyObject* ret;
1184 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001185
1186#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001187 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001188#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001189 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001190#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 ret = PyUnicode_FromFormatV(format, vargs);
1192 va_end(vargs);
1193 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001194}
1195
Victor Stinner5593d8a2010-10-02 11:11:27 +00001196/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1197 convert a Unicode object to a wide character string.
1198
Victor Stinnerd88d9832011-09-06 02:00:05 +02001199 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001200 character) required to convert the unicode object. Ignore size argument.
1201
Victor Stinnerd88d9832011-09-06 02:00:05 +02001202 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001203 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001204 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001205static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001206unicode_aswidechar(PyUnicodeObject *unicode,
1207 wchar_t *w,
1208 Py_ssize_t size)
1209{
1210#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001211 Py_ssize_t res;
1212 if (w != NULL) {
1213 res = PyUnicode_GET_SIZE(unicode);
1214 if (size > res)
1215 size = res + 1;
1216 else
1217 res = size;
1218 memcpy(w, unicode->str, size * sizeof(wchar_t));
1219 return res;
1220 }
1221 else
1222 return PyUnicode_GET_SIZE(unicode) + 1;
1223#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1224 register const Py_UNICODE *u;
1225 const Py_UNICODE *uend;
1226 const wchar_t *worig, *wend;
1227 Py_ssize_t nchar;
1228
Victor Stinner137c34c2010-09-29 10:25:54 +00001229 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001230 uend = u + PyUnicode_GET_SIZE(unicode);
1231 if (w != NULL) {
1232 worig = w;
1233 wend = w + size;
1234 while (u != uend && w != wend) {
1235 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1236 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1237 {
1238 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1239 u += 2;
1240 }
1241 else {
1242 *w = *u;
1243 u++;
1244 }
1245 w++;
1246 }
1247 if (w != wend)
1248 *w = L'\0';
1249 return w - worig;
1250 }
1251 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001252 nchar = 1; /* null character at the end */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001253 while (u != uend) {
1254 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1255 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1256 u += 2;
1257 else
1258 u++;
1259 nchar++;
1260 }
1261 }
1262 return nchar;
1263#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1264 register Py_UNICODE *u, *uend, ordinal;
1265 register Py_ssize_t i;
1266 wchar_t *worig, *wend;
1267 Py_ssize_t nchar;
1268
1269 u = PyUnicode_AS_UNICODE(unicode);
1270 uend = u + PyUnicode_GET_SIZE(u);
1271 if (w != NULL) {
1272 worig = w;
1273 wend = w + size;
1274 while (u != uend && w != wend) {
1275 ordinal = *u;
1276 if (ordinal > 0xffff) {
1277 ordinal -= 0x10000;
1278 *w++ = 0xD800 | (ordinal >> 10);
1279 *w++ = 0xDC00 | (ordinal & 0x3FF);
1280 }
1281 else
1282 *w++ = ordinal;
1283 u++;
1284 }
1285 if (w != wend)
1286 *w = 0;
1287 return w - worig;
1288 }
1289 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001290 nchar = 1; /* null character */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001291 while (u != uend) {
1292 if (*u > 0xffff)
1293 nchar += 2;
1294 else
1295 nchar++;
1296 u++;
1297 }
1298 return nchar;
1299 }
1300#else
1301# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001302#endif
1303}
1304
1305Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001306PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001307 wchar_t *w,
1308 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001309{
1310 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001311 PyErr_BadInternalCall();
1312 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315}
1316
Victor Stinner137c34c2010-09-29 10:25:54 +00001317wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001318PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001319 Py_ssize_t *size)
1320{
1321 wchar_t* buffer;
1322 Py_ssize_t buflen;
1323
1324 if (unicode == NULL) {
1325 PyErr_BadInternalCall();
1326 return NULL;
1327 }
1328
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001329 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001330 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001331 PyErr_NoMemory();
1332 return NULL;
1333 }
1334
Victor Stinner137c34c2010-09-29 10:25:54 +00001335 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1336 if (buffer == NULL) {
1337 PyErr_NoMemory();
1338 return NULL;
1339 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001340 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001341 if (size != NULL)
1342 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 return buffer;
1344}
1345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346#endif
1347
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001348PyObject *PyUnicode_FromOrdinal(int ordinal)
1349{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001350 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001351
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001352 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 PyErr_SetString(PyExc_ValueError,
1354 "chr() arg not in range(0x110000)");
1355 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001356 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001357
1358#ifndef Py_UNICODE_WIDE
1359 if (ordinal > 0xffff) {
1360 ordinal -= 0x10000;
1361 s[0] = 0xD800 | (ordinal >> 10);
1362 s[1] = 0xDC00 | (ordinal & 0x3FF);
1363 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001364 }
1365#endif
1366
Hye-Shik Chang40574832004-04-06 07:24:51 +00001367 s[0] = (Py_UNICODE)ordinal;
1368 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001369}
1370
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371PyObject *PyUnicode_FromObject(register PyObject *obj)
1372{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001373 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 Py_INCREF(obj);
1377 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001378 }
1379 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 /* For a Unicode subtype that's not a Unicode object,
1381 return a true Unicode object with the same data. */
1382 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1383 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001384 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001385 PyErr_Format(PyExc_TypeError,
1386 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001387 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001388 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001389}
1390
1391PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001392 const char *encoding,
1393 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001394{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001395 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001396 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001397
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 PyErr_BadInternalCall();
1400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001402
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001403 /* Decoding bytes objects is the most common case and should be fast */
1404 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02001405 if (PyBytes_GET_SIZE(obj) == 0)
1406 _Py_RETURN_UNICODE_EMPTY();
1407 v = PyUnicode_Decode(
1408 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1409 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001410 return v;
1411 }
1412
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001413 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001414 PyErr_SetString(PyExc_TypeError,
1415 "decoding str is not supported");
1416 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001417 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001418
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001419 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1420 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1421 PyErr_Format(PyExc_TypeError,
1422 "coercing to str: need bytes, bytearray "
1423 "or buffer-like object, %.80s found",
1424 Py_TYPE(obj)->tp_name);
1425 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001426 }
Tim Petersced69f82003-09-16 20:30:58 +00001427
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001428 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02001429 PyBuffer_Release(&buffer);
1430 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001432
Serhiy Storchaka05997252013-01-26 12:14:02 +02001433 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001435 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436}
1437
/* Write a normalized copy of `encoding` into `lower`: upper case letters are
   converted to lower case and '_' is replaced with '-', in order to catch
   e.g. UTF_8.  lower_len is the size of the `lower` buffer including the
   trailing null byte.
   Return 1 on success, 0 if encoding is longer than lower_len-1 (in that
   case `lower` is left without a terminating null byte). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminator */
    char *limit = lower + (lower_len - 1);

    for (; *src; src++, dst++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
1470
1471PyObject *PyUnicode_Decode(const char *s,
1472 Py_ssize_t size,
1473 const char *encoding,
1474 const char *errors)
1475{
1476 PyObject *buffer = NULL, *unicode;
1477 Py_buffer info;
1478 char lower[11]; /* Enough for any encoding shortcut */
1479
1480 if (encoding == NULL)
1481 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001482
1483 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01001484 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Victor Stinner37296e82010-06-10 13:36:23 +00001485 if (strcmp(lower, "utf-8") == 0)
1486 return PyUnicode_DecodeUTF8(s, size, errors);
1487 else if ((strcmp(lower, "latin-1") == 0) ||
1488 (strcmp(lower, "iso-8859-1") == 0))
1489 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001490#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001491 else if (strcmp(lower, "mbcs") == 0)
1492 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001493#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001494 else if (strcmp(lower, "ascii") == 0)
1495 return PyUnicode_DecodeASCII(s, size, errors);
1496 else if (strcmp(lower, "utf-16") == 0)
1497 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1498 else if (strcmp(lower, "utf-32") == 0)
1499 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501
1502 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001503 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001504 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001505 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001506 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 if (buffer == NULL)
1508 goto onError;
1509 unicode = PyCodec_Decode(buffer, encoding, errors);
1510 if (unicode == NULL)
1511 goto onError;
1512 if (!PyUnicode_Check(unicode)) {
1513 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001514 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001515 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 Py_DECREF(unicode);
1517 goto onError;
1518 }
1519 Py_DECREF(buffer);
1520 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001521
Benjamin Peterson29060642009-01-31 22:14:21 +00001522 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523 Py_XDECREF(buffer);
1524 return NULL;
1525}
1526
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001527PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1528 const char *encoding,
1529 const char *errors)
1530{
1531 PyObject *v;
1532
1533 if (!PyUnicode_Check(unicode)) {
1534 PyErr_BadArgument();
1535 goto onError;
1536 }
1537
1538 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001539 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001540
1541 /* Decode via the codec registry */
1542 v = PyCodec_Decode(unicode, encoding, errors);
1543 if (v == NULL)
1544 goto onError;
1545 return v;
1546
Benjamin Peterson29060642009-01-31 22:14:21 +00001547 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001548 return NULL;
1549}
1550
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1552 const char *encoding,
1553 const char *errors)
1554{
1555 PyObject *v;
1556
1557 if (!PyUnicode_Check(unicode)) {
1558 PyErr_BadArgument();
1559 goto onError;
1560 }
1561
1562 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001563 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001564
1565 /* Decode via the codec registry */
1566 v = PyCodec_Decode(unicode, encoding, errors);
1567 if (v == NULL)
1568 goto onError;
1569 if (!PyUnicode_Check(v)) {
1570 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001571 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001572 Py_TYPE(v)->tp_name);
1573 Py_DECREF(v);
1574 goto onError;
1575 }
1576 return v;
1577
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001579 return NULL;
1580}
1581
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001583 Py_ssize_t size,
1584 const char *encoding,
1585 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586{
1587 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001588
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 unicode = PyUnicode_FromUnicode(s, size);
1590 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1593 Py_DECREF(unicode);
1594 return v;
1595}
1596
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001597PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1598 const char *encoding,
1599 const char *errors)
1600{
1601 PyObject *v;
1602
1603 if (!PyUnicode_Check(unicode)) {
1604 PyErr_BadArgument();
1605 goto onError;
1606 }
1607
1608 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001609 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001610
1611 /* Encode via the codec registry */
1612 v = PyCodec_Encode(unicode, encoding, errors);
1613 if (v == NULL)
1614 goto onError;
1615 return v;
1616
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001618 return NULL;
1619}
1620
Victor Stinnerad158722010-10-27 00:25:46 +00001621PyObject *
1622PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001623{
Victor Stinner313a1202010-06-11 23:56:51 +00001624#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001625 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1626 PyUnicode_GET_SIZE(unicode),
1627 NULL);
1628#elif defined(__APPLE__)
1629 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1630 PyUnicode_GET_SIZE(unicode),
1631 "surrogateescape");
1632#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001633 PyInterpreterState *interp = PyThreadState_GET()->interp;
1634 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1635 cannot use it to encode and decode filenames before it is loaded. Load
1636 the Python codec requires to encode at least its own filename. Use the C
1637 version of the locale codec until the codec registry is initialized and
1638 the Python codec is loaded.
1639
1640 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1641 cannot only rely on it: check also interp->fscodec_initialized for
1642 subinterpreters. */
1643 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001644 return PyUnicode_AsEncodedString(unicode,
1645 Py_FileSystemDefaultEncoding,
1646 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001647 }
1648 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001649 /* locale encoding with surrogateescape */
1650 wchar_t *wchar;
1651 char *bytes;
1652 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001653 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001654
1655 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1656 if (wchar == NULL)
1657 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001658 bytes = _Py_wchar2char(wchar, &error_pos);
1659 if (bytes == NULL) {
1660 if (error_pos != (size_t)-1) {
1661 char *errmsg = strerror(errno);
1662 PyObject *exc = NULL;
1663 if (errmsg == NULL)
1664 errmsg = "Py_wchar2char() failed";
1665 raise_encode_exception(&exc,
1666 "filesystemencoding",
1667 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1668 error_pos, error_pos+1,
1669 errmsg);
1670 Py_XDECREF(exc);
1671 }
1672 else
1673 PyErr_NoMemory();
1674 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001675 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001676 }
1677 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001678
1679 bytes_obj = PyBytes_FromString(bytes);
1680 PyMem_Free(bytes);
1681 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001682 }
Victor Stinnerad158722010-10-27 00:25:46 +00001683#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001684}
1685
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1687 const char *encoding,
1688 const char *errors)
1689{
1690 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001691 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001692
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 if (!PyUnicode_Check(unicode)) {
1694 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001695 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 }
Fred Drakee4315f52000-05-09 19:53:39 +00001697
Tim Petersced69f82003-09-16 20:30:58 +00001698 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001699 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001700
1701 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01001702 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Victor Stinner37296e82010-06-10 13:36:23 +00001703 if (strcmp(lower, "utf-8") == 0)
1704 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1705 PyUnicode_GET_SIZE(unicode),
1706 errors);
1707 else if ((strcmp(lower, "latin-1") == 0) ||
1708 (strcmp(lower, "iso-8859-1") == 0))
1709 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1710 PyUnicode_GET_SIZE(unicode),
1711 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001712#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001713 else if (strcmp(lower, "mbcs") == 0)
1714 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1715 PyUnicode_GET_SIZE(unicode),
1716 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001717#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001718 else if (strcmp(lower, "ascii") == 0)
1719 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1720 PyUnicode_GET_SIZE(unicode),
1721 errors);
1722 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001723 /* During bootstrap, we may need to find the encodings
1724 package, to load the file system encoding, and require the
1725 file system encoding in order to load the encodings
1726 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001727
Victor Stinner59e62db2010-05-15 13:14:32 +00001728 Break out of this dependency by assuming that the path to
1729 the encodings module is ASCII-only. XXX could try wcstombs
1730 instead, if the file system encoding is the locale's
1731 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001732 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001733 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1734 !PyThreadState_GET()->interp->codecs_initialized)
1735 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1736 PyUnicode_GET_SIZE(unicode),
1737 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738
1739 /* Encode via the codec registry */
1740 v = PyCodec_Encode(unicode, encoding, errors);
1741 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001742 return NULL;
1743
1744 /* The normal path */
1745 if (PyBytes_Check(v))
1746 return v;
1747
1748 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001749 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001750 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001751 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001752
1753 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1754 "encoder %s returned bytearray instead of bytes",
1755 encoding);
1756 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001757 Py_DECREF(v);
1758 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001759 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001760
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001761 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1762 Py_DECREF(v);
1763 return b;
1764 }
1765
1766 PyErr_Format(PyExc_TypeError,
1767 "encoder did not return a bytes object (type=%.400s)",
1768 Py_TYPE(v)->tp_name);
1769 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001770 return NULL;
1771}
1772
1773PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1774 const char *encoding,
1775 const char *errors)
1776{
1777 PyObject *v;
1778
1779 if (!PyUnicode_Check(unicode)) {
1780 PyErr_BadArgument();
1781 goto onError;
1782 }
1783
1784 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001786
1787 /* Encode via the codec registry */
1788 v = PyCodec_Encode(unicode, encoding, errors);
1789 if (v == NULL)
1790 goto onError;
1791 if (!PyUnicode_Check(v)) {
1792 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001793 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001794 Py_TYPE(v)->tp_name);
1795 Py_DECREF(v);
1796 goto onError;
1797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001799
Benjamin Peterson29060642009-01-31 22:14:21 +00001800 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 return NULL;
1802}
1803
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001804PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001805 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001806{
1807 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001808 if (v)
1809 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001810 if (errors != NULL)
1811 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001812 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001813 PyUnicode_GET_SIZE(unicode),
1814 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001815 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001816 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001817 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001818 return v;
1819}
1820
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001821PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001822PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001823 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001824 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1825}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001826
Christian Heimes5894ba72007-11-04 11:43:14 +00001827PyObject*
1828PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1829{
Victor Stinnerad158722010-10-27 00:25:46 +00001830#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1831 return PyUnicode_DecodeMBCS(s, size, NULL);
1832#elif defined(__APPLE__)
1833 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1834#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001835 PyInterpreterState *interp = PyThreadState_GET()->interp;
1836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1837 cannot use it to encode and decode filenames before it is loaded. Load
1838 the Python codec requires to encode at least its own filename. Use the C
1839 version of the locale codec until the codec registry is initialized and
1840 the Python codec is loaded.
1841
1842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1843 cannot only rely on it: check also interp->fscodec_initialized for
1844 subinterpreters. */
1845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001846 return PyUnicode_Decode(s, size,
1847 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001848 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001849 }
1850 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001851 /* locale encoding with surrogateescape */
1852 wchar_t *wchar;
1853 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001854 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001855
1856 if (s[size] != '\0' || size != strlen(s)) {
1857 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1858 return NULL;
1859 }
1860
Victor Stinner168e1172010-10-16 23:16:16 +00001861 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001862 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001863 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001864
Victor Stinner168e1172010-10-16 23:16:16 +00001865 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001866 PyMem_Free(wchar);
1867 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001868 }
Victor Stinnerad158722010-10-27 00:25:46 +00001869#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001870}
1871
Martin v. Löwis011e8422009-05-05 04:43:17 +00001872
1873int
Antoine Pitrou13348842012-01-29 18:36:34 +01001874_PyUnicode_HasNULChars(PyObject* s)
1875{
1876 static PyObject *nul = NULL;
1877
1878 if (nul == NULL)
1879 nul = PyUnicode_FromStringAndSize("\0", 1);
1880 if (nul == NULL)
1881 return -1;
1882 return PyUnicode_Contains(s, nul);
1883}
1884
1885
1886int
Martin v. Löwis011e8422009-05-05 04:43:17 +00001887PyUnicode_FSConverter(PyObject* arg, void* addr)
1888{
1889 PyObject *output = NULL;
1890 Py_ssize_t size;
1891 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001892 if (arg == NULL) {
1893 Py_DECREF(*(PyObject**)addr);
1894 return 1;
1895 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001896 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001897 output = arg;
1898 Py_INCREF(output);
1899 }
1900 else {
1901 arg = PyUnicode_FromObject(arg);
1902 if (!arg)
1903 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001904 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001905 Py_DECREF(arg);
1906 if (!output)
1907 return 0;
1908 if (!PyBytes_Check(output)) {
1909 Py_DECREF(output);
1910 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1911 return 0;
1912 }
1913 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001914 size = PyBytes_GET_SIZE(output);
1915 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001916 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05001917 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001918 Py_DECREF(output);
1919 return 0;
1920 }
1921 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001922 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001923}
1924
1925
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001926int
1927PyUnicode_FSDecoder(PyObject* arg, void* addr)
1928{
1929 PyObject *output = NULL;
1930 Py_ssize_t size;
1931 void *data;
1932 if (arg == NULL) {
1933 Py_DECREF(*(PyObject**)addr);
1934 return 1;
1935 }
1936 if (PyUnicode_Check(arg)) {
1937 output = arg;
1938 Py_INCREF(output);
1939 }
1940 else {
1941 arg = PyBytes_FromObject(arg);
1942 if (!arg)
1943 return 0;
1944 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1945 PyBytes_GET_SIZE(arg));
1946 Py_DECREF(arg);
1947 if (!output)
1948 return 0;
1949 if (!PyUnicode_Check(output)) {
1950 Py_DECREF(output);
1951 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1952 return 0;
1953 }
1954 }
1955 size = PyUnicode_GET_SIZE(output);
1956 data = PyUnicode_AS_UNICODE(output);
1957 if (size != Py_UNICODE_strlen(data)) {
1958 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1959 Py_DECREF(output);
1960 return 0;
1961 }
1962 *(PyObject**)addr = output;
1963 return Py_CLEANUP_SUPPORTED;
1964}
1965
1966
Martin v. Löwis5b222132007-06-10 09:51:05 +00001967char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001968_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001969{
Christian Heimesf3863112007-11-22 07:46:41 +00001970 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001971 if (!PyUnicode_Check(unicode)) {
1972 PyErr_BadArgument();
1973 return NULL;
1974 }
Christian Heimesf3863112007-11-22 07:46:41 +00001975 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1976 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001977 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001978 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001979 *psize = PyBytes_GET_SIZE(bytes);
1980 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001981}
1982
1983char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001984_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001985{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001986 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001987}
1988
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1990{
1991 if (!PyUnicode_Check(unicode)) {
1992 PyErr_BadArgument();
1993 goto onError;
1994 }
1995 return PyUnicode_AS_UNICODE(unicode);
1996
Benjamin Peterson29060642009-01-31 22:14:21 +00001997 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998 return NULL;
1999}
2000
Martin v. Löwis18e16552006-02-15 17:27:45 +00002001Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002002{
2003 if (!PyUnicode_Check(unicode)) {
2004 PyErr_BadArgument();
2005 goto onError;
2006 }
2007 return PyUnicode_GET_SIZE(unicode);
2008
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 return -1;
2011}
2012
/* Return the name of the default encoding.  The value is a fixed string
   constant, "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2017
Victor Stinner554f3f02010-06-16 23:33:54 +00002018/* create or adjust a UnicodeDecodeError */
2019static void
2020make_decode_exception(PyObject **exceptionObject,
2021 const char *encoding,
2022 const char *input, Py_ssize_t length,
2023 Py_ssize_t startpos, Py_ssize_t endpos,
2024 const char *reason)
2025{
2026 if (*exceptionObject == NULL) {
2027 *exceptionObject = PyUnicodeDecodeError_Create(
2028 encoding, input, length, startpos, endpos, reason);
2029 }
2030 else {
2031 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2032 goto onError;
2033 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2034 goto onError;
2035 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2036 goto onError;
2037 }
2038 return;
2039
2040onError:
2041 Py_DECREF(*exceptionObject);
2042 *exceptionObject = NULL;
2043}
2044
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045/* error handling callback helper:
2046 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002047 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 and adjust various state variables.
2049 return 0 on success, -1 on error
2050*/
2051
2052static
2053int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 const char *encoding, const char *reason,
2055 const char **input, const char **inend, Py_ssize_t *startinpos,
2056 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2057 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002059 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060
2061 PyObject *restuple = NULL;
2062 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002063 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002064 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002065 Py_ssize_t requiredsize;
2066 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002068 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002069 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 int res = -1;
2071
2072 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002073 *errorHandler = PyCodec_LookupError(errors);
2074 if (*errorHandler == NULL)
2075 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 }
2077
Victor Stinner554f3f02010-06-16 23:33:54 +00002078 make_decode_exception(exceptionObject,
2079 encoding,
2080 *input, *inend - *input,
2081 *startinpos, *endinpos,
2082 reason);
2083 if (*exceptionObject == NULL)
2084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085
2086 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2087 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002090 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002092 }
2093 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002094 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002095
2096 /* Copy back the bytes variables, which might have been modified by the
2097 callback */
2098 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2099 if (!inputobj)
2100 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002101 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002102 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002103 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002104 *input = PyBytes_AS_STRING(inputobj);
2105 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002106 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002107 /* we can DECREF safely, as the exception has another reference,
2108 so the object won't go away. */
2109 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002110
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002112 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002113 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002114 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2115 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002116 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117
2118 /* need more space? (at least enough for what we
2119 have+the replacement+the rest of the string (starting
2120 at the new input position), so we won't have to check space
2121 when there are no errors in the rest of the string) */
2122 repptr = PyUnicode_AS_UNICODE(repunicode);
2123 repsize = PyUnicode_GET_SIZE(repunicode);
2124 requiredsize = *outpos + repsize + insize-newpos;
2125 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002126 if (requiredsize<2*outsize)
2127 requiredsize = 2*outsize;
2128 if (_PyUnicode_Resize(output, requiredsize) < 0)
2129 goto onError;
2130 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 }
2132 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002133 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 Py_UNICODE_COPY(*outptr, repptr, repsize);
2135 *outptr += repsize;
2136 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002137
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 /* we made it! */
2139 res = 0;
2140
Benjamin Peterson29060642009-01-31 22:14:21 +00002141 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 Py_XDECREF(restuple);
2143 return res;
2144}
2145
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that these macros (and the
   DECODE_DIRECT/ENCODE_DIRECT macros below) evaluate their `c` argument
   more than once, so it must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c runs first, so the table is only indexed with
 * values 1..127.  directO and directWS are parenthesized so that
 * compound expressions (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002227PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002228 Py_ssize_t size,
2229 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002230{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002231 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2232}
2233
Antoine Pitrou244651a2009-05-04 18:56:13 +00002234/* The decoder. The only state we preserve is our read position,
2235 * i.e. how many characters we have consumed. So if we end in the
2236 * middle of a shift sequence we have to back off the read position
2237 * and the output to the beginning of the sequence, otherwise we lose
2238 * all the shift state (seen bits, number of bits seen, high
2239 * surrogate). */
2240
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002241PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002242 Py_ssize_t size,
2243 const char *errors,
2244 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002245{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002247 Py_ssize_t startinpos;
2248 Py_ssize_t endinpos;
2249 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002250 const char *e;
2251 PyUnicodeObject *unicode;
2252 Py_UNICODE *p;
2253 const char *errmsg = "";
2254 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002255 Py_UNICODE *shiftOutStart;
2256 unsigned int base64bits = 0;
2257 unsigned long base64buffer = 0;
2258 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 PyObject *errorHandler = NULL;
2260 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002261
2262 unicode = _PyUnicode_New(size);
2263 if (!unicode)
2264 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002265 if (size == 0) {
2266 if (consumed)
2267 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002269 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002270
2271 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002272 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002273 e = s + size;
2274
2275 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002277 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002278 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002279
Antoine Pitrou244651a2009-05-04 18:56:13 +00002280 if (inShift) { /* in a base-64 section */
2281 if (IS_BASE64(ch)) { /* consume a base-64 character */
2282 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2283 base64bits += 6;
2284 s++;
2285 if (base64bits >= 16) {
2286 /* we have enough bits for a UTF-16 value */
2287 Py_UNICODE outCh = (Py_UNICODE)
2288 (base64buffer >> (base64bits-16));
2289 base64bits -= 16;
2290 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2291 if (surrogate) {
2292 /* expecting a second surrogate */
2293 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2294#ifdef Py_UNICODE_WIDE
2295 *p++ = (((surrogate & 0x3FF)<<10)
2296 | (outCh & 0x3FF)) + 0x10000;
2297#else
2298 *p++ = surrogate;
2299 *p++ = outCh;
2300#endif
2301 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002302 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002303 }
2304 else {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002305 *p++ = surrogate;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002306 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002307 }
2308 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002309 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002310 /* first surrogate */
2311 surrogate = outCh;
2312 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002313 else {
2314 *p++ = outCh;
2315 }
2316 }
2317 }
2318 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002319 inShift = 0;
2320 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002321 if (surrogate) {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002322 *p++ = surrogate;
2323 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002324 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002325 if (base64bits > 0) { /* left-over bits */
2326 if (base64bits >= 6) {
2327 /* We've seen at least one base-64 character */
2328 errmsg = "partial character in shift sequence";
2329 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002330 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002331 else {
2332 /* Some bits remain; they should be zero */
2333 if (base64buffer != 0) {
2334 errmsg = "non-zero padding bits in shift sequence";
2335 goto utf7Error;
2336 }
2337 }
2338 }
2339 if (ch != '-') {
2340 /* '-' is absorbed; other terminating
2341 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002342 *p++ = ch;
2343 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 }
2345 }
2346 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 s++; /* consume '+' */
2349 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002350 s++;
2351 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002352 }
2353 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002354 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002355 shiftOutStart = p;
2356 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002357 }
2358 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002359 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002360 *p++ = ch;
2361 s++;
2362 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002363 else {
2364 startinpos = s-starts;
2365 s++;
2366 errmsg = "unexpected special character";
2367 goto utf7Error;
2368 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002369 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002370utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 outpos = p-PyUnicode_AS_UNICODE(unicode);
2372 endinpos = s-starts;
2373 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002374 errors, &errorHandler,
2375 "utf7", errmsg,
2376 &starts, &e, &startinpos, &endinpos, &exc, &s,
2377 &unicode, &outpos, &p))
2378 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002379 }
2380
Antoine Pitrou244651a2009-05-04 18:56:13 +00002381 /* end of string */
2382
2383 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2384 /* if we're in an inconsistent state, that's an error */
2385 if (surrogate ||
2386 (base64bits >= 6) ||
2387 (base64bits > 0 && base64buffer != 0)) {
2388 outpos = p-PyUnicode_AS_UNICODE(unicode);
2389 endinpos = size;
2390 if (unicode_decode_call_errorhandler(
2391 errors, &errorHandler,
2392 "utf7", "unterminated shift sequence",
2393 &starts, &e, &startinpos, &endinpos, &exc, &s,
2394 &unicode, &outpos, &p))
2395 goto onError;
2396 if (s < e)
2397 goto restart;
2398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002400
2401 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002402 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002403 if (inShift) {
2404 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002405 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002406 }
2407 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002408 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002409 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002410 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002411
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002412 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002413 goto onError;
2414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417 return (PyObject *)unicode;
2418
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002420 Py_XDECREF(errorHandler);
2421 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002422 Py_DECREF(unicode);
2423 return NULL;
2424}
2425
2426
2427PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002428 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002429 int base64SetO,
2430 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002431 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002432{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002433 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002434 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002435 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002436 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002437 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002438 unsigned int base64bits = 0;
2439 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002440 char * out;
2441 char * start;
2442
2443 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002444 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002445
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002446 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002447 return PyErr_NoMemory();
2448
Antoine Pitrou244651a2009-05-04 18:56:13 +00002449 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002450 if (v == NULL)
2451 return NULL;
2452
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002453 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002454 for (;i < size; ++i) {
2455 Py_UNICODE ch = s[i];
2456
Antoine Pitrou244651a2009-05-04 18:56:13 +00002457 if (inShift) {
2458 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2459 /* shifting out */
2460 if (base64bits) { /* output remaining bits */
2461 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2462 base64buffer = 0;
2463 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002464 }
2465 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002466 /* Characters not in the BASE64 set implicitly unshift the sequence
2467 so no '-' is required, except if the character is itself a '-' */
2468 if (IS_BASE64(ch) || ch == '-') {
2469 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002471 *out++ = (char) ch;
2472 }
2473 else {
2474 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002475 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002476 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002477 else { /* not in a shift sequence */
2478 if (ch == '+') {
2479 *out++ = '+';
2480 *out++ = '-';
2481 }
2482 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2483 *out++ = (char) ch;
2484 }
2485 else {
2486 *out++ = '+';
2487 inShift = 1;
2488 goto encode_char;
2489 }
2490 }
2491 continue;
2492encode_char:
2493#ifdef Py_UNICODE_WIDE
2494 if (ch >= 0x10000) {
2495 /* code first surrogate */
2496 base64bits += 16;
2497 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2498 while (base64bits >= 6) {
2499 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2500 base64bits -= 6;
2501 }
2502 /* prepare second surrogate */
2503 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2504 }
2505#endif
2506 base64bits += 16;
2507 base64buffer = (base64buffer << 16) | ch;
2508 while (base64bits >= 6) {
2509 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2510 base64bits -= 6;
2511 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002512 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002513 if (base64bits)
2514 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2515 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002516 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002517 if (_PyBytes_Resize(&v, out - start) < 0)
2518 return NULL;
2519 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002520}
2521
Antoine Pitrou244651a2009-05-04 18:56:13 +00002522#undef IS_BASE64
2523#undef FROM_BASE64
2524#undef TO_BASE64
2525#undef DECODE_DIRECT
2526#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002527
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528/* --- UTF-8 Codec -------------------------------------------------------- */
2529
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002553 Py_ssize_t size,
2554 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555{
Walter Dörwald69652032004-09-07 20:24:22 +00002556 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2557}
2558
Antoine Pitrouab868312009-01-10 15:40:25 +00002559/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2560#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2561
2562/* Mask to quickly check whether a C 'long' contains a
2563 non-ASCII, UTF8-encoded char. */
2564#if (SIZEOF_LONG == 8)
2565# define ASCII_CHAR_MASK 0x8080808080808080L
2566#elif (SIZEOF_LONG == 4)
2567# define ASCII_CHAR_MASK 0x80808080L
2568#else
2569# error C 'long' size should be either 4 or 8!
2570#endif
2571
Walter Dörwald69652032004-09-07 20:24:22 +00002572PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002573 Py_ssize_t size,
2574 const char *errors,
2575 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002579 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002580 Py_ssize_t startinpos;
2581 Py_ssize_t endinpos;
2582 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002583 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 PyUnicodeObject *unicode;
2585 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002586 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587 PyObject *errorHandler = NULL;
2588 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589
2590 /* Note: size will always be longer than the resulting Unicode
2591 character count */
2592 unicode = _PyUnicode_New(size);
2593 if (!unicode)
2594 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002595 if (size == 0) {
2596 if (consumed)
2597 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600
2601 /* Unpack UTF-8 encoded data */
2602 p = unicode->str;
2603 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002604 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
2606 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002607 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608
2609 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002610 /* Fast path for runs of ASCII characters. Given that common UTF-8
2611 input will consist of an overwhelming majority of ASCII
2612 characters, we try to optimize for this case by checking
2613 as many characters as a C 'long' can contain.
2614 First, check if we can do an aligned read, as most CPUs have
2615 a penalty for unaligned reads.
2616 */
2617 if (!((size_t) s & LONG_PTR_MASK)) {
2618 /* Help register allocation */
2619 register const char *_s = s;
2620 register Py_UNICODE *_p = p;
2621 while (_s < aligned_end) {
2622 /* Read a whole long at a time (either 4 or 8 bytes),
2623 and do a fast unrolled copy if it only contains ASCII
2624 characters. */
2625 unsigned long data = *(unsigned long *) _s;
2626 if (data & ASCII_CHAR_MASK)
2627 break;
2628 _p[0] = (unsigned char) _s[0];
2629 _p[1] = (unsigned char) _s[1];
2630 _p[2] = (unsigned char) _s[2];
2631 _p[3] = (unsigned char) _s[3];
2632#if (SIZEOF_LONG == 8)
2633 _p[4] = (unsigned char) _s[4];
2634 _p[5] = (unsigned char) _s[5];
2635 _p[6] = (unsigned char) _s[6];
2636 _p[7] = (unsigned char) _s[7];
2637#endif
2638 _s += SIZEOF_LONG;
2639 _p += SIZEOF_LONG;
2640 }
2641 s = _s;
2642 p = _p;
2643 if (s == e)
2644 break;
2645 ch = (unsigned char)*s;
2646 }
2647 }
2648
2649 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002650 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 s++;
2652 continue;
2653 }
2654
2655 n = utf8_code_length[ch];
2656
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002657 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002658 if (consumed)
2659 break;
2660 else {
2661 errmsg = "unexpected end of data";
2662 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002663 endinpos = startinpos+1;
2664 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2665 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002666 goto utf8Error;
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669
2670 switch (n) {
2671
2672 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002673 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002674 startinpos = s-starts;
2675 endinpos = startinpos+1;
2676 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677
2678 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002679 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 startinpos = s-starts;
2681 endinpos = startinpos+1;
2682 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683
2684 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002685 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002686 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002688 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 goto utf8Error;
2690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002692 assert ((ch > 0x007F) && (ch <= 0x07FF));
2693 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 break;
2695
2696 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002697 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2698 will result in surrogates in range d800-dfff. Surrogates are
2699 not valid UTF-8 so they are rejected.
2700 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2701 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002702 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002703 (s[2] & 0xc0) != 0x80 ||
2704 ((unsigned char)s[0] == 0xE0 &&
2705 (unsigned char)s[1] < 0xA0) ||
2706 ((unsigned char)s[0] == 0xED &&
2707 (unsigned char)s[1] > 0x9F)) {
2708 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002710 endinpos = startinpos + 1;
2711
2712 /* if s[1] first two bits are 1 and 0, then the invalid
2713 continuation byte is s[2], so increment endinpos by 1,
2714 if not, s[1] is invalid and endinpos doesn't need to
2715 be incremented. */
2716 if ((s[1] & 0xC0) == 0x80)
2717 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 goto utf8Error;
2719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002721 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2722 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002723 break;
2724
2725 case 4:
2726 if ((s[1] & 0xc0) != 0x80 ||
2727 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002728 (s[3] & 0xc0) != 0x80 ||
2729 ((unsigned char)s[0] == 0xF0 &&
2730 (unsigned char)s[1] < 0x90) ||
2731 ((unsigned char)s[0] == 0xF4 &&
2732 (unsigned char)s[1] > 0x8F)) {
2733 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002734 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002735 endinpos = startinpos + 1;
2736 if ((s[1] & 0xC0) == 0x80) {
2737 endinpos++;
2738 if ((s[2] & 0xC0) == 0x80)
2739 endinpos++;
2740 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002741 goto utf8Error;
2742 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002743 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002744 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2745 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2746
Fredrik Lundh8f455852001-06-27 18:59:43 +00002747#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002749#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002750 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002751
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002752 /* translate from 10000..10FFFF to 0..FFFF */
2753 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002754
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002755 /* high surrogate = top 10 bits added to D800 */
2756 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002757
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002758 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002759 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002760#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
2763 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002765
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 utf8Error:
2767 outpos = p-PyUnicode_AS_UNICODE(unicode);
2768 if (unicode_decode_call_errorhandler(
2769 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01002770 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 &starts, &e, &startinpos, &endinpos, &exc, &s,
2772 &unicode, &outpos, &p))
2773 goto onError;
2774 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 }
Walter Dörwald69652032004-09-07 20:24:22 +00002776 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002777 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778
2779 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002780 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 goto onError;
2782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002783 Py_XDECREF(errorHandler);
2784 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 return (PyObject *)unicode;
2786
Benjamin Peterson29060642009-01-31 22:14:21 +00002787 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788 Py_XDECREF(errorHandler);
2789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 Py_DECREF(unicode);
2791 return NULL;
2792}
2793
Antoine Pitrouab868312009-01-10 15:40:25 +00002794#undef ASCII_CHAR_MASK
2795
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded (invalid start byte, bad
   continuation byte, overlong or surrogate encoding, or a sequence cut
   off by the end of the input) is emitted as the single code unit
   0xDC00 + byte and decoding resumes at the following byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_Free() to free the memory), or NULL on memory
   allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        /* ch keeps the value of the lead byte until a sequence has been
           fully validated; the surrogateescape label relies on that. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the output buffer holds wchar_t, not Py_UNICODE */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* escape just the offending lead byte and retry from the next one */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002911
Tim Peters602f7402002-04-27 18:03:26 +00002912/* Allocation strategy: if the string is short, convert into a stack buffer
2913 and allocate exactly as much space needed at the end. Else allocate the
2914 maximum possible needed (4 result bytes per Unicode character), and return
2915 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002916*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002917PyObject *
2918PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 Py_ssize_t size,
2920 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921{
Tim Peters602f7402002-04-27 18:03:26 +00002922#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002923
Guido van Rossum98297ee2007-11-06 21:34:58 +00002924 Py_ssize_t i; /* index into s of next input byte */
2925 PyObject *result; /* result string object */
2926 char *p; /* next free byte in output buffer */
2927 Py_ssize_t nallocated; /* number of result bytes allocated */
2928 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002929 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002930 PyObject *errorHandler = NULL;
2931 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002932
Tim Peters602f7402002-04-27 18:03:26 +00002933 assert(s != NULL);
2934 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
Tim Peters602f7402002-04-27 18:03:26 +00002936 if (size <= MAX_SHORT_UNICHARS) {
2937 /* Write into the stack buffer; nallocated can't overflow.
2938 * At the end, we'll allocate exactly as much heap space as it
2939 * turns out we need.
2940 */
2941 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002942 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002943 p = stackbuf;
2944 }
2945 else {
2946 /* Overallocate on the heap, and give the excess back at the end. */
2947 nallocated = size * 4;
2948 if (nallocated / 4 != size) /* overflow! */
2949 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002950 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002951 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002952 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002953 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002954 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002955
Tim Peters602f7402002-04-27 18:03:26 +00002956 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002957 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002958
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002959 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002960 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002962
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002964 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002965 *p++ = (char)(0xc0 | (ch >> 6));
2966 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002967 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002968#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002969 /* Special case: check for high and low surrogate */
2970 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2971 Py_UCS4 ch2 = s[i];
2972 /* Combine the two surrogates to form a UCS4 value */
2973 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2974 i++;
2975
2976 /* Encode UCS4 Unicode ordinals */
2977 *p++ = (char)(0xf0 | (ch >> 18));
2978 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002979 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2980 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002981 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002982#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002983 Py_ssize_t newpos;
2984 PyObject *rep;
2985 Py_ssize_t repsize, k;
2986 rep = unicode_encode_call_errorhandler
2987 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2988 s, size, &exc, i-1, i, &newpos);
2989 if (!rep)
2990 goto error;
2991
2992 if (PyBytes_Check(rep))
2993 repsize = PyBytes_GET_SIZE(rep);
2994 else
2995 repsize = PyUnicode_GET_SIZE(rep);
2996
2997 if (repsize > 4) {
2998 Py_ssize_t offset;
2999
3000 if (result == NULL)
3001 offset = p - stackbuf;
3002 else
3003 offset = p - PyBytes_AS_STRING(result);
3004
3005 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3006 /* integer overflow */
3007 PyErr_NoMemory();
3008 goto error;
3009 }
3010 nallocated += repsize - 4;
3011 if (result != NULL) {
3012 if (_PyBytes_Resize(&result, nallocated) < 0)
3013 goto error;
3014 } else {
3015 result = PyBytes_FromStringAndSize(NULL, nallocated);
3016 if (result == NULL)
3017 goto error;
3018 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3019 }
3020 p = PyBytes_AS_STRING(result) + offset;
3021 }
3022
3023 if (PyBytes_Check(rep)) {
3024 char *prep = PyBytes_AS_STRING(rep);
3025 for(k = repsize; k > 0; k--)
3026 *p++ = *prep++;
3027 } else /* rep is unicode */ {
3028 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3029 Py_UNICODE c;
3030
3031 for(k=0; k<repsize; k++) {
3032 c = prep[k];
3033 if (0x80 <= c) {
3034 raise_encode_exception(&exc, "utf-8", s, size,
3035 i-1, i, "surrogates not allowed");
3036 goto error;
3037 }
3038 *p++ = (char)prep[k];
3039 }
3040 }
3041 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003042#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003043 }
Victor Stinner445a6232010-04-22 20:01:57 +00003044#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003045 } else if (ch < 0x10000) {
3046 *p++ = (char)(0xe0 | (ch >> 12));
3047 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3048 *p++ = (char)(0x80 | (ch & 0x3f));
3049 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003050 /* Encode UCS4 Unicode ordinals */
3051 *p++ = (char)(0xf0 | (ch >> 18));
3052 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3053 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3054 *p++ = (char)(0x80 | (ch & 0x3f));
3055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003057
Guido van Rossum98297ee2007-11-06 21:34:58 +00003058 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003059 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003060 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003061 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003062 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003063 }
3064 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003065 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003066 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003067 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003068 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003069 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003070 Py_XDECREF(errorHandler);
3071 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003072 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003073 error:
3074 Py_XDECREF(errorHandler);
3075 Py_XDECREF(exc);
3076 Py_XDECREF(result);
3077 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003078
Tim Peters602f7402002-04-27 18:03:26 +00003079#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080}
3081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3083{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 if (!PyUnicode_Check(unicode)) {
3085 PyErr_BadArgument();
3086 return NULL;
3087 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003088 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 PyUnicode_GET_SIZE(unicode),
3090 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091}
3092
Walter Dörwald41980ca2007-08-16 21:55:45 +00003093/* --- UTF-32 Codec ------------------------------------------------------- */
3094
3095PyObject *
3096PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 Py_ssize_t size,
3098 const char *errors,
3099 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003100{
3101 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3102}
3103
3104PyObject *
3105PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003106 Py_ssize_t size,
3107 const char *errors,
3108 int *byteorder,
3109 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003110{
3111 const char *starts = s;
3112 Py_ssize_t startinpos;
3113 Py_ssize_t endinpos;
3114 Py_ssize_t outpos;
3115 PyUnicodeObject *unicode;
3116 Py_UNICODE *p;
3117#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003118 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003119 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003120#else
3121 const int pairs = 0;
3122#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003123 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003124 int bo = 0; /* assume native ordering by default */
3125 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003126 /* Offsets from q for retrieving bytes in the right order. */
3127#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3128 int iorder[] = {0, 1, 2, 3};
3129#else
3130 int iorder[] = {3, 2, 1, 0};
3131#endif
3132 PyObject *errorHandler = NULL;
3133 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003134
Walter Dörwald41980ca2007-08-16 21:55:45 +00003135 q = (unsigned char *)s;
3136 e = q + size;
3137
3138 if (byteorder)
3139 bo = *byteorder;
3140
3141 /* Check for BOM marks (U+FEFF) in the input and adjust current
3142 byte order setting accordingly. In native mode, the leading BOM
3143 mark is skipped, in all other modes, it is copied to the output
3144 stream as-is (giving a ZWNBSP character). */
3145 if (bo == 0) {
3146 if (size >= 4) {
3147 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003148 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003149#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 if (bom == 0x0000FEFF) {
3151 q += 4;
3152 bo = -1;
3153 }
3154 else if (bom == 0xFFFE0000) {
3155 q += 4;
3156 bo = 1;
3157 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003158#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003159 if (bom == 0x0000FEFF) {
3160 q += 4;
3161 bo = 1;
3162 }
3163 else if (bom == 0xFFFE0000) {
3164 q += 4;
3165 bo = -1;
3166 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003167#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003169 }
3170
3171 if (bo == -1) {
3172 /* force LE */
3173 iorder[0] = 0;
3174 iorder[1] = 1;
3175 iorder[2] = 2;
3176 iorder[3] = 3;
3177 }
3178 else if (bo == 1) {
3179 /* force BE */
3180 iorder[0] = 3;
3181 iorder[1] = 2;
3182 iorder[2] = 1;
3183 iorder[3] = 0;
3184 }
3185
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003186 /* On narrow builds we split characters outside the BMP into two
3187 codepoints => count how much extra space we need. */
3188#ifndef Py_UNICODE_WIDE
Serhiy Storchakadec798e2013-01-08 22:45:42 +02003189 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003190 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3191 pairs++;
3192#endif
3193
3194 /* This might be one to much, because of a BOM */
3195 unicode = _PyUnicode_New((size+3)/4+pairs);
3196 if (!unicode)
3197 return NULL;
3198 if (size == 0)
3199 return (PyObject *)unicode;
3200
3201 /* Unpack UTF-32 encoded data */
3202 p = unicode->str;
3203
Walter Dörwald41980ca2007-08-16 21:55:45 +00003204 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 Py_UCS4 ch;
3206 /* remaining bytes at the end? (size should be divisible by 4) */
3207 if (e-q<4) {
3208 if (consumed)
3209 break;
3210 errmsg = "truncated data";
3211 startinpos = ((const char *)q)-starts;
3212 endinpos = ((const char *)e)-starts;
3213 goto utf32Error;
3214 /* The remaining input chars are ignored if the callback
3215 chooses to skip the input */
3216 }
3217 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3218 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003219
Benjamin Peterson29060642009-01-31 22:14:21 +00003220 if (ch >= 0x110000)
3221 {
3222 errmsg = "codepoint not in range(0x110000)";
3223 startinpos = ((const char *)q)-starts;
3224 endinpos = startinpos+4;
3225 goto utf32Error;
3226 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003227#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 if (ch >= 0x10000)
3229 {
3230 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3231 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3232 }
3233 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003234#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 *p++ = ch;
3236 q += 4;
3237 continue;
3238 utf32Error:
3239 outpos = p-PyUnicode_AS_UNICODE(unicode);
3240 if (unicode_decode_call_errorhandler(
3241 errors, &errorHandler,
3242 "utf32", errmsg,
3243 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3244 &unicode, &outpos, &p))
3245 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003246 }
3247
3248 if (byteorder)
3249 *byteorder = bo;
3250
3251 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003252 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003253
3254 /* Adjust length */
3255 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3256 goto onError;
3257
3258 Py_XDECREF(errorHandler);
3259 Py_XDECREF(exc);
3260 return (PyObject *)unicode;
3261
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003263 Py_DECREF(unicode);
3264 Py_XDECREF(errorHandler);
3265 Py_XDECREF(exc);
3266 return NULL;
3267}
3268
3269PyObject *
3270PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003271 Py_ssize_t size,
3272 const char *errors,
3273 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003274{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003275 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003276 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003277 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003278#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003279 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003280#else
3281 const int pairs = 0;
3282#endif
3283 /* Offsets from p for storing byte pairs in the right order. */
3284#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3285 int iorder[] = {0, 1, 2, 3};
3286#else
3287 int iorder[] = {3, 2, 1, 0};
3288#endif
3289
Benjamin Peterson29060642009-01-31 22:14:21 +00003290#define STORECHAR(CH) \
3291 do { \
3292 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3293 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3294 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3295 p[iorder[0]] = (CH) & 0xff; \
3296 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297 } while(0)
3298
3299 /* In narrow builds we can output surrogate pairs as one codepoint,
3300 so we need less space. */
3301#ifndef Py_UNICODE_WIDE
3302 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3304 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3305 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003306#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003307 nsize = (size - pairs + (byteorder == 0));
3308 bytesize = nsize * 4;
3309 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003311 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003312 if (v == NULL)
3313 return NULL;
3314
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003315 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003316 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003317 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003318 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003319 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320
3321 if (byteorder == -1) {
3322 /* force LE */
3323 iorder[0] = 0;
3324 iorder[1] = 1;
3325 iorder[2] = 2;
3326 iorder[3] = 3;
3327 }
3328 else if (byteorder == 1) {
3329 /* force BE */
3330 iorder[0] = 3;
3331 iorder[1] = 2;
3332 iorder[2] = 1;
3333 iorder[3] = 0;
3334 }
3335
3336 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003338#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003339 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3340 Py_UCS4 ch2 = *s;
3341 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3342 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3343 s++;
3344 size--;
3345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003346 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003347#endif
3348 STORECHAR(ch);
3349 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003350
3351 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003352 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003353#undef STORECHAR
3354}
3355
3356PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3357{
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
3362 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 PyUnicode_GET_SIZE(unicode),
3364 NULL,
3365 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003366}
3367
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368/* --- UTF-16 Codec ------------------------------------------------------- */
3369
Tim Peters772747b2001-08-09 22:21:55 +00003370PyObject *
3371PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 Py_ssize_t size,
3373 const char *errors,
3374 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375{
Walter Dörwald69652032004-09-07 20:24:22 +00003376 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3377}
3378
Antoine Pitrouab868312009-01-10 15:40:25 +00003379/* Two masks for fast checking of whether a C 'long' may contain
3380 UTF16-encoded surrogate characters. This is an efficient heuristic,
3381 assuming that non-surrogate characters with a code point >= 0x8000 are
3382 rare in most input.
3383 FAST_CHAR_MASK is used when the input is in native byte ordering,
3384 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003385*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003386#if (SIZEOF_LONG == 8)
3387# define FAST_CHAR_MASK 0x8000800080008000L
3388# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3389#elif (SIZEOF_LONG == 4)
3390# define FAST_CHAR_MASK 0x80008000L
3391# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3392#else
3393# error C 'long' size should be either 4 or 8!
3394#endif
3395
Walter Dörwald69652032004-09-07 20:24:22 +00003396PyObject *
3397PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003398 Py_ssize_t size,
3399 const char *errors,
3400 int *byteorder,
3401 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003402{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003404 Py_ssize_t startinpos;
3405 Py_ssize_t endinpos;
3406 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 PyUnicodeObject *unicode;
3408 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003409 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003410 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003411 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003412 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003413 /* Offsets from q for retrieving byte pairs in the right order. */
3414#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3415 int ihi = 1, ilo = 0;
3416#else
3417 int ihi = 0, ilo = 1;
3418#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 PyObject *errorHandler = NULL;
3420 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421
3422 /* Note: size will always be longer than the resulting Unicode
3423 character count */
3424 unicode = _PyUnicode_New(size);
3425 if (!unicode)
3426 return NULL;
3427 if (size == 0)
3428 return (PyObject *)unicode;
3429
3430 /* Unpack UTF-16 encoded data */
3431 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003432 q = (unsigned char *)s;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003433 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434
3435 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003436 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003438 /* Check for BOM marks (U+FEFF) in the input and adjust current
3439 byte order setting accordingly. In native mode, the leading BOM
3440 mark is skipped, in all other modes, it is copied to the output
3441 stream as-is (giving a ZWNBSP character). */
3442 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003443 if (size >= 2) {
3444 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 if (bom == 0xFEFF) {
3447 q += 2;
3448 bo = -1;
3449 }
3450 else if (bom == 0xFFFE) {
3451 q += 2;
3452 bo = 1;
3453 }
Tim Petersced69f82003-09-16 20:30:58 +00003454#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 if (bom == 0xFEFF) {
3456 q += 2;
3457 bo = 1;
3458 }
3459 else if (bom == 0xFFFE) {
3460 q += 2;
3461 bo = -1;
3462 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003463#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466
Tim Peters772747b2001-08-09 22:21:55 +00003467 if (bo == -1) {
3468 /* force LE */
3469 ihi = 1;
3470 ilo = 0;
3471 }
3472 else if (bo == 1) {
3473 /* force BE */
3474 ihi = 0;
3475 ilo = 1;
3476 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003477#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3478 native_ordering = ilo < ihi;
3479#else
3480 native_ordering = ilo > ihi;
3481#endif
Tim Peters772747b2001-08-09 22:21:55 +00003482
Antoine Pitrouab868312009-01-10 15:40:25 +00003483 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003484 while (1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003485 Py_UNICODE ch;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003486 if (e - q < 2) {
3487 /* remaining byte at the end? (size should be even) */
3488 if (q == e || consumed)
3489 break;
3490 errmsg = "truncated data";
3491 startinpos = ((const char *)q) - starts;
3492 endinpos = ((const char *)e) - starts;
3493 outpos = p - PyUnicode_AS_UNICODE(unicode);
3494 goto utf16Error;
3495 /* The remaining input chars are ignored if the callback
3496 chooses to skip the input */
3497 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003498 /* First check for possible aligned read of a C 'long'. Unaligned
3499 reads are more expensive, better to defer to another iteration. */
3500 if (!((size_t) q & LONG_PTR_MASK)) {
3501 /* Fast path for runs of non-surrogate chars. */
3502 register const unsigned char *_q = q;
3503 Py_UNICODE *_p = p;
3504 if (native_ordering) {
3505 /* Native ordering is simple: as long as the input cannot
3506 possibly contain a surrogate char, do an unrolled copy
3507 of several 16-bit code points to the target object.
3508 The non-surrogate check is done on several input bytes
3509 at a time (as many as a C 'long' can contain). */
3510 while (_q < aligned_end) {
3511 unsigned long data = * (unsigned long *) _q;
3512 if (data & FAST_CHAR_MASK)
3513 break;
3514 _p[0] = ((unsigned short *) _q)[0];
3515 _p[1] = ((unsigned short *) _q)[1];
3516#if (SIZEOF_LONG == 8)
3517 _p[2] = ((unsigned short *) _q)[2];
3518 _p[3] = ((unsigned short *) _q)[3];
3519#endif
3520 _q += SIZEOF_LONG;
3521 _p += SIZEOF_LONG / 2;
3522 }
3523 }
3524 else {
3525 /* Byteswapped ordering is similar, but we must decompose
3526 the copy bytewise, and take care of zero'ing out the
3527 upper bytes if the target object is in 32-bit units
3528 (that is, in UCS-4 builds). */
3529 while (_q < aligned_end) {
3530 unsigned long data = * (unsigned long *) _q;
3531 if (data & SWAPPED_FAST_CHAR_MASK)
3532 break;
3533 /* Zero upper bytes in UCS-4 builds */
3534#if (Py_UNICODE_SIZE > 2)
3535 _p[0] = 0;
3536 _p[1] = 0;
3537#if (SIZEOF_LONG == 8)
3538 _p[2] = 0;
3539 _p[3] = 0;
3540#endif
3541#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003542 /* Issue #4916; UCS-4 builds on big endian machines must
3543 fill the two last bytes of each 4-byte unit. */
3544#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3545# define OFF 2
3546#else
3547# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003548#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003549 ((unsigned char *) _p)[OFF + 1] = _q[0];
3550 ((unsigned char *) _p)[OFF + 0] = _q[1];
3551 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3552 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3553#if (SIZEOF_LONG == 8)
3554 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3555 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3556 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3557 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3558#endif
3559#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003560 _q += SIZEOF_LONG;
3561 _p += SIZEOF_LONG / 2;
3562 }
3563 }
3564 p = _p;
3565 q = _q;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003566 if (e - q < 2)
3567 continue;
Antoine Pitrouab868312009-01-10 15:40:25 +00003568 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570
Benjamin Peterson14339b62009-01-31 16:36:08 +00003571 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003572
3573 if (ch < 0xD800 || ch > 0xDFFF) {
3574 *p++ = ch;
3575 continue;
3576 }
3577
3578 /* UTF-16 code pair: */
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003579 if (e - q < 2) {
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02003580 q -= 2;
3581 if (consumed)
3582 break;
Benjamin Peterson29060642009-01-31 22:14:21 +00003583 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02003584 startinpos = ((const char *)q) - starts;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003585 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00003586 goto utf16Error;
3587 }
3588 if (0xD800 <= ch && ch <= 0xDBFF) {
3589 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3590 q += 2;
3591 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003592#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003593 *p++ = ch;
3594 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003595#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003597#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 continue;
3599 }
3600 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003601 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 startinpos = (((const char *)q)-4)-starts;
3603 endinpos = startinpos+2;
3604 goto utf16Error;
3605 }
3606
Benjamin Peterson14339b62009-01-31 16:36:08 +00003607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003608 errmsg = "illegal encoding";
3609 startinpos = (((const char *)q)-2)-starts;
3610 endinpos = startinpos+2;
3611 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003612
Benjamin Peterson29060642009-01-31 22:14:21 +00003613 utf16Error:
3614 outpos = p - PyUnicode_AS_UNICODE(unicode);
3615 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003616 errors,
3617 &errorHandler,
3618 "utf16", errmsg,
3619 &starts,
3620 (const char **)&e,
3621 &startinpos,
3622 &endinpos,
3623 &exc,
3624 (const char **)&q,
3625 &unicode,
3626 &outpos,
3627 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 goto onError;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003629 /* Update data because unicode_decode_call_errorhandler might have
3630 changed the input object. */
3631 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Antoine Pitrouab868312009-01-10 15:40:25 +00003632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633
3634 if (byteorder)
3635 *byteorder = bo;
3636
Walter Dörwald69652032004-09-07 20:24:22 +00003637 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003639
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003641 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 goto onError;
3643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 Py_XDECREF(errorHandler);
3645 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 return (PyObject *)unicode;
3647
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 Py_XDECREF(errorHandler);
3651 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 return NULL;
3653}
3654
Antoine Pitrouab868312009-01-10 15:40:25 +00003655#undef FAST_CHAR_MASK
3656#undef SWAPPED_FAST_CHAR_MASK
3657
Tim Peters772747b2001-08-09 22:21:55 +00003658PyObject *
3659PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 Py_ssize_t size,
3661 const char *errors,
3662 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003664 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003665 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003666 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003667#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003668 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003669#else
3670 const int pairs = 0;
3671#endif
Tim Peters772747b2001-08-09 22:21:55 +00003672 /* Offsets from p for storing byte pairs in the right order. */
3673#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3674 int ihi = 1, ilo = 0;
3675#else
3676 int ihi = 0, ilo = 1;
3677#endif
3678
Benjamin Peterson29060642009-01-31 22:14:21 +00003679#define STORECHAR(CH) \
3680 do { \
3681 p[ihi] = ((CH) >> 8) & 0xff; \
3682 p[ilo] = (CH) & 0xff; \
3683 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003684 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003686#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003687 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 if (s[i] >= 0x10000)
3689 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003690#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003691 /* 2 * (size + pairs + (byteorder == 0)) */
3692 if (size > PY_SSIZE_T_MAX ||
3693 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003695 nsize = size + pairs + (byteorder == 0);
3696 bytesize = nsize * 2;
3697 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003699 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 if (v == NULL)
3701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003703 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003706 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003707 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003708
3709 if (byteorder == -1) {
3710 /* force LE */
3711 ihi = 1;
3712 ilo = 0;
3713 }
3714 else if (byteorder == 1) {
3715 /* force BE */
3716 ihi = 0;
3717 ilo = 1;
3718 }
3719
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003720 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 Py_UNICODE ch = *s++;
3722 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003723#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003724 if (ch >= 0x10000) {
3725 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3726 ch = 0xD800 | ((ch-0x10000) >> 10);
3727 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003728#endif
Tim Peters772747b2001-08-09 22:21:55 +00003729 STORECHAR(ch);
3730 if (ch2)
3731 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003732 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003733
3734 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003735 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003736#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737}
3738
3739PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3740{
3741 if (!PyUnicode_Check(unicode)) {
3742 PyErr_BadArgument();
3743 return NULL;
3744 }
3745 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 PyUnicode_GET_SIZE(unicode),
3747 NULL,
3748 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749}
3750
3751/* --- Unicode Escape Codec ----------------------------------------------- */
3752
Fredrik Lundh06d12682001-01-24 07:59:11 +00003753static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003754
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 Py_ssize_t size,
3757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003760 Py_ssize_t startinpos;
3761 Py_ssize_t endinpos;
3762 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003766 char* message;
3767 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 PyObject *errorHandler = NULL;
3769 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003770
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 /* Escaped strings will always be longer than the resulting
3772 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003773 length after conversion to the true value.
3774 (but if the error callback returns a long replacement string
3775 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 v = _PyUnicode_New(size);
3777 if (v == NULL)
3778 goto onError;
3779 if (size == 0)
3780 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003784
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 while (s < end) {
3786 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003787 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003788 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789
3790 /* Non-escape characters are interpreted as Unicode ordinals */
3791 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003792 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 continue;
3794 }
3795
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 /* \ - Escapes */
3798 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003799 c = *s++;
3800 if (s > end)
3801 c = '\0'; /* Invalid after \ */
3802 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 case '\n': break;
3806 case '\\': *p++ = '\\'; break;
3807 case '\'': *p++ = '\''; break;
3808 case '\"': *p++ = '\"'; break;
3809 case 'b': *p++ = '\b'; break;
3810 case 'f': *p++ = '\014'; break; /* FF */
3811 case 't': *p++ = '\t'; break;
3812 case 'n': *p++ = '\n'; break;
3813 case 'r': *p++ = '\r'; break;
3814 case 'v': *p++ = '\013'; break; /* VT */
3815 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3816
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 case '0': case '1': case '2': case '3':
3819 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003820 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003821 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003822 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003823 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003824 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003826 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 break;
3828
Benjamin Peterson29060642009-01-31 22:14:21 +00003829 /* hex escapes */
3830 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003832 digits = 2;
3833 message = "truncated \\xXX escape";
3834 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835
Benjamin Peterson29060642009-01-31 22:14:21 +00003836 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003838 digits = 4;
3839 message = "truncated \\uXXXX escape";
3840 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003843 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003844 digits = 8;
3845 message = "truncated \\UXXXXXXXX escape";
3846 hexescape:
3847 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02003848 if (end - s < digits) {
3849 /* count only hex digits */
3850 for (; s < end; ++s) {
3851 c = (unsigned char)*s;
3852 if (!Py_ISXDIGIT(c))
3853 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003854 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02003855 goto error;
3856 }
3857 for (; digits--; ++s) {
3858 c = (unsigned char)*s;
3859 if (!Py_ISXDIGIT(c))
3860 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003861 chr = (chr<<4) & ~0xF;
3862 if (c >= '0' && c <= '9')
3863 chr += c - '0';
3864 else if (c >= 'a' && c <= 'f')
3865 chr += 10 + c - 'a';
3866 else
3867 chr += 10 + c - 'A';
3868 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003869 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 /* _decoding_error will have already written into the
3871 target buffer. */
3872 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003873 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003874 /* when we get here, chr is a 32-bit unicode character */
3875 if (chr <= 0xffff)
3876 /* UCS-2 character */
3877 *p++ = (Py_UNICODE) chr;
3878 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003879 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003880 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003881#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003882 *p++ = chr;
3883#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003884 chr -= 0x10000L;
3885 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003886 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003887#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003888 } else {
Serhiy Storchakad6793772013-01-29 10:20:44 +02003889 message = "illegal Unicode character";
3890 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003891 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003892 break;
3893
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003895 case 'N':
3896 message = "malformed \\N character escape";
3897 if (ucnhash_CAPI == NULL) {
3898 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003899 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003900 if (ucnhash_CAPI == NULL)
3901 goto ucnhashError;
3902 }
3903 if (*s == '{') {
3904 const char *start = s+1;
3905 /* look for the closing brace */
3906 while (*s != '}' && s < end)
3907 s++;
3908 if (s > start && s < end && *s == '}') {
3909 /* found a name. look it up in the unicode database */
3910 message = "unknown Unicode character name";
3911 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02003912 if (s - start - 1 <= INT_MAX &&
3913 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003914 goto store;
3915 }
3916 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02003917 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003918
3919 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003920 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 message = "\\ at end of string";
3922 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02003923 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00003924 }
3925 else {
3926 *p++ = '\\';
3927 *p++ = (unsigned char)s[-1];
3928 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003929 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02003931 continue;
3932
3933 error:
3934 endinpos = s-starts;
3935 outpos = p-PyUnicode_AS_UNICODE(v);
3936 if (unicode_decode_call_errorhandler(
3937 errors, &errorHandler,
3938 "unicodeescape", message,
3939 &starts, &end, &startinpos, &endinpos, &exc, &s,
3940 &v, &outpos, &p))
3941 goto onError;
3942 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003944 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003946 Py_XDECREF(errorHandler);
3947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003949
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003951 PyErr_SetString(
3952 PyExc_UnicodeError,
3953 "\\N escapes not supported (can't load unicodedata module)"
3954 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003955 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 Py_XDECREF(errorHandler);
3957 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003958 return NULL;
3959
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 Py_XDECREF(errorHandler);
3963 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 return NULL;
3965}
3966
/* Return a Unicode-Escape encoded version of the Unicode data.

   The result is never enclosed in quotes: the encoder below takes only
   the character buffer and its length, and has no `quotes` argument.

*/
3973
Thomas Wouters477c8d52006-05-27 19:21:47 +00003974Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 Py_ssize_t size,
3976 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003977{
3978 /* like wcschr, but doesn't stop at NULL characters */
3979
3980 while (size-- > 0) {
3981 if (*s == ch)
3982 return s;
3983 s++;
3984 }
3985
3986 return NULL;
3987}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003988
Walter Dörwald79e913e2007-05-12 11:08:06 +00003989static const char *hexdigits = "0123456789abcdef";
3990
3991PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003994 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003997#ifdef Py_UNICODE_WIDE
3998 const Py_ssize_t expandsize = 10;
3999#else
4000 const Py_ssize_t expandsize = 6;
4001#endif
4002
Thomas Wouters89f507f2006-12-13 04:49:30 +00004003 /* XXX(nnorwitz): rather than over-allocating, it would be
4004 better to choose a different scheme. Perhaps scan the
4005 first N-chars of the string and allocate based on that size.
4006 */
4007 /* Initial allocation is based on the longest-possible unichr
4008 escape.
4009
4010 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4011 unichr, so in this case it's the longest unichr escape. In
4012 narrow (UTF-16) builds this is five chars per source unichr
4013 since there are two unichrs in the surrogate pair, so in narrow
4014 (UTF-16) builds it's not the longest unichr escape.
4015
4016 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4017 so in the narrow (UTF-16) build case it's the longest unichr
4018 escape.
4019 */
4020
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004021 if (size == 0)
4022 return PyBytes_FromStringAndSize(NULL, 0);
4023
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004024 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004026
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004027 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 2
4029 + expandsize*size
4030 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 if (repr == NULL)
4032 return NULL;
4033
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004034 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 while (size-- > 0) {
4037 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004038
Walter Dörwald79e913e2007-05-12 11:08:06 +00004039 /* Escape backslashes */
4040 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 *p++ = '\\';
4042 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004043 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004044 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004045
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004046#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004047 /* Map 21-bit characters to '\U00xxxxxx' */
4048 else if (ch >= 0x10000) {
4049 *p++ = '\\';
4050 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004051 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4052 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4053 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4054 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4055 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4056 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4057 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4058 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004060 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004061#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4063 else if (ch >= 0xD800 && ch < 0xDC00) {
4064 Py_UNICODE ch2;
4065 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004066
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 ch2 = *s++;
4068 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004069 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4071 *p++ = '\\';
4072 *p++ = 'U';
4073 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4074 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4075 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4076 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4077 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4078 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4079 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4080 *p++ = hexdigits[ucs & 0x0000000F];
4081 continue;
4082 }
4083 /* Fall through: isolated surrogates are copied as-is */
4084 s--;
4085 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004086 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004087#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004088
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004090 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 *p++ = '\\';
4092 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004093 *p++ = hexdigits[(ch >> 12) & 0x000F];
4094 *p++ = hexdigits[(ch >> 8) & 0x000F];
4095 *p++ = hexdigits[(ch >> 4) & 0x000F];
4096 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004098
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004099 /* Map special whitespace to '\t', \n', '\r' */
4100 else if (ch == '\t') {
4101 *p++ = '\\';
4102 *p++ = 't';
4103 }
4104 else if (ch == '\n') {
4105 *p++ = '\\';
4106 *p++ = 'n';
4107 }
4108 else if (ch == '\r') {
4109 *p++ = '\\';
4110 *p++ = 'r';
4111 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004112
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004113 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004114 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004116 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004117 *p++ = hexdigits[(ch >> 4) & 0x000F];
4118 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004119 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004120
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 /* Copy everything else as-is */
4122 else
4123 *p++ = (char) ch;
4124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004126 assert(p - PyBytes_AS_STRING(repr) > 0);
4127 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4128 return NULL;
4129 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130}
4131
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004132PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004134 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 if (!PyUnicode_Check(unicode)) {
4136 PyErr_BadArgument();
4137 return NULL;
4138 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004139 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4140 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004141 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142}
4143
4144/* --- Raw Unicode Escape Codec ------------------------------------------- */
4145
4146PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 Py_ssize_t size,
4148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004151 Py_ssize_t startinpos;
4152 Py_ssize_t endinpos;
4153 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 const char *end;
4157 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 PyObject *errorHandler = NULL;
4159 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004160
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161 /* Escaped strings will always be longer than the resulting
4162 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 length after conversion to the true value. (But decoding error
4164 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 v = _PyUnicode_New(size);
4166 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 end = s + size;
4172 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 unsigned char c;
4174 Py_UCS4 x;
4175 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004176 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 /* Non-escape characters are interpreted as Unicode ordinals */
4179 if (*s != '\\') {
4180 *p++ = (unsigned char)*s++;
4181 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004182 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 startinpos = s-starts;
4184
4185 /* \u-escapes are only interpreted iff the number of leading
4186 backslashes if odd */
4187 bs = s;
4188 for (;s < end;) {
4189 if (*s != '\\')
4190 break;
4191 *p++ = (unsigned char)*s++;
4192 }
4193 if (((s - bs) & 1) == 0 ||
4194 s >= end ||
4195 (*s != 'u' && *s != 'U')) {
4196 continue;
4197 }
4198 p--;
4199 count = *s=='u' ? 4 : 8;
4200 s++;
4201
4202 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4203 outpos = p-PyUnicode_AS_UNICODE(v);
4204 for (x = 0, i = 0; i < count; ++i, ++s) {
4205 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004206 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 endinpos = s-starts;
4208 if (unicode_decode_call_errorhandler(
4209 errors, &errorHandler,
4210 "rawunicodeescape", "truncated \\uXXXX",
4211 &starts, &end, &startinpos, &endinpos, &exc, &s,
4212 &v, &outpos, &p))
4213 goto onError;
4214 goto nextByte;
4215 }
4216 x = (x<<4) & ~0xF;
4217 if (c >= '0' && c <= '9')
4218 x += c - '0';
4219 else if (c >= 'a' && c <= 'f')
4220 x += 10 + c - 'a';
4221 else
4222 x += 10 + c - 'A';
4223 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004224 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 /* UCS-2 character */
4226 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004227 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 /* UCS-4 character. Either store directly, or as
4229 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004230#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004232#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 x -= 0x10000L;
4234 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4235 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004236#endif
4237 } else {
4238 endinpos = s-starts;
4239 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004240 if (unicode_decode_call_errorhandler(
4241 errors, &errorHandler,
4242 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 &starts, &end, &startinpos, &endinpos, &exc, &s,
4244 &v, &outpos, &p))
4245 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004246 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 nextByte:
4248 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004250 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 Py_XDECREF(errorHandler);
4253 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004255
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 Py_XDECREF(errorHandler);
4259 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 return NULL;
4261}
4262
4263PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004264 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004266 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267 char *p;
4268 char *q;
4269
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004270#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004271 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004272#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004273 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004274#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004275
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004276 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004278
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004279 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 if (repr == NULL)
4281 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004282 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004283 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004285 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 while (size-- > 0) {
4287 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004288#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 /* Map 32-bit characters to '\Uxxxxxxxx' */
4290 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004291 *p++ = '\\';
4292 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004293 *p++ = hexdigits[(ch >> 28) & 0xf];
4294 *p++ = hexdigits[(ch >> 24) & 0xf];
4295 *p++ = hexdigits[(ch >> 20) & 0xf];
4296 *p++ = hexdigits[(ch >> 16) & 0xf];
4297 *p++ = hexdigits[(ch >> 12) & 0xf];
4298 *p++ = hexdigits[(ch >> 8) & 0xf];
4299 *p++ = hexdigits[(ch >> 4) & 0xf];
4300 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004301 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004302 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004303#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4305 if (ch >= 0xD800 && ch < 0xDC00) {
4306 Py_UNICODE ch2;
4307 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004308
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 ch2 = *s++;
4310 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004311 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4313 *p++ = '\\';
4314 *p++ = 'U';
4315 *p++ = hexdigits[(ucs >> 28) & 0xf];
4316 *p++ = hexdigits[(ucs >> 24) & 0xf];
4317 *p++ = hexdigits[(ucs >> 20) & 0xf];
4318 *p++ = hexdigits[(ucs >> 16) & 0xf];
4319 *p++ = hexdigits[(ucs >> 12) & 0xf];
4320 *p++ = hexdigits[(ucs >> 8) & 0xf];
4321 *p++ = hexdigits[(ucs >> 4) & 0xf];
4322 *p++ = hexdigits[ucs & 0xf];
4323 continue;
4324 }
4325 /* Fall through: isolated surrogates are copied as-is */
4326 s--;
4327 size++;
4328 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004329#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004330 /* Map 16-bit characters to '\uxxxx' */
4331 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 *p++ = '\\';
4333 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004334 *p++ = hexdigits[(ch >> 12) & 0xf];
4335 *p++ = hexdigits[(ch >> 8) & 0xf];
4336 *p++ = hexdigits[(ch >> 4) & 0xf];
4337 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 /* Copy everything else as-is */
4340 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 *p++ = (char) ch;
4342 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004343 size = p - q;
4344
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004345 assert(size > 0);
4346 if (_PyBytes_Resize(&repr, size) < 0)
4347 return NULL;
4348 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349}
4350
4351PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4352{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004353 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004355 PyErr_BadArgument();
4356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004358 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4359 PyUnicode_GET_SIZE(unicode));
4360
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004361 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362}
4363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004364/* --- Unicode Internal Codec ------------------------------------------- */
4365
4366PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 Py_ssize_t size,
4368 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004369{
4370 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004371 Py_ssize_t startinpos;
4372 Py_ssize_t endinpos;
4373 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004374 PyUnicodeObject *v;
4375 Py_UNICODE *p;
4376 const char *end;
4377 const char *reason;
4378 PyObject *errorHandler = NULL;
4379 PyObject *exc = NULL;
4380
Neal Norwitzd43069c2006-01-08 01:12:10 +00004381#ifdef Py_UNICODE_WIDE
4382 Py_UNICODE unimax = PyUnicode_GetMax();
4383#endif
4384
Thomas Wouters89f507f2006-12-13 04:49:30 +00004385 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004386 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4387 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004389 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004391 p = PyUnicode_AS_UNICODE(v);
4392 end = s + size;
4393
4394 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004395 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004396 /* We have to sanity check the raw data, otherwise doom looms for
4397 some malformed UCS-4 data. */
4398 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004399#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004400 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004401#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004402 end-s < Py_UNICODE_SIZE
4403 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004405 startinpos = s - starts;
4406 if (end-s < Py_UNICODE_SIZE) {
4407 endinpos = end-starts;
4408 reason = "truncated input";
4409 }
4410 else {
4411 endinpos = s - starts + Py_UNICODE_SIZE;
4412 reason = "illegal code point (> 0x10FFFF)";
4413 }
4414 outpos = p - PyUnicode_AS_UNICODE(v);
4415 if (unicode_decode_call_errorhandler(
4416 errors, &errorHandler,
4417 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004418 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004419 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004420 goto onError;
4421 }
4422 }
4423 else {
4424 p++;
4425 s += Py_UNICODE_SIZE;
4426 }
4427 }
4428
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004429 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004430 goto onError;
4431 Py_XDECREF(errorHandler);
4432 Py_XDECREF(exc);
4433 return (PyObject *)v;
4434
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004436 Py_XDECREF(v);
4437 Py_XDECREF(errorHandler);
4438 Py_XDECREF(exc);
4439 return NULL;
4440}
4441
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442/* --- Latin-1 Codec ------------------------------------------------------ */
4443
4444PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 Py_ssize_t size,
4446 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447{
4448 PyUnicodeObject *v;
4449 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004450 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004451
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004453 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 Py_UNICODE r = *(unsigned char*)s;
4455 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004456 }
4457
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 v = _PyUnicode_New(size);
4459 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004464 e = s + size;
4465 /* Unrolling the copy makes it much faster by reducing the looping
4466 overhead. This is similar to what many memcpy() implementations do. */
4467 unrolled_end = e - 4;
4468 while (s < unrolled_end) {
4469 p[0] = (unsigned char) s[0];
4470 p[1] = (unsigned char) s[1];
4471 p[2] = (unsigned char) s[2];
4472 p[3] = (unsigned char) s[3];
4473 s += 4;
4474 p += 4;
4475 }
4476 while (s < e)
4477 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 Py_XDECREF(v);
4482 return NULL;
4483}
4484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485/* create or adjust a UnicodeEncodeError */
4486static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 const char *encoding,
4488 const Py_UNICODE *unicode, Py_ssize_t size,
4489 Py_ssize_t startpos, Py_ssize_t endpos,
4490 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 *exceptionObject = PyUnicodeEncodeError_Create(
4494 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 }
4496 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4498 goto onError;
4499 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4500 goto onError;
4501 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4502 goto onError;
4503 return;
4504 onError:
4505 Py_DECREF(*exceptionObject);
4506 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 }
4508}
4509
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510/* raises a UnicodeEncodeError */
4511static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 const char *encoding,
4513 const Py_UNICODE *unicode, Py_ssize_t size,
4514 Py_ssize_t startpos, Py_ssize_t endpos,
4515 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516{
4517 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521}
4522
4523/* error handling callback helper:
4524 build arguments, call the callback and check the arguments,
4525 put the result into newpos and return the replacement string, which
4526 has to be freed by the caller */
4527static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 PyObject **errorHandler,
4529 const char *encoding, const char *reason,
4530 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4531 Py_ssize_t startpos, Py_ssize_t endpos,
4532 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004534 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535
4536 PyObject *restuple;
4537 PyObject *resunicode;
4538
4539 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004543 }
4544
4545 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549
4550 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004555 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 Py_DECREF(restuple);
4557 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004559 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 &resunicode, newpos)) {
4561 Py_DECREF(restuple);
4562 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004564 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4565 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4566 Py_DECREF(restuple);
4567 return NULL;
4568 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004571 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004572 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4573 Py_DECREF(restuple);
4574 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004575 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 Py_INCREF(resunicode);
4577 Py_DECREF(restuple);
4578 return resunicode;
4579}
4580
4581static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 Py_ssize_t size,
4583 const char *errors,
4584 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585{
4586 /* output object */
4587 PyObject *res;
4588 /* pointers to the beginning and end+1 of input */
4589 const Py_UNICODE *startp = p;
4590 const Py_UNICODE *endp = p + size;
4591 /* pointer to the beginning of the unencodable characters */
4592 /* const Py_UNICODE *badp = NULL; */
4593 /* pointer into the output */
4594 char *str;
4595 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004596 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004597 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4598 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 PyObject *errorHandler = NULL;
4600 PyObject *exc = NULL;
4601 /* the following variable is used for caching string comparisons
4602 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4603 int known_errorHandler = -1;
4604
4605 /* allocate enough for a simple encoding without
4606 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004607 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004608 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004609 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004611 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004612 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 ressize = size;
4614
4615 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 /* can we encode this? */
4619 if (c<limit) {
4620 /* no overflow check, because we know that the space is enough */
4621 *str++ = (char)c;
4622 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 else {
4625 Py_ssize_t unicodepos = p-startp;
4626 Py_ssize_t requiredsize;
4627 PyObject *repunicode;
4628 Py_ssize_t repsize;
4629 Py_ssize_t newpos;
4630 Py_ssize_t respos;
4631 Py_UNICODE *uni2;
4632 /* startpos for collecting unencodable chars */
4633 const Py_UNICODE *collstart = p;
4634 const Py_UNICODE *collend = p;
4635 /* find all unecodable characters */
4636 while ((collend < endp) && ((*collend)>=limit))
4637 ++collend;
4638 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4639 if (known_errorHandler==-1) {
4640 if ((errors==NULL) || (!strcmp(errors, "strict")))
4641 known_errorHandler = 1;
4642 else if (!strcmp(errors, "replace"))
4643 known_errorHandler = 2;
4644 else if (!strcmp(errors, "ignore"))
4645 known_errorHandler = 3;
4646 else if (!strcmp(errors, "xmlcharrefreplace"))
4647 known_errorHandler = 4;
4648 else
4649 known_errorHandler = 0;
4650 }
4651 switch (known_errorHandler) {
4652 case 1: /* strict */
4653 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4654 goto onError;
4655 case 2: /* replace */
4656 while (collstart++<collend)
4657 *str++ = '?'; /* fall through */
4658 case 3: /* ignore */
4659 p = collend;
4660 break;
4661 case 4: /* xmlcharrefreplace */
4662 respos = str - PyBytes_AS_STRING(res);
4663 /* determine replacement size (temporarily (mis)uses p) */
4664 for (p = collstart, repsize = 0; p < collend; ++p) {
4665 if (*p<10)
4666 repsize += 2+1+1;
4667 else if (*p<100)
4668 repsize += 2+2+1;
4669 else if (*p<1000)
4670 repsize += 2+3+1;
4671 else if (*p<10000)
4672 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004673#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 else
4675 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004676#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 else if (*p<100000)
4678 repsize += 2+5+1;
4679 else if (*p<1000000)
4680 repsize += 2+6+1;
4681 else
4682 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004683#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 }
4685 requiredsize = respos+repsize+(endp-collend);
4686 if (requiredsize > ressize) {
4687 if (requiredsize<2*ressize)
4688 requiredsize = 2*ressize;
4689 if (_PyBytes_Resize(&res, requiredsize))
4690 goto onError;
4691 str = PyBytes_AS_STRING(res) + respos;
4692 ressize = requiredsize;
4693 }
4694 /* generate replacement (temporarily (mis)uses p) */
4695 for (p = collstart; p < collend; ++p) {
4696 str += sprintf(str, "&#%d;", (int)*p);
4697 }
4698 p = collend;
4699 break;
4700 default:
4701 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4702 encoding, reason, startp, size, &exc,
4703 collstart-startp, collend-startp, &newpos);
4704 if (repunicode == NULL)
4705 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004706 if (PyBytes_Check(repunicode)) {
4707 /* Directly copy bytes result to output. */
4708 repsize = PyBytes_Size(repunicode);
4709 if (repsize > 1) {
4710 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004711 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004712 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4713 Py_DECREF(repunicode);
4714 goto onError;
4715 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004716 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004717 ressize += repsize-1;
4718 }
4719 memcpy(str, PyBytes_AsString(repunicode), repsize);
4720 str += repsize;
4721 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004722 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004723 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004724 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 /* need more space? (at least enough for what we
4726 have+the replacement+the rest of the string, so
4727 we won't have to check space for encodable characters) */
4728 respos = str - PyBytes_AS_STRING(res);
4729 repsize = PyUnicode_GET_SIZE(repunicode);
4730 requiredsize = respos+repsize+(endp-collend);
4731 if (requiredsize > ressize) {
4732 if (requiredsize<2*ressize)
4733 requiredsize = 2*ressize;
4734 if (_PyBytes_Resize(&res, requiredsize)) {
4735 Py_DECREF(repunicode);
4736 goto onError;
4737 }
4738 str = PyBytes_AS_STRING(res) + respos;
4739 ressize = requiredsize;
4740 }
4741 /* check if there is anything unencodable in the replacement
4742 and copy it to the output */
4743 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4744 c = *uni2;
4745 if (c >= limit) {
4746 raise_encode_exception(&exc, encoding, startp, size,
4747 unicodepos, unicodepos+1, reason);
4748 Py_DECREF(repunicode);
4749 goto onError;
4750 }
4751 *str = (char)c;
4752 }
4753 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004754 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004755 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004756 }
4757 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004758 /* Resize if we allocated to much */
4759 size = str - PyBytes_AS_STRING(res);
4760 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004761 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004762 if (_PyBytes_Resize(&res, size) < 0)
4763 goto onError;
4764 }
4765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 Py_XDECREF(errorHandler);
4767 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004768 return res;
4769
4770 onError:
4771 Py_XDECREF(res);
4772 Py_XDECREF(errorHandler);
4773 Py_XDECREF(exc);
4774 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775}
4776
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 Py_ssize_t size,
4779 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782}
4783
4784PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4785{
4786 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 PyErr_BadArgument();
4788 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 }
4790 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 PyUnicode_GET_SIZE(unicode),
4792 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
4795/* --- 7-bit ASCII Codec -------------------------------------------------- */
4796
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 Py_ssize_t size,
4799 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 PyUnicodeObject *v;
4803 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004804 Py_ssize_t startinpos;
4805 Py_ssize_t endinpos;
4806 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 const char *e;
4808 PyObject *errorHandler = NULL;
4809 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004810
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004812 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 Py_UNICODE r = *(unsigned char*)s;
4814 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004815 }
Tim Petersced69f82003-09-16 20:30:58 +00004816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 v = _PyUnicode_New(size);
4818 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 e = s + size;
4824 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 register unsigned char c = (unsigned char)*s;
4826 if (c < 128) {
4827 *p++ = c;
4828 ++s;
4829 }
4830 else {
4831 startinpos = s-starts;
4832 endinpos = startinpos + 1;
4833 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4834 if (unicode_decode_call_errorhandler(
4835 errors, &errorHandler,
4836 "ascii", "ordinal not in range(128)",
4837 &starts, &e, &startinpos, &endinpos, &exc, &s,
4838 &v, &outpos, &p))
4839 goto onError;
4840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004842 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 Py_XDECREF(errorHandler);
4846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004848
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 Py_XDECREF(errorHandler);
4852 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 return NULL;
4854}
4855
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 Py_ssize_t size,
4858 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861}
4862
4863PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4864{
4865 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 PyErr_BadArgument();
4867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 }
4869 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 PyUnicode_GET_SIZE(unicode),
4871 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872}
4873
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004874#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004875
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004876/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004877
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004878#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004879#define NEED_RETRY
4880#endif
4881
4882/* XXX This code is limited to "true" double-byte encodings, as
4883 a) it assumes an incomplete character consists of a single byte, and
4884 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004886
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must be accepted by IsDBCSLeadByte().  Because a trail byte can
   have the same value as a lead byte, the preceding character (as located
   by CharPrev()) is inspected as well: the answer is 1 when CharPrev()
   does not move, when the preceding character does not start with a lead
   byte, or when the preceding character is two bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4897
4898/*
4899 * Decode MBCS string into unicode object. If 'final' is set, converts
4900 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4901 */
4902static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 const char *s, /* MBCS string */
4904 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004905 int final,
4906 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004907{
4908 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004909 Py_ssize_t n;
4910 DWORD usize;
4911 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004912
4913 assert(size >= 0);
4914
Victor Stinner554f3f02010-06-16 23:33:54 +00004915 /* check and handle 'errors' arg */
4916 if (errors==NULL || strcmp(errors, "strict")==0)
4917 flags = MB_ERR_INVALID_CHARS;
4918 else if (strcmp(errors, "ignore")==0)
4919 flags = 0;
4920 else {
4921 PyErr_Format(PyExc_ValueError,
4922 "mbcs encoding does not support errors='%s'",
4923 errors);
4924 return -1;
4925 }
4926
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004927 /* Skip trailing lead-byte unless 'final' is set */
4928 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004930
4931 /* First get the size of the result */
4932 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004933 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4934 if (usize==0)
4935 goto mbcs_decode_error;
4936 } else
4937 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004938
4939 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 /* Create unicode object */
4941 *v = _PyUnicode_New(usize);
4942 if (*v == NULL)
4943 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004944 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004945 }
4946 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 /* Extend unicode object */
4948 n = PyUnicode_GET_SIZE(*v);
4949 if (_PyUnicode_Resize(v, n + usize) < 0)
4950 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004951 }
4952
4953 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004954 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004956 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4957 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004959 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004960 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004961
4962mbcs_decode_error:
4963 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4964 we raise a UnicodeDecodeError - else it is a 'generic'
4965 windows error
4966 */
4967 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4968 /* Ideally, we should get reason from FormatMessage - this
4969 is the Windows 2000 English version of the message
4970 */
4971 PyObject *exc = NULL;
4972 const char *reason = "No mapping for the Unicode character exists "
4973 "in the target multi-byte code page.";
4974 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4975 if (exc != NULL) {
4976 PyCodec_StrictErrors(exc);
4977 Py_DECREF(exc);
4978 }
4979 } else {
4980 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4981 }
4982 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004983}
4984
4985PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 Py_ssize_t size,
4987 const char *errors,
4988 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004989{
4990 PyUnicodeObject *v = NULL;
4991 int done;
4992
4993 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004995
4996#ifdef NEED_RETRY
4997 retry:
4998 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004999 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000 else
5001#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005002 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005003
5004 if (done < 0) {
5005 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005007 }
5008
5009 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011
5012#ifdef NEED_RETRY
5013 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 s += done;
5015 size -= done;
5016 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005017 }
5018#endif
5019
5020 return (PyObject *)v;
5021}
5022
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005023PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 Py_ssize_t size,
5025 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005026{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005027 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5028}
5029
5030/*
5031 * Convert unicode into string object (MBCS).
5032 * Returns 0 if succeed, -1 otherwise.
5033 */
5034static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005036 int size, /* size of unicode */
5037 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005038{
Victor Stinner554f3f02010-06-16 23:33:54 +00005039 BOOL usedDefaultChar = FALSE;
5040 BOOL *pusedDefaultChar;
5041 int mbcssize;
5042 Py_ssize_t n;
5043 PyObject *exc = NULL;
5044 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005045
5046 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005047
Victor Stinner554f3f02010-06-16 23:33:54 +00005048 /* check and handle 'errors' arg */
5049 if (errors==NULL || strcmp(errors, "strict")==0) {
5050 flags = WC_NO_BEST_FIT_CHARS;
5051 pusedDefaultChar = &usedDefaultChar;
5052 } else if (strcmp(errors, "replace")==0) {
5053 flags = 0;
5054 pusedDefaultChar = NULL;
5055 } else {
5056 PyErr_Format(PyExc_ValueError,
5057 "mbcs encoding does not support errors='%s'",
5058 errors);
5059 return -1;
5060 }
5061
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005062 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005063 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005064 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5065 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 if (mbcssize == 0) {
5067 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5068 return -1;
5069 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005070 /* If we used a default char, then we failed! */
5071 if (pusedDefaultChar && *pusedDefaultChar)
5072 goto mbcs_encode_error;
5073 } else {
5074 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005075 }
5076
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005077 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 /* Create string object */
5079 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5080 if (*repr == NULL)
5081 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005082 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005083 }
5084 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 /* Extend string object */
5086 n = PyBytes_Size(*repr);
5087 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5088 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005089 }
5090
5091 /* Do the conversion */
5092 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005094 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5095 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5097 return -1;
5098 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005099 if (pusedDefaultChar && *pusedDefaultChar)
5100 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005101 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005102 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005103
5104mbcs_encode_error:
5105 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5106 Py_XDECREF(exc);
5107 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005108}
5109
5110PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 Py_ssize_t size,
5112 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005113{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005114 PyObject *repr = NULL;
5115 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005116
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005117#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005119 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005120 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005121 else
5122#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005123 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005124
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005125 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 Py_XDECREF(repr);
5127 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005128 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005129
5130#ifdef NEED_RETRY
5131 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 p += INT_MAX;
5133 size -= INT_MAX;
5134 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005135 }
5136#endif
5137
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005138 return repr;
5139}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005140
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005141PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5142{
5143 if (!PyUnicode_Check(unicode)) {
5144 PyErr_BadArgument();
5145 return NULL;
5146 }
5147 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 PyUnicode_GET_SIZE(unicode),
5149 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005150}
5151
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005152#undef NEED_RETRY
5153
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005154#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005155
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156/* --- Character Mapping Codec -------------------------------------------- */
5157
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 Py_ssize_t size,
5160 PyObject *mapping,
5161 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005163 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005164 Py_ssize_t startinpos;
5165 Py_ssize_t endinpos;
5166 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005167 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 PyUnicodeObject *v;
5169 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171 PyObject *errorHandler = NULL;
5172 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005173 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005174 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005175
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 /* Default to Latin-1 */
5177 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179
5180 v = _PyUnicode_New(size);
5181 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005186 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005187 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 mapstring = PyUnicode_AS_UNICODE(mapping);
5189 maplen = PyUnicode_GET_SIZE(mapping);
5190 while (s < e) {
5191 unsigned char ch = *s;
5192 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 if (ch < maplen)
5195 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 if (x == 0xfffe) {
5198 /* undefined mapping */
5199 outpos = p-PyUnicode_AS_UNICODE(v);
5200 startinpos = s-starts;
5201 endinpos = startinpos+1;
5202 if (unicode_decode_call_errorhandler(
5203 errors, &errorHandler,
5204 "charmap", "character maps to <undefined>",
5205 &starts, &e, &startinpos, &endinpos, &exc, &s,
5206 &v, &outpos, &p)) {
5207 goto onError;
5208 }
5209 continue;
5210 }
5211 *p++ = x;
5212 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005213 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005214 }
5215 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 while (s < e) {
5217 unsigned char ch = *s;
5218 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005219
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5221 w = PyLong_FromLong((long)ch);
5222 if (w == NULL)
5223 goto onError;
5224 x = PyObject_GetItem(mapping, w);
5225 Py_DECREF(w);
5226 if (x == NULL) {
5227 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5228 /* No mapping found means: mapping is undefined. */
5229 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005230 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 } else
5232 goto onError;
5233 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005234
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005236 if (x == Py_None)
5237 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 if (PyLong_Check(x)) {
5239 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005240 if (value == 0xFFFE)
5241 goto Undefined;
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005242 if (value < 0 || value > 0x10FFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 PyErr_SetString(PyExc_TypeError,
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005244 "character mapping must be in range(0x110000)");
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 Py_DECREF(x);
5246 goto onError;
5247 }
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005248
5249#ifndef Py_UNICODE_WIDE
5250 if (value > 0xFFFF) {
5251 /* see the code for 1-n mapping below */
5252 if (extrachars < 2) {
5253 /* resize first */
5254 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5255 Py_ssize_t needed = 10 - extrachars;
5256 extrachars += needed;
5257 /* XXX overflow detection missing */
5258 if (_PyUnicode_Resize(&v,
5259 PyUnicode_GET_SIZE(v) + needed) < 0) {
5260 Py_DECREF(x);
5261 goto onError;
5262 }
5263 p = PyUnicode_AS_UNICODE(v) + oldpos;
5264 }
5265 value -= 0x10000;
5266 *p++ = 0xD800 | (value >> 10);
5267 *p++ = 0xDC00 | (value & 0x3FF);
5268 extrachars -= 2;
5269 }
5270 else
5271#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 *p++ = (Py_UNICODE)value;
5273 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 else if (PyUnicode_Check(x)) {
5275 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005276
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005277 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 /* 1-1 mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005279 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
5280 if (value == 0xFFFE)
5281 goto Undefined;
5282 *p++ = value;
5283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 else if (targetsize > 1) {
5285 /* 1-n mapping */
5286 if (targetsize > extrachars) {
5287 /* resize first */
5288 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5289 Py_ssize_t needed = (targetsize - extrachars) + \
5290 (targetsize << 2);
5291 extrachars += needed;
5292 /* XXX overflow detection missing */
5293 if (_PyUnicode_Resize(&v,
5294 PyUnicode_GET_SIZE(v) + needed) < 0) {
5295 Py_DECREF(x);
5296 goto onError;
5297 }
5298 p = PyUnicode_AS_UNICODE(v) + oldpos;
5299 }
5300 Py_UNICODE_COPY(p,
5301 PyUnicode_AS_UNICODE(x),
5302 targetsize);
5303 p += targetsize;
5304 extrachars -= targetsize;
5305 }
5306 /* 1-0 mapping: skip the character */
5307 }
5308 else {
5309 /* wrong return value */
5310 PyErr_SetString(PyExc_TypeError,
5311 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005312 Py_DECREF(x);
5313 goto onError;
5314 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 Py_DECREF(x);
5316 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005317 continue;
5318Undefined:
5319 /* undefined mapping */
5320 Py_XDECREF(x);
5321 outpos = p-PyUnicode_AS_UNICODE(v);
5322 startinpos = s-starts;
5323 endinpos = startinpos+1;
5324 if (unicode_decode_call_errorhandler(
5325 errors, &errorHandler,
5326 "charmap", "character maps to <undefined>",
5327 &starts, &e, &startinpos, &endinpos, &exc, &s,
5328 &v, &outpos, &p)) {
5329 goto onError;
5330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 }
5333 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5335 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336 Py_XDECREF(errorHandler);
5337 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005339
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005341 Py_XDECREF(errorHandler);
5342 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 Py_XDECREF(v);
5344 return NULL;
5345}
5346
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005347/* Charmap encoding: the lookup table */
5348
5349struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 PyObject_HEAD
5351 unsigned char level1[32];
5352 int count2, count3;
5353 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005354};
5355
5356static PyObject*
5357encoding_map_size(PyObject *obj, PyObject* args)
5358{
5359 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005360 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005362}
5363
5364static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005365 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 PyDoc_STR("Return the size (in bytes) of this object") },
5367 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005368};
5369
5370static void
5371encoding_map_dealloc(PyObject* o)
5372{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005373 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005374}
5375
5376static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005377 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 "EncodingMap", /*tp_name*/
5379 sizeof(struct encoding_map), /*tp_basicsize*/
5380 0, /*tp_itemsize*/
5381 /* methods */
5382 encoding_map_dealloc, /*tp_dealloc*/
5383 0, /*tp_print*/
5384 0, /*tp_getattr*/
5385 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005386 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 0, /*tp_repr*/
5388 0, /*tp_as_number*/
5389 0, /*tp_as_sequence*/
5390 0, /*tp_as_mapping*/
5391 0, /*tp_hash*/
5392 0, /*tp_call*/
5393 0, /*tp_str*/
5394 0, /*tp_getattro*/
5395 0, /*tp_setattro*/
5396 0, /*tp_as_buffer*/
5397 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5398 0, /*tp_doc*/
5399 0, /*tp_traverse*/
5400 0, /*tp_clear*/
5401 0, /*tp_richcompare*/
5402 0, /*tp_weaklistoffset*/
5403 0, /*tp_iter*/
5404 0, /*tp_iternext*/
5405 encoding_map_methods, /*tp_methods*/
5406 0, /*tp_members*/
5407 0, /*tp_getset*/
5408 0, /*tp_base*/
5409 0, /*tp_dict*/
5410 0, /*tp_descr_get*/
5411 0, /*tp_descr_set*/
5412 0, /*tp_dictoffset*/
5413 0, /*tp_init*/
5414 0, /*tp_alloc*/
5415 0, /*tp_new*/
5416 0, /*tp_free*/
5417 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005418};
5419
5420PyObject*
5421PyUnicode_BuildEncodingMap(PyObject* string)
5422{
5423 Py_UNICODE *decode;
5424 PyObject *result;
5425 struct encoding_map *mresult;
5426 int i;
5427 int need_dict = 0;
5428 unsigned char level1[32];
5429 unsigned char level2[512];
5430 unsigned char *mlevel1, *mlevel2, *mlevel3;
5431 int count2 = 0, count3 = 0;
5432
5433 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5434 PyErr_BadArgument();
5435 return NULL;
5436 }
5437 decode = PyUnicode_AS_UNICODE(string);
5438 memset(level1, 0xFF, sizeof level1);
5439 memset(level2, 0xFF, sizeof level2);
5440
5441 /* If there isn't a one-to-one mapping of NULL to \0,
5442 or if there are non-BMP characters, we need to use
5443 a mapping dictionary. */
5444 if (decode[0] != 0)
5445 need_dict = 1;
5446 for (i = 1; i < 256; i++) {
5447 int l1, l2;
5448 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005449#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005450 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005451#endif
5452 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005453 need_dict = 1;
5454 break;
5455 }
5456 if (decode[i] == 0xFFFE)
5457 /* unmapped character */
5458 continue;
5459 l1 = decode[i] >> 11;
5460 l2 = decode[i] >> 7;
5461 if (level1[l1] == 0xFF)
5462 level1[l1] = count2++;
5463 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005464 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005465 }
5466
5467 if (count2 >= 0xFF || count3 >= 0xFF)
5468 need_dict = 1;
5469
5470 if (need_dict) {
5471 PyObject *result = PyDict_New();
5472 PyObject *key, *value;
5473 if (!result)
5474 return NULL;
5475 for (i = 0; i < 256; i++) {
5476 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005477 key = PyLong_FromLong(decode[i]);
5478 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005479 if (!key || !value)
5480 goto failed1;
5481 if (PyDict_SetItem(result, key, value) == -1)
5482 goto failed1;
5483 Py_DECREF(key);
5484 Py_DECREF(value);
5485 }
5486 return result;
5487 failed1:
5488 Py_XDECREF(key);
5489 Py_XDECREF(value);
5490 Py_DECREF(result);
5491 return NULL;
5492 }
5493
5494 /* Create a three-level trie */
5495 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5496 16*count2 + 128*count3 - 1);
5497 if (!result)
5498 return PyErr_NoMemory();
5499 PyObject_Init(result, &EncodingMapType);
5500 mresult = (struct encoding_map*)result;
5501 mresult->count2 = count2;
5502 mresult->count3 = count3;
5503 mlevel1 = mresult->level1;
5504 mlevel2 = mresult->level23;
5505 mlevel3 = mresult->level23 + 16*count2;
5506 memcpy(mlevel1, level1, 32);
5507 memset(mlevel2, 0xFF, 16*count2);
5508 memset(mlevel3, 0, 128*count3);
5509 count3 = 0;
5510 for (i = 1; i < 256; i++) {
5511 int o1, o2, o3, i2, i3;
5512 if (decode[i] == 0xFFFE)
5513 /* unmapped character */
5514 continue;
5515 o1 = decode[i]>>11;
5516 o2 = (decode[i]>>7) & 0xF;
5517 i2 = 16*mlevel1[o1] + o2;
5518 if (mlevel2[i2] == 0xFF)
5519 mlevel2[i2] = count3++;
5520 o3 = decode[i] & 0x7F;
5521 i3 = 128*mlevel2[i2] + o3;
5522 mlevel3[i3] = i;
5523 }
5524 return result;
5525}
5526
5527static int
5528encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5529{
5530 struct encoding_map *map = (struct encoding_map*)mapping;
5531 int l1 = c>>11;
5532 int l2 = (c>>7) & 0xF;
5533 int l3 = c & 0x7F;
5534 int i;
5535
5536#ifdef Py_UNICODE_WIDE
5537 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005539 }
5540#endif
5541 if (c == 0)
5542 return 0;
5543 /* level 1*/
5544 i = map->level1[l1];
5545 if (i == 0xFF) {
5546 return -1;
5547 }
5548 /* level 2*/
5549 i = map->level23[16*i+l2];
5550 if (i == 0xFF) {
5551 return -1;
5552 }
5553 /* level 3 */
5554 i = map->level23[16*map->count2 + 128*i + l3];
5555 if (i == 0) {
5556 return -1;
5557 }
5558 return i;
5559}
5560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561/* Lookup the character ch in the mapping. If the character
5562 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005563 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565{
Christian Heimes217cfd12007-12-02 14:31:20 +00005566 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 PyObject *x;
5568
5569 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005571 x = PyObject_GetItem(mapping, w);
5572 Py_DECREF(w);
5573 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5575 /* No mapping found means: mapping is undefined. */
5576 PyErr_Clear();
5577 x = Py_None;
5578 Py_INCREF(x);
5579 return x;
5580 } else
5581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005583 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005584 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005585 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 long value = PyLong_AS_LONG(x);
5587 if (value < 0 || value > 255) {
5588 PyErr_SetString(PyExc_TypeError,
5589 "character mapping must be in range(256)");
5590 Py_DECREF(x);
5591 return NULL;
5592 }
5593 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005595 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 /* wrong return value */
5599 PyErr_Format(PyExc_TypeError,
5600 "character mapping must return integer, bytes or None, not %.400s",
5601 x->ob_type->tp_name);
5602 Py_DECREF(x);
5603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 }
5605}
5606
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005607static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005608charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005609{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005610 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5611 /* exponentially overallocate to minimize reallocations */
5612 if (requiredsize < 2*outsize)
5613 requiredsize = 2*outsize;
5614 if (_PyBytes_Resize(outobj, requiredsize))
5615 return -1;
5616 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005617}
5618
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005623 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 space is available. Return a new reference to the object that
5625 was put in the output buffer, or Py_None, if the mapping was undefined
5626 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005627 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005629charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005632 PyObject *rep;
5633 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005634 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005635
Christian Heimes90aa7642007-12-19 02:45:37 +00005636 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005637 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005639 if (res == -1)
5640 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 if (outsize<requiredsize)
5642 if (charmapencode_resize(outobj, outpos, requiredsize))
5643 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005644 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 outstart[(*outpos)++] = (char)res;
5646 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005647 }
5648
5649 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005652 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 Py_DECREF(rep);
5654 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005655 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 if (PyLong_Check(rep)) {
5657 Py_ssize_t requiredsize = *outpos+1;
5658 if (outsize<requiredsize)
5659 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5660 Py_DECREF(rep);
5661 return enc_EXCEPTION;
5662 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005663 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005665 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 else {
5667 const char *repchars = PyBytes_AS_STRING(rep);
5668 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5669 Py_ssize_t requiredsize = *outpos+repsize;
5670 if (outsize<requiredsize)
5671 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5672 Py_DECREF(rep);
5673 return enc_EXCEPTION;
5674 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005675 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 memcpy(outstart + *outpos, repchars, repsize);
5677 *outpos += repsize;
5678 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005680 Py_DECREF(rep);
5681 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005682}
5683
5684/* handle an error in PyUnicode_EncodeCharmap
5685 Return 0 on success, -1 on error */
5686static
5687int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005688 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005690 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005691 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692{
5693 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005694 Py_ssize_t repsize;
5695 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 Py_UNICODE *uni2;
5697 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005698 Py_ssize_t collstartpos = *inpos;
5699 Py_ssize_t collendpos = *inpos+1;
5700 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 char *encoding = "charmap";
5702 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005703 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 /* find all unencodable characters */
5706 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005707 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005708 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 int res = encoding_map_lookup(p[collendpos], mapping);
5710 if (res != -1)
5711 break;
5712 ++collendpos;
5713 continue;
5714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005715
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 rep = charmapencode_lookup(p[collendpos], mapping);
5717 if (rep==NULL)
5718 return -1;
5719 else if (rep!=Py_None) {
5720 Py_DECREF(rep);
5721 break;
5722 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005723 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 }
5726 /* cache callback name lookup
5727 * (if not done yet, i.e. it's the first error) */
5728 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 if ((errors==NULL) || (!strcmp(errors, "strict")))
5730 *known_errorHandler = 1;
5731 else if (!strcmp(errors, "replace"))
5732 *known_errorHandler = 2;
5733 else if (!strcmp(errors, "ignore"))
5734 *known_errorHandler = 3;
5735 else if (!strcmp(errors, "xmlcharrefreplace"))
5736 *known_errorHandler = 4;
5737 else
5738 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 }
5740 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005741 case 1: /* strict */
5742 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5743 return -1;
5744 case 2: /* replace */
5745 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 x = charmapencode_output('?', mapping, res, respos);
5747 if (x==enc_EXCEPTION) {
5748 return -1;
5749 }
5750 else if (x==enc_FAILED) {
5751 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5752 return -1;
5753 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005754 }
5755 /* fall through */
5756 case 3: /* ignore */
5757 *inpos = collendpos;
5758 break;
5759 case 4: /* xmlcharrefreplace */
5760 /* generate replacement (temporarily (mis)uses p) */
5761 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 char buffer[2+29+1+1];
5763 char *cp;
5764 sprintf(buffer, "&#%d;", (int)p[collpos]);
5765 for (cp = buffer; *cp; ++cp) {
5766 x = charmapencode_output(*cp, mapping, res, respos);
5767 if (x==enc_EXCEPTION)
5768 return -1;
5769 else if (x==enc_FAILED) {
5770 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5771 return -1;
5772 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005773 }
5774 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005775 *inpos = collendpos;
5776 break;
5777 default:
5778 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 encoding, reason, p, size, exceptionObject,
5780 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005781 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005783 if (PyBytes_Check(repunicode)) {
5784 /* Directly copy bytes result to output. */
5785 Py_ssize_t outsize = PyBytes_Size(*res);
5786 Py_ssize_t requiredsize;
5787 repsize = PyBytes_Size(repunicode);
5788 requiredsize = *respos + repsize;
5789 if (requiredsize > outsize)
5790 /* Make room for all additional bytes. */
5791 if (charmapencode_resize(res, respos, requiredsize)) {
5792 Py_DECREF(repunicode);
5793 return -1;
5794 }
5795 memcpy(PyBytes_AsString(*res) + *respos,
5796 PyBytes_AsString(repunicode), repsize);
5797 *respos += repsize;
5798 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005799 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005800 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005802 /* generate replacement */
5803 repsize = PyUnicode_GET_SIZE(repunicode);
5804 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 x = charmapencode_output(*uni2, mapping, res, respos);
5806 if (x==enc_EXCEPTION) {
5807 return -1;
5808 }
5809 else if (x==enc_FAILED) {
5810 Py_DECREF(repunicode);
5811 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5812 return -1;
5813 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005814 }
5815 *inpos = newpos;
5816 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817 }
5818 return 0;
5819}
5820
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 Py_ssize_t size,
5823 PyObject *mapping,
5824 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005826 /* output object */
5827 PyObject *res = NULL;
5828 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005829 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005832 PyObject *errorHandler = NULL;
5833 PyObject *exc = NULL;
5834 /* the following variable is used for caching string comparisons
5835 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5836 * 3=ignore, 4=xmlcharrefreplace */
5837 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838
5839 /* Default to Latin-1 */
5840 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843 /* allocate enough for a simple encoding without
5844 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005845 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 if (res == NULL)
5847 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005848 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 /* try to encode it */
5853 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5854 if (x==enc_EXCEPTION) /* error */
5855 goto onError;
5856 if (x==enc_FAILED) { /* unencodable character */
5857 if (charmap_encoding_error(p, size, &inpos, mapping,
5858 &exc,
5859 &known_errorHandler, &errorHandler, errors,
5860 &res, &respos)) {
5861 goto onError;
5862 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005863 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 else
5865 /* done with this character => adjust input position */
5866 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005870 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005871 if (_PyBytes_Resize(&res, respos) < 0)
5872 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005873
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 Py_XDECREF(exc);
5875 Py_XDECREF(errorHandler);
5876 return res;
5877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 Py_XDECREF(res);
5880 Py_XDECREF(exc);
5881 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 return NULL;
5883}
5884
5885PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887{
5888 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 PyErr_BadArgument();
5890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 }
5892 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 PyUnicode_GET_SIZE(unicode),
5894 mapping,
5895 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896}
5897
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898/* create or adjust a UnicodeTranslateError */
5899static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 const Py_UNICODE *unicode, Py_ssize_t size,
5901 Py_ssize_t startpos, Py_ssize_t endpos,
5902 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005905 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 }
5908 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5910 goto onError;
5911 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5912 goto onError;
5913 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5914 goto onError;
5915 return;
5916 onError:
5917 Py_DECREF(*exceptionObject);
5918 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 }
5920}
5921
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005922/* raises a UnicodeTranslateError */
5923static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 const Py_UNICODE *unicode, Py_ssize_t size,
5925 Py_ssize_t startpos, Py_ssize_t endpos,
5926 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927{
5928 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005932}
5933
5934/* error handling callback helper:
5935 build arguments, call the callback and check the arguments,
5936 put the result into newpos and return the replacement string, which
5937 has to be freed by the caller */
5938static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 PyObject **errorHandler,
5940 const char *reason,
5941 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5942 Py_ssize_t startpos, Py_ssize_t endpos,
5943 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005944{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005945 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005947 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005948 PyObject *restuple;
5949 PyObject *resunicode;
5950
5951 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005953 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005955 }
5956
5957 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961
5962 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005964 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005967 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 Py_DECREF(restuple);
5969 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 }
5971 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 &resunicode, &i_newpos)) {
5973 Py_DECREF(restuple);
5974 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005975 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005976 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005978 else
5979 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005980 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5982 Py_DECREF(restuple);
5983 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005984 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985 Py_INCREF(resunicode);
5986 Py_DECREF(restuple);
5987 return resunicode;
5988}
5989
5990/* Lookup the character ch in the mapping and put the result in result,
5991 which must be decrefed by the caller.
5992 Return 0 on success, -1 on error */
5993static
5994int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5995{
Christian Heimes217cfd12007-12-02 14:31:20 +00005996 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997 PyObject *x;
5998
5999 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006001 x = PyObject_GetItem(mapping, w);
6002 Py_DECREF(w);
6003 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6005 /* No mapping found means: use 1:1 mapping. */
6006 PyErr_Clear();
6007 *result = NULL;
6008 return 0;
6009 } else
6010 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 }
6012 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 *result = x;
6014 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006016 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 long value = PyLong_AS_LONG(x);
6018 long max = PyUnicode_GetMax();
6019 if (value < 0 || value > max) {
6020 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006021 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 Py_DECREF(x);
6023 return -1;
6024 }
6025 *result = x;
6026 return 0;
6027 }
6028 else if (PyUnicode_Check(x)) {
6029 *result = x;
6030 return 0;
6031 }
6032 else {
6033 /* wrong return value */
6034 PyErr_SetString(PyExc_TypeError,
6035 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006036 Py_DECREF(x);
6037 return -1;
6038 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039}
6040/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 if not reallocate and adjust various state variables.
6042 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043static
Walter Dörwald4894c302003-10-24 14:25:28 +00006044int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006047 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006048 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 /* remember old output position */
6050 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6051 /* exponentially overallocate to minimize reallocations */
6052 if (requiredsize < 2 * oldsize)
6053 requiredsize = 2 * oldsize;
6054 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6055 return -1;
6056 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006057 }
6058 return 0;
6059}
6060/* lookup the character, put the result in the output string and adjust
6061 various state variables. Return a new reference to the object that
6062 was put in the output buffer in *result, or Py_None, if the mapping was
6063 undefined (in which case no character was written).
6064 The called must decref result.
6065 Return 0 on success, -1 on error. */
6066static
Walter Dörwald4894c302003-10-24 14:25:28 +00006067int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6069 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070{
Walter Dörwald4894c302003-10-24 14:25:28 +00006071 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006073 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 /* not found => default to 1:1 mapping */
6075 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 }
6077 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006079 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 /* no overflow check, because we know that the space is enough */
6081 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 }
6083 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6085 if (repsize==1) {
6086 /* no overflow check, because we know that the space is enough */
6087 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6088 }
6089 else if (repsize!=0) {
6090 /* more than one character */
6091 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6092 (insize - (curinp-startinp)) +
6093 repsize - 1;
6094 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6095 return -1;
6096 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6097 *outp += repsize;
6098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099 }
6100 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 return 0;
6103}
6104
6105PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 Py_ssize_t size,
6107 PyObject *mapping,
6108 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006110 /* output object */
6111 PyObject *res = NULL;
6112 /* pointers to the beginning and end+1 of input */
6113 const Py_UNICODE *startp = p;
6114 const Py_UNICODE *endp = p + size;
6115 /* pointer into the output */
6116 Py_UNICODE *str;
6117 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006118 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 char *reason = "character maps to <undefined>";
6120 PyObject *errorHandler = NULL;
6121 PyObject *exc = NULL;
6122 /* the following variable is used for caching string comparisons
6123 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6124 * 3=ignore, 4=xmlcharrefreplace */
6125 int known_errorHandler = -1;
6126
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 PyErr_BadArgument();
6129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131
6132 /* allocate enough for a simple 1:1 translation without
6133 replacements, if we need more, we'll resize */
6134 res = PyUnicode_FromUnicode(NULL, size);
6135 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 /* try to encode it */
6143 PyObject *x = NULL;
6144 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6145 Py_XDECREF(x);
6146 goto onError;
6147 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006148 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 if (x!=Py_None) /* it worked => adjust input pointer */
6150 ++p;
6151 else { /* untranslatable character */
6152 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6153 Py_ssize_t repsize;
6154 Py_ssize_t newpos;
6155 Py_UNICODE *uni2;
6156 /* startpos for collecting untranslatable chars */
6157 const Py_UNICODE *collstart = p;
6158 const Py_UNICODE *collend = p+1;
6159 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 /* find all untranslatable characters */
6162 while (collend < endp) {
6163 if (charmaptranslate_lookup(*collend, mapping, &x))
6164 goto onError;
6165 Py_XDECREF(x);
6166 if (x!=Py_None)
6167 break;
6168 ++collend;
6169 }
6170 /* cache callback name lookup
6171 * (if not done yet, i.e. it's the first error) */
6172 if (known_errorHandler==-1) {
6173 if ((errors==NULL) || (!strcmp(errors, "strict")))
6174 known_errorHandler = 1;
6175 else if (!strcmp(errors, "replace"))
6176 known_errorHandler = 2;
6177 else if (!strcmp(errors, "ignore"))
6178 known_errorHandler = 3;
6179 else if (!strcmp(errors, "xmlcharrefreplace"))
6180 known_errorHandler = 4;
6181 else
6182 known_errorHandler = 0;
6183 }
6184 switch (known_errorHandler) {
6185 case 1: /* strict */
6186 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006187 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 case 2: /* replace */
6189 /* No need to check for space, this is a 1:1 replacement */
6190 for (coll = collstart; coll<collend; ++coll)
6191 *str++ = '?';
6192 /* fall through */
6193 case 3: /* ignore */
6194 p = collend;
6195 break;
6196 case 4: /* xmlcharrefreplace */
6197 /* generate replacement (temporarily (mis)uses p) */
6198 for (p = collstart; p < collend; ++p) {
6199 char buffer[2+29+1+1];
6200 char *cp;
6201 sprintf(buffer, "&#%d;", (int)*p);
6202 if (charmaptranslate_makespace(&res, &str,
6203 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6204 goto onError;
6205 for (cp = buffer; *cp; ++cp)
6206 *str++ = *cp;
6207 }
6208 p = collend;
6209 break;
6210 default:
6211 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6212 reason, startp, size, &exc,
6213 collstart-startp, collend-startp, &newpos);
6214 if (repunicode == NULL)
6215 goto onError;
6216 /* generate replacement */
6217 repsize = PyUnicode_GET_SIZE(repunicode);
6218 if (charmaptranslate_makespace(&res, &str,
6219 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6220 Py_DECREF(repunicode);
6221 goto onError;
6222 }
6223 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6224 *str++ = *uni2;
6225 p = startp + newpos;
6226 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006227 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006228 }
6229 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006230 /* Resize if we allocated to much */
6231 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006232 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 if (PyUnicode_Resize(&res, respos) < 0)
6234 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006235 }
6236 Py_XDECREF(exc);
6237 Py_XDECREF(errorHandler);
6238 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006241 Py_XDECREF(res);
6242 Py_XDECREF(exc);
6243 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 return NULL;
6245}
6246
6247PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 PyObject *mapping,
6249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250{
6251 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006252
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 str = PyUnicode_FromObject(str);
6254 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 PyUnicode_GET_SIZE(str),
6258 mapping,
6259 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 Py_DECREF(str);
6261 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006262
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 Py_XDECREF(str);
6265 return NULL;
6266}
Tim Petersced69f82003-09-16 20:30:58 +00006267
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006268PyObject *
6269PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6270 Py_ssize_t length)
6271{
6272 PyObject *result;
6273 Py_UNICODE *p; /* write pointer into result */
6274 Py_ssize_t i;
6275 /* Copy to a new string */
6276 result = (PyObject *)_PyUnicode_New(length);
6277 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6278 if (result == NULL)
6279 return result;
6280 p = PyUnicode_AS_UNICODE(result);
6281 /* Iterate over code points */
6282 for (i = 0; i < length; i++) {
6283 Py_UNICODE ch =s[i];
6284 if (ch > 127) {
6285 int decimal = Py_UNICODE_TODECIMAL(ch);
6286 if (decimal >= 0)
6287 p[i] = '0' + decimal;
6288 }
6289 }
6290 return result;
6291}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006292/* --- Decimal Encoder ---------------------------------------------------- */
6293
6294int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 Py_ssize_t length,
6296 char *output,
6297 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006298{
6299 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 PyObject *errorHandler = NULL;
6301 PyObject *exc = NULL;
6302 const char *encoding = "decimal";
6303 const char *reason = "invalid decimal Unicode string";
6304 /* the following variable is used for caching string comparisons
6305 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6306 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006307
6308 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 PyErr_BadArgument();
6310 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006311 }
6312
6313 p = s;
6314 end = s + length;
6315 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 register Py_UNICODE ch = *p;
6317 int decimal;
6318 PyObject *repunicode;
6319 Py_ssize_t repsize;
6320 Py_ssize_t newpos;
6321 Py_UNICODE *uni2;
6322 Py_UNICODE *collstart;
6323 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006324
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006326 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 ++p;
6328 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006329 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 decimal = Py_UNICODE_TODECIMAL(ch);
6331 if (decimal >= 0) {
6332 *output++ = '0' + decimal;
6333 ++p;
6334 continue;
6335 }
6336 if (0 < ch && ch < 256) {
6337 *output++ = (char)ch;
6338 ++p;
6339 continue;
6340 }
6341 /* All other characters are considered unencodable */
6342 collstart = p;
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006343 for (collend = p+1; collend < end; collend++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 if ((0 < *collend && *collend < 256) ||
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006345 Py_UNICODE_ISSPACE(*collend) ||
6346 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 break;
6348 }
6349 /* cache callback name lookup
6350 * (if not done yet, i.e. it's the first error) */
6351 if (known_errorHandler==-1) {
6352 if ((errors==NULL) || (!strcmp(errors, "strict")))
6353 known_errorHandler = 1;
6354 else if (!strcmp(errors, "replace"))
6355 known_errorHandler = 2;
6356 else if (!strcmp(errors, "ignore"))
6357 known_errorHandler = 3;
6358 else if (!strcmp(errors, "xmlcharrefreplace"))
6359 known_errorHandler = 4;
6360 else
6361 known_errorHandler = 0;
6362 }
6363 switch (known_errorHandler) {
6364 case 1: /* strict */
6365 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6366 goto onError;
6367 case 2: /* replace */
6368 for (p = collstart; p < collend; ++p)
6369 *output++ = '?';
6370 /* fall through */
6371 case 3: /* ignore */
6372 p = collend;
6373 break;
6374 case 4: /* xmlcharrefreplace */
6375 /* generate replacement (temporarily (mis)uses p) */
6376 for (p = collstart; p < collend; ++p)
6377 output += sprintf(output, "&#%d;", (int)*p);
6378 p = collend;
6379 break;
6380 default:
6381 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6382 encoding, reason, s, length, &exc,
6383 collstart-s, collend-s, &newpos);
6384 if (repunicode == NULL)
6385 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006386 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006387 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006388 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6389 Py_DECREF(repunicode);
6390 goto onError;
6391 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 /* generate replacement */
6393 repsize = PyUnicode_GET_SIZE(repunicode);
6394 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6395 Py_UNICODE ch = *uni2;
6396 if (Py_UNICODE_ISSPACE(ch))
6397 *output++ = ' ';
6398 else {
6399 decimal = Py_UNICODE_TODECIMAL(ch);
6400 if (decimal >= 0)
6401 *output++ = '0' + decimal;
6402 else if (0 < ch && ch < 256)
6403 *output++ = (char)ch;
6404 else {
6405 Py_DECREF(repunicode);
6406 raise_encode_exception(&exc, encoding,
6407 s, length, collstart-s, collend-s, reason);
6408 goto onError;
6409 }
6410 }
6411 }
6412 p = s + newpos;
6413 Py_DECREF(repunicode);
6414 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006415 }
6416 /* 0-terminate the output string */
6417 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 Py_XDECREF(exc);
6419 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006420 return 0;
6421
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 Py_XDECREF(exc);
6424 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006425 return -1;
6426}
6427
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428/* --- Helpers ------------------------------------------------------------ */
6429
Eric Smith8c663262007-08-25 02:26:07 +00006430#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006431#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006432
Thomas Wouters477c8d52006-05-27 19:21:47 +00006433#include "stringlib/count.h"
6434#include "stringlib/find.h"
6435#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006436#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006437
Eric Smith5807c412008-05-11 21:00:57 +00006438#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006439#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006440#include "stringlib/localeutil.h"
6441
/* helper macro to fixup start/end slice values
 *
 * `end` is clamped to `len` when it exceeds it; a negative `end` has `len`
 * added and is then clamped to 0.  A negative `start` likewise has `len`
 * added and is clamped to 0; a `start` larger than `len` is left as is.
 *
 * The macro expands to bare if-statements (no do { } while (0) wrapper),
 * assigns to `start` and `end`, and evaluates each argument several times
 * without parenthesizing it.  Use it only as a standalone statement with
 * plain lvalues for `start`/`end` and a side-effect-free `len`.
 */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006456
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.  The type of the returned char is always Py_UCS4 on narrow builds;
 * on wide builds the character is returned as stored.
 *
 * On narrow builds a pair is only combined when both of its halves lie
 * before 'end'; the element at 'end' itself is never read.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 * (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                       \
    ((((ptr) + 1 < (end)) &&                                             \
      _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                           \
      _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                          \
     ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :   \
     (Py_UCS4)*(ptr)++)
#endif
6487
Martin v. Löwis18e16552006-02-15 17:27:45 +00006488Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006489 PyObject *substr,
6490 Py_ssize_t start,
6491 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006493 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006494 PyUnicodeObject* str_obj;
6495 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006496
Thomas Wouters477c8d52006-05-27 19:21:47 +00006497 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6498 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006500 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6501 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 Py_DECREF(str_obj);
6503 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 }
Tim Petersced69f82003-09-16 20:30:58 +00006505
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006506 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006507 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006508 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6509 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006510 );
6511
6512 Py_DECREF(sub_obj);
6513 Py_DECREF(str_obj);
6514
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 return result;
6516}
6517
Martin v. Löwis18e16552006-02-15 17:27:45 +00006518Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006519 PyObject *sub,
6520 Py_ssize_t start,
6521 Py_ssize_t end,
6522 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006524 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006525
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006527 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006529 sub = PyUnicode_FromObject(sub);
6530 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 Py_DECREF(str);
6532 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 }
Tim Petersced69f82003-09-16 20:30:58 +00006534
Thomas Wouters477c8d52006-05-27 19:21:47 +00006535 if (direction > 0)
6536 result = stringlib_find_slice(
6537 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6538 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6539 start, end
6540 );
6541 else
6542 result = stringlib_rfind_slice(
6543 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6544 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6545 start, end
6546 );
6547
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006549 Py_DECREF(sub);
6550
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 return result;
6552}
6553
Tim Petersced69f82003-09-16 20:30:58 +00006554static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 PyUnicodeObject *substring,
6557 Py_ssize_t start,
6558 Py_ssize_t end,
6559 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 if (substring->length == 0)
6562 return 1;
6563
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006564 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 end -= substring->length;
6566 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568
6569 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 if (Py_UNICODE_MATCH(self, end, substring))
6571 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 } else {
6573 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 }
6576
6577 return 0;
6578}
6579
Martin v. Löwis18e16552006-02-15 17:27:45 +00006580Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 PyObject *substr,
6582 Py_ssize_t start,
6583 Py_ssize_t end,
6584 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006586 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006587
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 str = PyUnicode_FromObject(str);
6589 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 substr = PyUnicode_FromObject(substr);
6592 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 Py_DECREF(str);
6594 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 }
Tim Petersced69f82003-09-16 20:30:58 +00006596
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 (PyUnicodeObject *)substr,
6599 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 Py_DECREF(str);
6601 Py_DECREF(substr);
6602 return result;
6603}
6604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605/* Apply fixfct filter to the Unicode object self and return a
6606 reference to the modified object */
6607
Tim Petersced69f82003-09-16 20:30:58 +00006608static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611{
6612
6613 PyUnicodeObject *u;
6614
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006615 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006618
6619 Py_UNICODE_COPY(u->str, self->str, self->length);
6620
Tim Peters7a29bd52001-09-12 03:03:31 +00006621 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 /* fixfct should return TRUE if it modified the buffer. If
6623 FALSE, return a reference to the original buffer instead
6624 (to save space, not time) */
6625 Py_INCREF(self);
6626 Py_DECREF(u);
6627 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 }
6629 return (PyObject*) u;
6630}
6631
Tim Petersced69f82003-09-16 20:30:58 +00006632static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633int fixupper(PyUnicodeObject *self)
6634{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006635 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 Py_UNICODE *s = self->str;
6637 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006638
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006641
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 ch = Py_UNICODE_TOUPPER(*s);
6643 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 *s = ch;
6646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 s++;
6648 }
6649
6650 return status;
6651}
6652
Tim Petersced69f82003-09-16 20:30:58 +00006653static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654int fixlower(PyUnicodeObject *self)
6655{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006656 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 Py_UNICODE *s = self->str;
6658 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006659
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006662
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 ch = Py_UNICODE_TOLOWER(*s);
6664 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 *s = ch;
6667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 s++;
6669 }
6670
6671 return status;
6672}
6673
Tim Petersced69f82003-09-16 20:30:58 +00006674static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675int fixswapcase(PyUnicodeObject *self)
6676{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006677 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 Py_UNICODE *s = self->str;
6679 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006680
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 while (len-- > 0) {
6682 if (Py_UNICODE_ISUPPER(*s)) {
6683 *s = Py_UNICODE_TOLOWER(*s);
6684 status = 1;
6685 } else if (Py_UNICODE_ISLOWER(*s)) {
6686 *s = Py_UNICODE_TOUPPER(*s);
6687 status = 1;
6688 }
6689 s++;
6690 }
6691
6692 return status;
6693}
6694
Tim Petersced69f82003-09-16 20:30:58 +00006695static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696int fixcapitalize(PyUnicodeObject *self)
6697{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006698 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006699 Py_UNICODE *s = self->str;
6700 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006701
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006702 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 return 0;
Ezio Melottiee8d9982011-08-15 09:09:57 +03006704 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 *s = Py_UNICODE_TOUPPER(*s);
6706 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006708 s++;
6709 while (--len > 0) {
Ezio Melottiee8d9982011-08-15 09:09:57 +03006710 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006711 *s = Py_UNICODE_TOLOWER(*s);
6712 status = 1;
6713 }
6714 s++;
6715 }
6716 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
6719static
6720int fixtitle(PyUnicodeObject *self)
6721{
6722 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6723 register Py_UNICODE *e;
6724 int previous_is_cased;
6725
6726 /* Shortcut for single character strings */
6727 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6729 if (*p != ch) {
6730 *p = ch;
6731 return 1;
6732 }
6733 else
6734 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 }
Tim Petersced69f82003-09-16 20:30:58 +00006736
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 e = p + PyUnicode_GET_SIZE(self);
6738 previous_is_cased = 0;
6739 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006741
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 if (previous_is_cased)
6743 *p = Py_UNICODE_TOLOWER(ch);
6744 else
6745 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006746
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 if (Py_UNICODE_ISLOWER(ch) ||
6748 Py_UNICODE_ISUPPER(ch) ||
6749 Py_UNICODE_ISTITLE(ch))
6750 previous_is_cased = 1;
6751 else
6752 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 }
6754 return 1;
6755}
6756
Tim Peters8ce9f162004-08-27 01:49:32 +00006757PyObject *
6758PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759{
Skip Montanaro6543b452004-09-16 03:28:13 +00006760 const Py_UNICODE blank = ' ';
6761 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006762 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006763 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006764 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6765 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006766 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6767 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006768 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006769 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770
Tim Peters05eba1f2004-08-27 21:32:02 +00006771 fseq = PySequence_Fast(seq, "");
6772 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006773 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006774 }
6775
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006776 /* NOTE: the following code can't call back into Python code,
6777 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006778 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006779
Tim Peters05eba1f2004-08-27 21:32:02 +00006780 seqlen = PySequence_Fast_GET_SIZE(fseq);
6781 /* If empty sequence, return u"". */
6782 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006783 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6784 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006785 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006786 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006787 /* If singleton sequence with an exact Unicode, return that. */
6788 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 item = items[0];
6790 if (PyUnicode_CheckExact(item)) {
6791 Py_INCREF(item);
6792 res = (PyUnicodeObject *)item;
6793 goto Done;
6794 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006795 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006796 else {
6797 /* Set up sep and seplen */
6798 if (separator == NULL) {
6799 sep = &blank;
6800 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006801 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006802 else {
6803 if (!PyUnicode_Check(separator)) {
6804 PyErr_Format(PyExc_TypeError,
6805 "separator: expected str instance,"
6806 " %.80s found",
6807 Py_TYPE(separator)->tp_name);
6808 goto onError;
6809 }
6810 sep = PyUnicode_AS_UNICODE(separator);
6811 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006812 }
6813 }
6814
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006815 /* There are at least two things to join, or else we have a subclass
6816 * of str in the sequence.
6817 * Do a pre-pass to figure out the total amount of space we'll
6818 * need (sz), and see whether all argument are strings.
6819 */
6820 sz = 0;
6821 for (i = 0; i < seqlen; i++) {
6822 const Py_ssize_t old_sz = sz;
6823 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 if (!PyUnicode_Check(item)) {
6825 PyErr_Format(PyExc_TypeError,
6826 "sequence item %zd: expected str instance,"
6827 " %.80s found",
6828 i, Py_TYPE(item)->tp_name);
6829 goto onError;
6830 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006831 sz += PyUnicode_GET_SIZE(item);
6832 if (i != 0)
6833 sz += seplen;
6834 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6835 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006837 goto onError;
6838 }
6839 }
Tim Petersced69f82003-09-16 20:30:58 +00006840
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006841 res = _PyUnicode_New(sz);
6842 if (res == NULL)
6843 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006844
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006845 /* Catenate everything. */
6846 res_p = PyUnicode_AS_UNICODE(res);
6847 for (i = 0; i < seqlen; ++i) {
6848 Py_ssize_t itemlen;
6849 item = items[i];
6850 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 /* Copy item, and maybe the separator. */
6852 if (i) {
6853 Py_UNICODE_COPY(res_p, sep, seplen);
6854 res_p += seplen;
6855 }
6856 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6857 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006858 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006859
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006861 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 return (PyObject *)res;
6863
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006865 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006866 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 return NULL;
6868}
6869
Tim Petersced69f82003-09-16 20:30:58 +00006870static
6871PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 Py_ssize_t left,
6873 Py_ssize_t right,
6874 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875{
6876 PyUnicodeObject *u;
6877
6878 if (left < 0)
6879 left = 0;
6880 if (right < 0)
6881 right = 0;
6882
Tim Peters7a29bd52001-09-12 03:03:31 +00006883 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 Py_INCREF(self);
6885 return self;
6886 }
6887
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006888 if (left > PY_SSIZE_T_MAX - self->length ||
6889 right > PY_SSIZE_T_MAX - (left + self->length)) {
6890 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6891 return NULL;
6892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 u = _PyUnicode_New(left + self->length + right);
6894 if (u) {
6895 if (left)
6896 Py_UNICODE_FILL(u->str, fill, left);
6897 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6898 if (right)
6899 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6900 }
6901
6902 return u;
6903}
6904
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006905PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908
6909 string = PyUnicode_FromObject(string);
6910 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006913 list = stringlib_splitlines(
6914 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6915 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916
6917 Py_DECREF(string);
6918 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919}
6920
Tim Petersced69f82003-09-16 20:30:58 +00006921static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 PyUnicodeObject *substring,
6924 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006927 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006930 return stringlib_split_whitespace(
6931 (PyObject*) self, self->str, self->length, maxcount
6932 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006934 return stringlib_split(
6935 (PyObject*) self, self->str, self->length,
6936 substring->str, substring->length,
6937 maxcount
6938 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939}
6940
Tim Petersced69f82003-09-16 20:30:58 +00006941static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006942PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 PyUnicodeObject *substring,
6944 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006945{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006946 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006947 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006948
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006949 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006950 return stringlib_rsplit_whitespace(
6951 (PyObject*) self, self->str, self->length, maxcount
6952 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006953
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006954 return stringlib_rsplit(
6955 (PyObject*) self, self->str, self->length,
6956 substring->str, substring->length,
6957 maxcount
6958 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006959}
6960
6961static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 PyUnicodeObject *str1,
6964 PyUnicodeObject *str2,
6965 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966{
6967 PyUnicodeObject *u;
6968
6969 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006971 else if (maxcount == 0 || self->length == 0)
6972 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973
Thomas Wouters477c8d52006-05-27 19:21:47 +00006974 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006975 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006976 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006977 if (str1->length == 0)
6978 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006979 if (str1->length == 1) {
6980 /* replace characters */
6981 Py_UNICODE u1, u2;
6982 if (!findchar(self->str, self->length, str1->str[0]))
6983 goto nothing;
6984 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6985 if (!u)
6986 return NULL;
6987 Py_UNICODE_COPY(u->str, self->str, self->length);
6988 u1 = str1->str[0];
6989 u2 = str2->str[0];
6990 for (i = 0; i < u->length; i++)
6991 if (u->str[i] == u1) {
6992 if (--maxcount < 0)
6993 break;
6994 u->str[i] = u2;
6995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006997 i = stringlib_find(
6998 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007000 if (i < 0)
7001 goto nothing;
7002 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7003 if (!u)
7004 return NULL;
7005 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007006
7007 /* change everything in-place, starting with this one */
7008 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7009 i += str1->length;
7010
7011 while ( --maxcount > 0) {
7012 i = stringlib_find(self->str+i, self->length-i,
7013 str1->str, str1->length,
7014 i);
7015 if (i == -1)
7016 break;
7017 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7018 i += str1->length;
7019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007022
Victor Stinnerab1d16b2011-11-22 01:45:37 +01007023 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007024 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 Py_UNICODE *p;
7026
7027 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007028 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7029 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007030 if (n == 0)
7031 goto nothing;
7032 /* new_size = self->length + n * (str2->length - str1->length)); */
7033 delta = (str2->length - str1->length);
7034 if (delta == 0) {
7035 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007037 product = n * (str2->length - str1->length);
7038 if ((product / (str2->length - str1->length)) != n) {
7039 PyErr_SetString(PyExc_OverflowError,
7040 "replace string is too long");
7041 return NULL;
7042 }
7043 new_size = self->length + product;
7044 if (new_size < 0) {
7045 PyErr_SetString(PyExc_OverflowError,
7046 "replace string is too long");
7047 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 }
7049 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007050 u = _PyUnicode_New(new_size);
7051 if (!u)
7052 return NULL;
7053 i = 0;
7054 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055 if (str1->length > 0) {
7056 while (n-- > 0) {
7057 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007058 j = stringlib_find(self->str+i, self->length-i,
7059 str1->str, str1->length,
7060 i);
7061 if (j == -1)
7062 break;
7063 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007064 /* copy unchanged part [i:j] */
7065 Py_UNICODE_COPY(p, self->str+i, j-i);
7066 p += j - i;
7067 }
7068 /* copy substitution string */
7069 if (str2->length > 0) {
7070 Py_UNICODE_COPY(p, str2->str, str2->length);
7071 p += str2->length;
7072 }
7073 i = j + str1->length;
7074 }
7075 if (i < self->length)
7076 /* copy tail [i:] */
7077 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7078 } else {
7079 /* interleave */
7080 while (n > 0) {
7081 Py_UNICODE_COPY(p, str2->str, str2->length);
7082 p += str2->length;
7083 if (--n <= 0)
7084 break;
7085 *p++ = self->str[i++];
7086 }
7087 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007091
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007093 /* nothing to replace; return original string (when possible) */
7094 if (PyUnicode_CheckExact(self)) {
7095 Py_INCREF(self);
7096 return (PyObject *) self;
7097 }
7098 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099}
7100
7101/* --- Unicode Object Methods --------------------------------------------- */
7102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007103PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105\n\
7106Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007107characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108
7109static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007110unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 return fixup(self, fixtitle);
7113}
7114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007115PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117\n\
7118Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007119have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
7121static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007122unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 return fixup(self, fixcapitalize);
7125}
7126
#if 0
/* Disabled code: kept out of the build by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word via
   fixup()/fixcapitalize and join the words again with the default
   separator.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *old_word = PyList_GET_ITEM(words, k);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, k, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return joined;
}
#endif
7164
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007165/* Argument converter. Coerces to a single unicode character */
7166
7167static int
7168convert_uc(PyObject *obj, void *addr)
7169{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007170 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7171 PyObject *uniobj;
7172 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007173
Benjamin Peterson14339b62009-01-31 16:36:08 +00007174 uniobj = PyUnicode_FromObject(obj);
7175 if (uniobj == NULL) {
7176 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007178 return 0;
7179 }
7180 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7181 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007182 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007183 Py_DECREF(uniobj);
7184 return 0;
7185 }
7186 unistr = PyUnicode_AS_UNICODE(uniobj);
7187 *fillcharloc = unistr[0];
7188 Py_DECREF(uniobj);
7189 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007190}
7191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007192PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007195Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007196done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197
7198static PyObject *
7199unicode_center(PyUnicodeObject *self, PyObject *args)
7200{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201 Py_ssize_t marg, left;
7202 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007203 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204
Thomas Woutersde017742006-02-16 19:34:37 +00007205 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 return NULL;
7207
Tim Peters7a29bd52001-09-12 03:03:31 +00007208 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209 Py_INCREF(self);
7210 return (PyObject*) self;
7211 }
7212
7213 marg = width - self->length;
7214 left = marg / 2 + (marg & width & 1);
7215
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007216 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217}
7218
Marc-André Lemburge5034372000-08-08 08:04:29 +00007219#if 0
7220
7221/* This code should go into some future Unicode collation support
7222 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007223 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007224
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007225/* speedy UTF-16 code point order comparison */
7226/* gleaned from: */
7227/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7228
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007229static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007230{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007231 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007232 0, 0, 0, 0, 0, 0, 0, 0,
7233 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007234 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007235};
7236
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237static int
7238unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007240 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007241
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 Py_UNICODE *s1 = str1->str;
7243 Py_UNICODE *s2 = str2->str;
7244
7245 len1 = str1->length;
7246 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007247
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007249 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007250
7251 c1 = *s1++;
7252 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007253
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 if (c1 > (1<<11) * 26)
7255 c1 += utf16Fixup[c1>>11];
7256 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007257 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007258 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007259
7260 if (c1 != c2)
7261 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007262
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007263 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 }
7265
7266 return (len1 < len2) ? -1 : (len1 != len2);
7267}
7268
Marc-André Lemburge5034372000-08-08 08:04:29 +00007269#else
7270
7271static int
7272unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7273{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007274 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007275
7276 Py_UNICODE *s1 = str1->str;
7277 Py_UNICODE *s2 = str2->str;
7278
7279 len1 = str1->length;
7280 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007281
Marc-André Lemburge5034372000-08-08 08:04:29 +00007282 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007283 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007284
Fredrik Lundh45714e92001-06-26 16:39:36 +00007285 c1 = *s1++;
7286 c2 = *s2++;
7287
7288 if (c1 != c2)
7289 return (c1 < c2) ? -1 : 1;
7290
Marc-André Lemburge5034372000-08-08 08:04:29 +00007291 len1--; len2--;
7292 }
7293
7294 return (len1 < len2) ? -1 : (len1 != len2);
7295}
7296
7297#endif
7298
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007302 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7303 return unicode_compare((PyUnicodeObject *)left,
7304 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007305 PyErr_Format(PyExc_TypeError,
7306 "Can't compare %.100s and %.100s",
7307 left->ob_type->tp_name,
7308 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 return -1;
7310}
7311
Martin v. Löwis5b222132007-06-10 09:51:05 +00007312int
7313PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7314{
7315 int i;
7316 Py_UNICODE *id;
7317 assert(PyUnicode_Check(uni));
7318 id = PyUnicode_AS_UNICODE(uni);
7319 /* Compare Unicode string and source character set string */
7320 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 if (id[i] != str[i])
7322 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007323 /* This check keeps Python strings that end in '\0' from comparing equal
7324 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007325 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007326 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007327 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007329 return 0;
7330}
7331
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007332
Benjamin Peterson29060642009-01-31 22:14:21 +00007333#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007334 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007335
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007336PyObject *PyUnicode_RichCompare(PyObject *left,
7337 PyObject *right,
7338 int op)
7339{
7340 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007341
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007342 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7343 PyObject *v;
7344 if (((PyUnicodeObject *) left)->length !=
7345 ((PyUnicodeObject *) right)->length) {
7346 if (op == Py_EQ) {
7347 Py_INCREF(Py_False);
7348 return Py_False;
7349 }
7350 if (op == Py_NE) {
7351 Py_INCREF(Py_True);
7352 return Py_True;
7353 }
7354 }
7355 if (left == right)
7356 result = 0;
7357 else
7358 result = unicode_compare((PyUnicodeObject *)left,
7359 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007360
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007361 /* Convert the return value to a Boolean */
7362 switch (op) {
7363 case Py_EQ:
7364 v = TEST_COND(result == 0);
7365 break;
7366 case Py_NE:
7367 v = TEST_COND(result != 0);
7368 break;
7369 case Py_LE:
7370 v = TEST_COND(result <= 0);
7371 break;
7372 case Py_GE:
7373 v = TEST_COND(result >= 0);
7374 break;
7375 case Py_LT:
7376 v = TEST_COND(result == -1);
7377 break;
7378 case Py_GT:
7379 v = TEST_COND(result == 1);
7380 break;
7381 default:
7382 PyErr_BadArgument();
7383 return NULL;
7384 }
7385 Py_INCREF(v);
7386 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007387 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007388
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007389 Py_INCREF(Py_NotImplemented);
7390 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007391}
7392
Guido van Rossum403d68b2000-03-13 15:55:09 +00007393int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007395{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007396 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007397 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007398
7399 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007400 sub = PyUnicode_FromObject(element);
7401 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 PyErr_Format(PyExc_TypeError,
7403 "'in <string>' requires string as left operand, not %s",
7404 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007405 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007406 }
7407
Thomas Wouters477c8d52006-05-27 19:21:47 +00007408 str = PyUnicode_FromObject(container);
7409 if (!str) {
7410 Py_DECREF(sub);
7411 return -1;
7412 }
7413
7414 result = stringlib_contains_obj(str, sub);
7415
7416 Py_DECREF(str);
7417 Py_DECREF(sub);
7418
Guido van Rossum403d68b2000-03-13 15:55:09 +00007419 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007420}
7421
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422/* Concat to string or Unicode object giving a new Unicode object. */
7423
7424PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426{
7427 PyUnicodeObject *u = NULL, *v = NULL, *w;
7428
7429 /* Coerce the two arguments */
7430 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7431 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7434 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436
7437 /* Shortcuts */
7438 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 Py_DECREF(v);
7440 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441 }
7442 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 Py_DECREF(u);
7444 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 }
7446
7447 /* Concat the two Unicode strings */
7448 w = _PyUnicode_New(u->length + v->length);
7449 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 Py_UNICODE_COPY(w->str, u->str, u->length);
7452 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7453
7454 Py_DECREF(u);
7455 Py_DECREF(v);
7456 return (PyObject *)w;
7457
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 Py_XDECREF(u);
7460 Py_XDECREF(v);
7461 return NULL;
7462}
7463
Walter Dörwald1ab83302007-05-18 17:15:44 +00007464void
7465PyUnicode_Append(PyObject **pleft, PyObject *right)
7466{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007467 PyObject *new;
7468 if (*pleft == NULL)
7469 return;
7470 if (right == NULL || !PyUnicode_Check(*pleft)) {
7471 Py_DECREF(*pleft);
7472 *pleft = NULL;
7473 return;
7474 }
7475 new = PyUnicode_Concat(*pleft, right);
7476 Py_DECREF(*pleft);
7477 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007478}
7479
7480void
7481PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7482{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007483 PyUnicode_Append(pleft, right);
7484 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007485}
7486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007487PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007490Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007491string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007492interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493
7494static PyObject *
7495unicode_count(PyUnicodeObject *self, PyObject *args)
7496{
7497 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007498 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007499 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 PyObject *result;
7501
Jesus Ceaac451502011-04-20 17:09:23 +02007502 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7503 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007505
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007506 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007507 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007508 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007509 substring->str, substring->length,
7510 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007511 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512
7513 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007514
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 return result;
7516}
7517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007518PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007519 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007521Encode S using the codec registered for encoding. Default encoding\n\
7522is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007523handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007524a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7525'xmlcharrefreplace' as well as any other name registered with\n\
7526codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527
7528static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007529unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007531 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 char *encoding = NULL;
7533 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007534
Benjamin Peterson308d6372009-09-18 21:42:35 +00007535 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7536 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007538 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007539}
7540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007541PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543\n\
7544Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007545If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
7547static PyObject*
7548unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7549{
7550 Py_UNICODE *e;
7551 Py_UNICODE *p;
7552 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007553 Py_UNICODE *qe;
7554 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 PyUnicodeObject *u;
7556 int tabsize = 8;
7557
7558 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560
Thomas Wouters7e474022000-07-16 12:04:32 +00007561 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007562 i = 0; /* chars up to and including most recent \n or \r */
7563 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7564 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 for (p = self->str; p < e; p++)
7566 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 if (tabsize > 0) {
7568 incr = tabsize - (j % tabsize); /* cannot overflow */
7569 if (j > PY_SSIZE_T_MAX - incr)
7570 goto overflow1;
7571 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 if (j > PY_SSIZE_T_MAX - 1)
7576 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 j++;
7578 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 if (i > PY_SSIZE_T_MAX - j)
7580 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007582 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 }
7584 }
7585
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007586 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007588
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 /* Second pass: create output string and fill it */
7590 u = _PyUnicode_New(i + j);
7591 if (!u)
7592 return NULL;
7593
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007594 j = 0; /* same as in first pass */
7595 q = u->str; /* next output char */
7596 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597
7598 for (p = self->str; p < e; p++)
7599 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 if (tabsize > 0) {
7601 i = tabsize - (j % tabsize);
7602 j += i;
7603 while (i--) {
7604 if (q >= qe)
7605 goto overflow2;
7606 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007609 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 else {
7611 if (q >= qe)
7612 goto overflow2;
7613 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007614 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 if (*p == '\n' || *p == '\r')
7616 j = 0;
7617 }
7618
7619 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007620
7621 overflow2:
7622 Py_DECREF(u);
7623 overflow1:
7624 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626}
7627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007628PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630\n\
7631Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08007632such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633arguments start and end are interpreted as in slice notation.\n\
7634\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007635Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636
7637static PyObject *
7638unicode_find(PyUnicodeObject *self, PyObject *args)
7639{
Jesus Ceaac451502011-04-20 17:09:23 +02007640 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007641 Py_ssize_t start;
7642 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007643 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644
Jesus Ceaac451502011-04-20 17:09:23 +02007645 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7646 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648
Thomas Wouters477c8d52006-05-27 19:21:47 +00007649 result = stringlib_find_slice(
7650 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7651 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7652 start, end
7653 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654
7655 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007656
Christian Heimes217cfd12007-12-02 14:31:20 +00007657 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658}
7659
7660static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007661unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662{
7663 if (index < 0 || index >= self->length) {
7664 PyErr_SetString(PyExc_IndexError, "string index out of range");
7665 return NULL;
7666 }
7667
7668 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7669}
7670
Guido van Rossumc2504932007-09-18 19:42:40 +00007671/* Believe it or not, this produces the same value for ASCII strings
7672 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007673static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007674unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675{
Guido van Rossumc2504932007-09-18 19:42:40 +00007676 Py_ssize_t len;
7677 Py_UNICODE *p;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -08007678 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +00007679
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007680#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -05007681 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007682#endif
Guido van Rossumc2504932007-09-18 19:42:40 +00007683 if (self->hash != -1)
7684 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007685 len = Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007686 /*
7687 We make the hash of the empty string be 0, rather than using
7688 (prefix ^ suffix), since this slightly obfuscates the hash secret
7689 */
7690 if (len == 0) {
7691 self->hash = 0;
7692 return 0;
7693 }
Guido van Rossumc2504932007-09-18 19:42:40 +00007694 p = self->str;
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007695 x = _Py_HashSecret.prefix;
7696 x ^= *p << 7;
Guido van Rossumc2504932007-09-18 19:42:40 +00007697 while (--len >= 0)
Gregory P. Smith63e6c322012-01-14 15:31:34 -08007698 x = (_PyHASH_MULTIPLIER*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007699 x ^= Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007700 x ^= _Py_HashSecret.suffix;
Guido van Rossumc2504932007-09-18 19:42:40 +00007701 if (x == -1)
7702 x = -2;
7703 self->hash = x;
7704 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705}
7706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007707PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007710Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711
7712static PyObject *
7713unicode_index(PyUnicodeObject *self, PyObject *args)
7714{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007715 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007716 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007717 Py_ssize_t start;
7718 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
Jesus Ceaac451502011-04-20 17:09:23 +02007720 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7721 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723
Thomas Wouters477c8d52006-05-27 19:21:47 +00007724 result = stringlib_find_slice(
7725 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7726 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7727 start, end
7728 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729
7730 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007731
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 if (result < 0) {
7733 PyErr_SetString(PyExc_ValueError, "substring not found");
7734 return NULL;
7735 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007736
Christian Heimes217cfd12007-12-02 14:31:20 +00007737 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738}
7739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007740PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007743Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007744at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745
7746static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007747unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748{
7749 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7750 register const Py_UNICODE *e;
7751 int cased;
7752
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 /* Shortcut for single character strings */
7754 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007757 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007758 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007760
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 e = p + PyUnicode_GET_SIZE(self);
7762 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007763 while (p < e) {
7764 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007765
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7767 return PyBool_FromLong(0);
7768 else if (!cased && Py_UNICODE_ISLOWER(ch))
7769 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007771 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772}
7773
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007774PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007777Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007778at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779
7780static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007781unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782{
7783 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7784 register const Py_UNICODE *e;
7785 int cased;
7786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 /* Shortcut for single character strings */
7788 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007791 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007792 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007794
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 e = p + PyUnicode_GET_SIZE(self);
7796 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007797 while (p < e) {
7798 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007799
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7801 return PyBool_FromLong(0);
7802 else if (!cased && Py_UNICODE_ISUPPER(ch))
7803 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007805 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806}
7807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007808PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007811Return True if S is a titlecased string and there is at least one\n\
7812character in S, i.e. upper- and titlecase characters may only\n\
7813follow uncased characters and lowercase characters only cased ones.\n\
7814Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815
7816static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007817unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818{
7819 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7820 register const Py_UNICODE *e;
7821 int cased, previous_is_cased;
7822
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 /* Shortcut for single character strings */
7824 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7826 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007828 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007829 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007831
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 e = p + PyUnicode_GET_SIZE(self);
7833 cased = 0;
7834 previous_is_cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007835 while (p < e) {
7836 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007837
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7839 if (previous_is_cased)
7840 return PyBool_FromLong(0);
7841 previous_is_cased = 1;
7842 cased = 1;
7843 }
7844 else if (Py_UNICODE_ISLOWER(ch)) {
7845 if (!previous_is_cased)
7846 return PyBool_FromLong(0);
7847 previous_is_cased = 1;
7848 cased = 1;
7849 }
7850 else
7851 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007853 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854}
7855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007856PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007859Return True if all characters in S are whitespace\n\
7860and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861
7862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007863unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864{
7865 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7866 register const Py_UNICODE *e;
7867
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 /* Shortcut for single character strings */
7869 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 Py_UNICODE_ISSPACE(*p))
7871 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007873 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007874 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007876
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007878 while (p < e) {
7879 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7880 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884}
7885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007886PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007888\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007889Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007890and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007891
7892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007893unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007894{
7895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7896 register const Py_UNICODE *e;
7897
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007898 /* Shortcut for single character strings */
7899 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 Py_UNICODE_ISALPHA(*p))
7901 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007902
7903 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007904 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007906
7907 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007908 while (p < e) {
7909 if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007911 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007912 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007913}
7914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007915PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007917\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007918Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007919and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007920
7921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007922unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007923{
7924 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7925 register const Py_UNICODE *e;
7926
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007927 /* Shortcut for single character strings */
7928 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 Py_UNICODE_ISALNUM(*p))
7930 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007931
7932 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007933 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007935
7936 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007937 while (p < e) {
7938 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7939 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007941 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007942 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007943}
7944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007945PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007948Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007949False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950
7951static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007952unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953{
7954 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7955 register const Py_UNICODE *e;
7956
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 /* Shortcut for single character strings */
7958 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 Py_UNICODE_ISDECIMAL(*p))
7960 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007962 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007963 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007965
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007967 while (p < e) {
7968 if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007971 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972}
7973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007974PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007977Return True if all characters in S are digits\n\
7978and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979
7980static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007981unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982{
7983 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7984 register const Py_UNICODE *e;
7985
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 /* Shortcut for single character strings */
7987 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 Py_UNICODE_ISDIGIT(*p))
7989 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007991 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007992 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007994
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007996 while (p < e) {
7997 if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008000 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001}
8002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008003PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008006Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008
8009static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008010unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011{
8012 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8013 register const Py_UNICODE *e;
8014
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 /* Shortcut for single character strings */
8016 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 Py_UNICODE_ISNUMERIC(*p))
8018 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008020 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008021 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008023
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008025 while (p < e) {
8026 if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008029 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030}
8031
Martin v. Löwis47383402007-08-15 07:32:56 +00008032int
8033PyUnicode_IsIdentifier(PyObject *self)
8034{
Benjamin Petersonf413b802011-08-12 22:17:18 -05008035 const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008036 const Py_UNICODE *e;
8037 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +00008038
8039 /* Special case for empty strings */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008040 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008042
8043 /* PEP 3131 says that the first character must be in
8044 XID_Start and subsequent characters in XID_Continue,
8045 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008046 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008047 letters, digits, underscore). However, given the current
8048 definition of XID_Start and XID_Continue, it is sufficient
8049 to check just for these, except that _ must be allowed
8050 as starting an identifier. */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008051 e = p + PyUnicode_GET_SIZE(self);
8052 first = _Py_UNICODE_NEXT(p, e);
Benjamin Petersonf413b802011-08-12 22:17:18 -05008053 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +00008054 return 0;
8055
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008056 while (p < e)
8057 if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008059 return 1;
8060}
8061
8062PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008064\n\
8065Return True if S is a valid identifier according\n\
8066to the language definition.");
8067
8068static PyObject*
8069unicode_isidentifier(PyObject *self)
8070{
8071 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8072}
8073
Georg Brandl559e5d72008-06-11 18:37:52 +00008074PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008076\n\
8077Return True if all characters in S are considered\n\
8078printable in repr() or S is empty, False otherwise.");
8079
8080static PyObject*
8081unicode_isprintable(PyObject *self)
8082{
8083 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8084 register const Py_UNICODE *e;
8085
8086 /* Shortcut for single character strings */
8087 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8088 Py_RETURN_TRUE;
8089 }
8090
8091 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008092 while (p < e) {
8093 if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
Georg Brandl559e5d72008-06-11 18:37:52 +00008094 Py_RETURN_FALSE;
8095 }
8096 }
8097 Py_RETURN_TRUE;
8098}
8099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008100PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008101 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102\n\
8103Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008104iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105
8106static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008107unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008109 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110}
8111
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113unicode_length(PyUnicodeObject *self)
8114{
8115 return self->length;
8116}
8117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008118PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008121Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008122done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123
8124static PyObject *
8125unicode_ljust(PyUnicodeObject *self, PyObject *args)
8126{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008127 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008128 Py_UNICODE fillchar = ' ';
8129
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008130 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 return NULL;
8132
Tim Peters7a29bd52001-09-12 03:03:31 +00008133 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 Py_INCREF(self);
8135 return (PyObject*) self;
8136 }
8137
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008138 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139}
8140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008141PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008144Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145
8146static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008147unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 return fixup(self, fixlower);
8150}
8151
/* Strip modes; each one is also an index into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
8160
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008161/* externally visible for str.strip(unicode) */
8162PyObject *
8163_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8164{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8166 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8167 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8168 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8169 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008170
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008172
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 i = 0;
8174 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8176 i++;
8177 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008179
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 j = len;
8181 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 do {
8183 j--;
8184 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8185 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008187
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 Py_INCREF(self);
8190 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 }
8192 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008194}
8195
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196
8197static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008198do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008200 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8201 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008202
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 i = 0;
8204 if (striptype != RIGHTSTRIP) {
8205 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8206 i++;
8207 }
8208 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008209
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 j = len;
8211 if (striptype != LEFTSTRIP) {
8212 do {
8213 j--;
8214 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8215 j++;
8216 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008217
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8219 Py_INCREF(self);
8220 return (PyObject*)self;
8221 }
8222 else
8223 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224}
8225
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008226
8227static PyObject *
8228do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8229{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008230 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008231
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8233 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008234
Benjamin Peterson14339b62009-01-31 16:36:08 +00008235 if (sep != NULL && sep != Py_None) {
8236 if (PyUnicode_Check(sep))
8237 return _PyUnicode_XStrip(self, striptype, sep);
8238 else {
8239 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 "%s arg must be None or str",
8241 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008242 return NULL;
8243 }
8244 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008245
Benjamin Peterson14339b62009-01-31 16:36:08 +00008246 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008247}
8248
8249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008250PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008252\n\
8253Return a copy of the string S with leading and trailing\n\
8254whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008255If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008256
8257static PyObject *
8258unicode_strip(PyUnicodeObject *self, PyObject *args)
8259{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008260 if (PyTuple_GET_SIZE(args) == 0)
8261 return do_strip(self, BOTHSTRIP); /* Common case */
8262 else
8263 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008264}
8265
8266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008267PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008269\n\
8270Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008271If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008272
8273static PyObject *
8274unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8275{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008276 if (PyTuple_GET_SIZE(args) == 0)
8277 return do_strip(self, LEFTSTRIP); /* Common case */
8278 else
8279 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008280}
8281
8282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008283PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008285\n\
8286Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008287If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008288
8289static PyObject *
8290unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8291{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008292 if (PyTuple_GET_SIZE(args) == 0)
8293 return do_strip(self, RIGHTSTRIP); /* Common case */
8294 else
8295 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008296}
8297
8298
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008300unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301{
8302 PyUnicodeObject *u;
8303 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008304 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008305 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
Serhiy Storchaka05997252013-01-26 12:14:02 +02008307 if (len < 1)
8308 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309
Tim Peters7a29bd52001-09-12 03:03:31 +00008310 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 /* no repeat, return original string */
8312 Py_INCREF(str);
8313 return (PyObject*) str;
8314 }
Tim Peters8f422462000-09-09 06:13:41 +00008315
8316 /* ensure # of chars needed doesn't overflow int and # of bytes
8317 * needed doesn't overflow size_t
8318 */
8319 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008320 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008321 PyErr_SetString(PyExc_OverflowError,
8322 "repeated string is too long");
8323 return NULL;
8324 }
8325 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8326 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8327 PyErr_SetString(PyExc_OverflowError,
8328 "repeated string is too long");
8329 return NULL;
8330 }
8331 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 if (!u)
8333 return NULL;
8334
8335 p = u->str;
8336
Georg Brandl222de0f2009-04-12 12:01:50 +00008337 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008338 Py_UNICODE_FILL(p, str->str[0], len);
8339 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008340 Py_ssize_t done = str->length; /* number of characters copied this far */
8341 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008343 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008344 Py_UNICODE_COPY(p+done, p, n);
8345 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 }
8348
8349 return (PyObject*) u;
8350}
8351
8352PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 PyObject *subobj,
8354 PyObject *replobj,
8355 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356{
8357 PyObject *self;
8358 PyObject *str1;
8359 PyObject *str2;
8360 PyObject *result;
8361
8362 self = PyUnicode_FromObject(obj);
8363 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 str1 = PyUnicode_FromObject(subobj);
8366 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 Py_DECREF(self);
8368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369 }
8370 str2 = PyUnicode_FromObject(replobj);
8371 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 Py_DECREF(self);
8373 Py_DECREF(str1);
8374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 }
Tim Petersced69f82003-09-16 20:30:58 +00008376 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 (PyUnicodeObject *)str1,
8378 (PyUnicodeObject *)str2,
8379 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 Py_DECREF(self);
8381 Py_DECREF(str1);
8382 Py_DECREF(str2);
8383 return result;
8384}
8385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008386PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008387 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388\n\
8389Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008390old replaced by new. If the optional argument count is\n\
8391given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392
8393static PyObject*
8394unicode_replace(PyUnicodeObject *self, PyObject *args)
8395{
8396 PyUnicodeObject *str1;
8397 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008398 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 PyObject *result;
8400
Martin v. Löwis18e16552006-02-15 17:27:45 +00008401 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 return NULL;
8403 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8404 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008407 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 Py_DECREF(str1);
8409 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411
8412 result = replace(self, str1, str2, maxcount);
8413
8414 Py_DECREF(str1);
8415 Py_DECREF(str2);
8416 return result;
8417}
8418
8419static
8420PyObject *unicode_repr(PyObject *unicode)
8421{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008422 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008423 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008424 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8425 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8426
8427 /* XXX(nnorwitz): rather than over-allocating, it would be
8428 better to choose a different scheme. Perhaps scan the
8429 first N-chars of the string and allocate based on that size.
8430 */
8431 /* Initial allocation is based on the longest-possible unichr
8432 escape.
8433
8434 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8435 unichr, so in this case it's the longest unichr escape. In
8436 narrow (UTF-16) builds this is five chars per source unichr
8437 since there are two unichrs in the surrogate pair, so in narrow
8438 (UTF-16) builds it's not the longest unichr escape.
8439
8440 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8441 so in the narrow (UTF-16) build case it's the longest unichr
8442 escape.
8443 */
8444
Walter Dörwald1ab83302007-05-18 17:15:44 +00008445 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008447#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008449#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008451#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008453 if (repr == NULL)
8454 return NULL;
8455
Walter Dörwald1ab83302007-05-18 17:15:44 +00008456 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008457
8458 /* Add quote */
8459 *p++ = (findchar(s, size, '\'') &&
8460 !findchar(s, size, '"')) ? '"' : '\'';
8461 while (size-- > 0) {
8462 Py_UNICODE ch = *s++;
8463
8464 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008465 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008466 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008467 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008468 continue;
8469 }
8470
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008472 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008473 *p++ = '\\';
8474 *p++ = 't';
8475 }
8476 else if (ch == '\n') {
8477 *p++ = '\\';
8478 *p++ = 'n';
8479 }
8480 else if (ch == '\r') {
8481 *p++ = '\\';
8482 *p++ = 'r';
8483 }
8484
8485 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008486 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008487 *p++ = '\\';
8488 *p++ = 'x';
8489 *p++ = hexdigits[(ch >> 4) & 0x000F];
8490 *p++ = hexdigits[ch & 0x000F];
8491 }
8492
Georg Brandl559e5d72008-06-11 18:37:52 +00008493 /* Copy ASCII characters as-is */
8494 else if (ch < 0x7F) {
8495 *p++ = ch;
8496 }
8497
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008499 else {
8500 Py_UCS4 ucs = ch;
8501
8502#ifndef Py_UNICODE_WIDE
8503 Py_UNICODE ch2 = 0;
8504 /* Get code point from surrogate pair */
8505 if (size > 0) {
8506 ch2 = *s;
8507 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008509 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008512 size--;
8513 }
8514 }
8515#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008517 (categories Z* and C* except ASCII space)
8518 */
8519 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8520 /* Map 8-bit characters to '\xhh' */
8521 if (ucs <= 0xff) {
8522 *p++ = '\\';
8523 *p++ = 'x';
8524 *p++ = hexdigits[(ch >> 4) & 0x000F];
8525 *p++ = hexdigits[ch & 0x000F];
8526 }
8527 /* Map 21-bit characters to '\U00xxxxxx' */
8528 else if (ucs >= 0x10000) {
8529 *p++ = '\\';
8530 *p++ = 'U';
8531 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8532 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8533 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8534 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8535 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8536 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8537 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8538 *p++ = hexdigits[ucs & 0x0000000F];
8539 }
8540 /* Map 16-bit characters to '\uxxxx' */
8541 else {
8542 *p++ = '\\';
8543 *p++ = 'u';
8544 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8545 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8546 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8547 *p++ = hexdigits[ucs & 0x000F];
8548 }
8549 }
8550 /* Copy characters as-is */
8551 else {
8552 *p++ = ch;
8553#ifndef Py_UNICODE_WIDE
8554 if (ucs >= 0x10000)
8555 *p++ = ch2;
8556#endif
8557 }
8558 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008559 }
8560 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008561 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008562
8563 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008564 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008565 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566}
8567
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008568PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570\n\
8571Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08008572such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573arguments start and end are interpreted as in slice notation.\n\
8574\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008575Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576
8577static PyObject *
8578unicode_rfind(PyUnicodeObject *self, PyObject *args)
8579{
Jesus Ceaac451502011-04-20 17:09:23 +02008580 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008581 Py_ssize_t start;
8582 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008583 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584
Jesus Ceaac451502011-04-20 17:09:23 +02008585 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8586 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Thomas Wouters477c8d52006-05-27 19:21:47 +00008589 result = stringlib_rfind_slice(
8590 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8591 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8592 start, end
8593 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594
8595 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008596
Christian Heimes217cfd12007-12-02 14:31:20 +00008597 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598}
8599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008600PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008603Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604
8605static PyObject *
8606unicode_rindex(PyUnicodeObject *self, PyObject *args)
8607{
Jesus Ceaac451502011-04-20 17:09:23 +02008608 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008609 Py_ssize_t start;
8610 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008611 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612
Jesus Ceaac451502011-04-20 17:09:23 +02008613 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8614 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616
Thomas Wouters477c8d52006-05-27 19:21:47 +00008617 result = stringlib_rfind_slice(
8618 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8619 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8620 start, end
8621 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
8623 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008624
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 if (result < 0) {
8626 PyErr_SetString(PyExc_ValueError, "substring not found");
8627 return NULL;
8628 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008629 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630}
8631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008632PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008635Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008636done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637
8638static PyObject *
8639unicode_rjust(PyUnicodeObject *self, PyObject *args)
8640{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008641 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008642 Py_UNICODE fillchar = ' ';
8643
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008644 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 return NULL;
8646
Tim Peters7a29bd52001-09-12 03:03:31 +00008647 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 Py_INCREF(self);
8649 return (PyObject*) self;
8650 }
8651
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008652 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653}
8654
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 PyObject *sep,
8657 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658{
8659 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008660
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661 s = PyUnicode_FromObject(s);
8662 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008663 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 if (sep != NULL) {
8665 sep = PyUnicode_FromObject(sep);
8666 if (sep == NULL) {
8667 Py_DECREF(s);
8668 return NULL;
8669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 }
8671
8672 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8673
8674 Py_DECREF(s);
8675 Py_XDECREF(sep);
8676 return result;
8677}
8678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008679PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681\n\
8682Return a list of the words in S, using sep as the\n\
8683delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008684splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008685whitespace string is a separator and empty strings are\n\
8686removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687
8688static PyObject*
8689unicode_split(PyUnicodeObject *self, PyObject *args)
8690{
8691 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008692 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 return NULL;
8696
8697 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703}
8704
Thomas Wouters477c8d52006-05-27 19:21:47 +00008705PyObject *
8706PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8707{
8708 PyObject* str_obj;
8709 PyObject* sep_obj;
8710 PyObject* out;
8711
8712 str_obj = PyUnicode_FromObject(str_in);
8713 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008715 sep_obj = PyUnicode_FromObject(sep_in);
8716 if (!sep_obj) {
8717 Py_DECREF(str_obj);
8718 return NULL;
8719 }
8720
8721 out = stringlib_partition(
8722 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8723 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8724 );
8725
8726 Py_DECREF(sep_obj);
8727 Py_DECREF(str_obj);
8728
8729 return out;
8730}
8731
8732
8733PyObject *
8734PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8735{
8736 PyObject* str_obj;
8737 PyObject* sep_obj;
8738 PyObject* out;
8739
8740 str_obj = PyUnicode_FromObject(str_in);
8741 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008743 sep_obj = PyUnicode_FromObject(sep_in);
8744 if (!sep_obj) {
8745 Py_DECREF(str_obj);
8746 return NULL;
8747 }
8748
8749 out = stringlib_rpartition(
8750 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8751 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8752 );
8753
8754 Py_DECREF(sep_obj);
8755 Py_DECREF(str_obj);
8756
8757 return out;
8758}
8759
8760PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008762\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008763Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008764the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008765found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008766
8767static PyObject*
8768unicode_partition(PyUnicodeObject *self, PyObject *separator)
8769{
8770 return PyUnicode_Partition((PyObject *)self, separator);
8771}
8772
8773PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008774 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008775\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008776Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008777the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008778separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008779
8780static PyObject*
8781unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8782{
8783 return PyUnicode_RPartition((PyObject *)self, separator);
8784}
8785
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008786PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 PyObject *sep,
8788 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008789{
8790 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008791
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008792 s = PyUnicode_FromObject(s);
8793 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008794 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 if (sep != NULL) {
8796 sep = PyUnicode_FromObject(sep);
8797 if (sep == NULL) {
8798 Py_DECREF(s);
8799 return NULL;
8800 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008801 }
8802
8803 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8804
8805 Py_DECREF(s);
8806 Py_XDECREF(sep);
8807 return result;
8808}
8809
8810PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008812\n\
8813Return a list of the words in S, using sep as the\n\
8814delimiter string, starting at the end of the string and\n\
8815working to the front. If maxsplit is given, at most maxsplit\n\
8816splits are done. If sep is not specified, any whitespace string\n\
8817is a separator.");
8818
8819static PyObject*
8820unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8821{
8822 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008823 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008824
Martin v. Löwis18e16552006-02-15 17:27:45 +00008825 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008826 return NULL;
8827
8828 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008830 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008832 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008834}
8835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008836PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838\n\
8839Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008840Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008841is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842
8843static PyObject*
8844unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8845{
Guido van Rossum86662912000-04-11 15:38:46 +00008846 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847
Guido van Rossum86662912000-04-11 15:38:46 +00008848 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 return NULL;
8850
Guido van Rossum86662912000-04-11 15:38:46 +00008851 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852}
8853
8854static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008855PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856{
Walter Dörwald346737f2007-05-31 10:44:43 +00008857 if (PyUnicode_CheckExact(self)) {
8858 Py_INCREF(self);
8859 return self;
8860 } else
8861 /* Subtype -- return genuine unicode string with the same value. */
8862 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8863 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864}
8865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008866PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008867 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868\n\
8869Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008870and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871
8872static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008873unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 return fixup(self, fixswapcase);
8876}
8877
Georg Brandlceee0772007-11-27 23:48:05 +00008878PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008880\n\
8881Return a translation table usable for str.translate().\n\
8882If there is only one argument, it must be a dictionary mapping Unicode\n\
8883ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008884Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008885If there are two arguments, they must be strings of equal length, and\n\
8886in the resulting dictionary, each character in x will be mapped to the\n\
8887character at the same position in y. If there is a third argument, it\n\
8888must be a string, whose characters will be mapped to None in the result.");
8889
8890static PyObject*
8891unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8892{
8893 PyObject *x, *y = NULL, *z = NULL;
8894 PyObject *new = NULL, *key, *value;
8895 Py_ssize_t i = 0;
8896 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008897
Georg Brandlceee0772007-11-27 23:48:05 +00008898 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8899 return NULL;
8900 new = PyDict_New();
8901 if (!new)
8902 return NULL;
8903 if (y != NULL) {
8904 /* x must be a string too, of equal length */
8905 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8906 if (!PyUnicode_Check(x)) {
8907 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8908 "be a string if there is a second argument");
8909 goto err;
8910 }
8911 if (PyUnicode_GET_SIZE(x) != ylen) {
8912 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8913 "arguments must have equal length");
8914 goto err;
8915 }
8916 /* create entries for translating chars in x to those in y */
8917 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008918 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008919 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +00008920 goto err;
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008921 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8922 if (!value) {
8923 Py_DECREF(key);
8924 goto err;
8925 }
Georg Brandlceee0772007-11-27 23:48:05 +00008926 res = PyDict_SetItem(new, key, value);
8927 Py_DECREF(key);
8928 Py_DECREF(value);
8929 if (res < 0)
8930 goto err;
8931 }
8932 /* create entries for deleting chars in z */
8933 if (z != NULL) {
8934 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008935 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008936 if (!key)
8937 goto err;
8938 res = PyDict_SetItem(new, key, Py_None);
8939 Py_DECREF(key);
8940 if (res < 0)
8941 goto err;
8942 }
8943 }
8944 } else {
8945 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008946 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008947 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8948 "to maketrans it must be a dict");
8949 goto err;
8950 }
8951 /* copy entries into the new dict, converting string keys to int keys */
8952 while (PyDict_Next(x, &i, &key, &value)) {
8953 if (PyUnicode_Check(key)) {
8954 /* convert string keys to integer keys */
8955 PyObject *newkey;
8956 if (PyUnicode_GET_SIZE(key) != 1) {
8957 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8958 "table must be of length 1");
8959 goto err;
8960 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008961 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008962 if (!newkey)
8963 goto err;
8964 res = PyDict_SetItem(new, newkey, value);
8965 Py_DECREF(newkey);
8966 if (res < 0)
8967 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008968 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008969 /* just keep integer keys */
8970 if (PyDict_SetItem(new, key, value) < 0)
8971 goto err;
8972 } else {
8973 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8974 "be strings or integers");
8975 goto err;
8976 }
8977 }
8978 }
8979 return new;
8980 err:
8981 Py_DECREF(new);
8982 return NULL;
8983}
8984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008985PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987\n\
8988Return a copy of the string S, where all characters have been mapped\n\
8989through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008990Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008991Unmapped characters are left untouched. Characters mapped to None\n\
8992are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993
8994static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008995unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996{
Georg Brandlceee0772007-11-27 23:48:05 +00008997 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998}
8999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009000PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009003Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004
9005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009006unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 return fixup(self, fixupper);
9009}
9010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009011PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009014Pad a numeric string S with zeros on the left, to fill a field\n\
9015of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016
9017static PyObject *
9018unicode_zfill(PyUnicodeObject *self, PyObject *args)
9019{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009020 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 PyUnicodeObject *u;
9022
Martin v. Löwis18e16552006-02-15 17:27:45 +00009023 Py_ssize_t width;
9024 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 return NULL;
9026
9027 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009028 if (PyUnicode_CheckExact(self)) {
9029 Py_INCREF(self);
9030 return (PyObject*) self;
9031 }
9032 else
9033 return PyUnicode_FromUnicode(
9034 PyUnicode_AS_UNICODE(self),
9035 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 }
9038
9039 fill = width - self->length;
9040
9041 u = pad(self, fill, 0, '0');
9042
Walter Dörwald068325e2002-04-15 13:36:47 +00009043 if (u == NULL)
9044 return NULL;
9045
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 if (u->str[fill] == '+' || u->str[fill] == '-') {
9047 /* move sign to beginning of string */
9048 u->str[0] = u->str[fill];
9049 u->str[fill] = '0';
9050 }
9051
9052 return (PyObject*) u;
9053}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009054
#if 0
/* Compiled-out helpers; they are referenced only from the matching
   "#if 0" section of the method table. */

/* Return the current value of `numfree` as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Run PyUnicode_TransformDecimalToASCII() over the string's buffer. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
                                             PyUnicode_GET_SIZE(self));
}
#endif
9069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009070PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009071 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009073Return True if S starts with the specified prefix, False otherwise.\n\
9074With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009075With optional end, stop comparing S at that position.\n\
9076prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077
9078static PyObject *
9079unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009082 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009084 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009085 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009086 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087
Jesus Ceaac451502011-04-20 17:09:23 +02009088 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009090 if (PyTuple_Check(subobj)) {
9091 Py_ssize_t i;
9092 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9093 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009095 if (substring == NULL)
9096 return NULL;
9097 result = tailmatch(self, substring, start, end, -1);
9098 Py_DECREF(substring);
9099 if (result) {
9100 Py_RETURN_TRUE;
9101 }
9102 }
9103 /* nothing matched */
9104 Py_RETURN_FALSE;
9105 }
9106 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009107 if (substring == NULL) {
9108 if (PyErr_ExceptionMatches(PyExc_TypeError))
9109 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9110 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009112 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009113 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009115 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116}
9117
9118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009119PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009122Return True if S ends with the specified suffix, False otherwise.\n\
9123With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009124With optional end, stop comparing S at that position.\n\
9125suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126
9127static PyObject *
9128unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009131 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009133 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009134 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009135 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136
Jesus Ceaac451502011-04-20 17:09:23 +02009137 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009139 if (PyTuple_Check(subobj)) {
9140 Py_ssize_t i;
9141 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9142 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009144 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009146 result = tailmatch(self, substring, start, end, +1);
9147 Py_DECREF(substring);
9148 if (result) {
9149 Py_RETURN_TRUE;
9150 }
9151 }
9152 Py_RETURN_FALSE;
9153 }
9154 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009155 if (substring == NULL) {
9156 if (PyErr_ExceptionMatches(PyExc_TypeError))
9157 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9158 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009160 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009161 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009163 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164}
9165
Eric Smith8c663262007-08-25 02:26:07 +00009166#include "stringlib/string_format.h"
9167
9168PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009170\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009171Return a formatted version of S, using substitutions from args and kwargs.\n\
9172The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009173
Eric Smith27bbca62010-11-04 17:06:58 +00009174PyDoc_STRVAR(format_map__doc__,
9175 "S.format_map(mapping) -> str\n\
9176\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009177Return a formatted version of S, using substitutions from mapping.\n\
9178The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009179
Eric Smith4a7d76d2008-05-30 18:10:19 +00009180static PyObject *
9181unicode__format__(PyObject* self, PyObject* args)
9182{
9183 PyObject *format_spec;
9184
9185 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9186 return NULL;
9187
9188 return _PyUnicode_FormatAdvanced(self,
9189 PyUnicode_AS_UNICODE(format_spec),
9190 PyUnicode_GET_SIZE(format_spec));
9191}
9192
Eric Smith8c663262007-08-25 02:26:07 +00009193PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009195\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009196Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009197
9198static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009199unicode__sizeof__(PyUnicodeObject *v)
9200{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009201 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9202 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009203}
9204
9205PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009207
9208static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009209unicode_getnewargs(PyUnicodeObject *v)
9210{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009211 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009212}
9213
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009215 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009216 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9217 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009218 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009219 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9220 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9221 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9222 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9223 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9224 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9225 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009226 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009227 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9228 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9229 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009230 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009231 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9232 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9233 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009234 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009235 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009236 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009237 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009238 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9239 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9240 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9241 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9242 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9243 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9244 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9245 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9246 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9247 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9248 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9249 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9250 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9251 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009252 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009253 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009254 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009255 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009256 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009257 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009258 {"maketrans", (PyCFunction) unicode_maketrans,
9259 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009260 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009261#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009262 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263#endif
9264
9265#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009266 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009267 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009268 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269#endif
9270
Benjamin Peterson14339b62009-01-31 16:36:08 +00009271 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009272 {NULL, NULL}
9273};
9274
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009275static PyObject *
9276unicode_mod(PyObject *v, PyObject *w)
9277{
Benjamin Peterson29060642009-01-31 22:14:21 +00009278 if (!PyUnicode_Check(v)) {
9279 Py_INCREF(Py_NotImplemented);
9280 return Py_NotImplemented;
9281 }
9282 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009283}
9284
9285static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009286 0, /*nb_add*/
9287 0, /*nb_subtract*/
9288 0, /*nb_multiply*/
9289 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009290};
9291
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009293 (lenfunc) unicode_length, /* sq_length */
9294 PyUnicode_Concat, /* sq_concat */
9295 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9296 (ssizeargfunc) unicode_getitem, /* sq_item */
9297 0, /* sq_slice */
9298 0, /* sq_ass_item */
9299 0, /* sq_ass_slice */
9300 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301};
9302
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009303static PyObject*
9304unicode_subscript(PyUnicodeObject* self, PyObject* item)
9305{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009306 if (PyIndex_Check(item)) {
9307 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009308 if (i == -1 && PyErr_Occurred())
9309 return NULL;
9310 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009311 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009312 return unicode_getitem(self, i);
9313 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009314 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009315 Py_UNICODE* source_buf;
9316 Py_UNICODE* result_buf;
9317 PyObject* result;
9318
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009319 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009320 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009321 return NULL;
9322 }
9323
9324 if (slicelength <= 0) {
9325 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009326 } else if (start == 0 && step == 1 && slicelength == self->length &&
9327 PyUnicode_CheckExact(self)) {
9328 Py_INCREF(self);
9329 return (PyObject *)self;
9330 } else if (step == 1) {
9331 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009332 } else {
9333 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009334 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9335 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009336
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 if (result_buf == NULL)
9338 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009339
9340 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9341 result_buf[i] = source_buf[cur];
9342 }
Tim Petersced69f82003-09-16 20:30:58 +00009343
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009344 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009345 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009346 return result;
9347 }
9348 } else {
9349 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9350 return NULL;
9351 }
9352}
9353
9354static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009355 (lenfunc)unicode_length, /* mp_length */
9356 (binaryfunc)unicode_subscript, /* mp_subscript */
9357 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009358};
9359
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361/* Helpers for PyUnicode_Format() */
9362
9363static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009364getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009366 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 (*p_argidx)++;
9369 if (arglen < 0)
9370 return args;
9371 else
9372 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 }
9374 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009375 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376 return NULL;
9377}
9378
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009379/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009381static PyObject *
9382formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009384 char *p;
9385 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009387
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 x = PyFloat_AsDouble(v);
9389 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009390 return NULL;
9391
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009394
Eric Smith0923d1d2009-04-16 20:16:10 +00009395 p = PyOS_double_to_string(x, type, prec,
9396 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009397 if (p == NULL)
9398 return NULL;
9399 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009400 PyMem_Free(p);
9401 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402}
9403
Tim Peters38fd5b62000-09-21 05:43:11 +00009404static PyObject*
9405formatlong(PyObject *val, int flags, int prec, int type)
9406{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009407 char *buf;
9408 int len;
9409 PyObject *str; /* temporary string object. */
9410 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009411
Benjamin Peterson14339b62009-01-31 16:36:08 +00009412 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9413 if (!str)
9414 return NULL;
9415 result = PyUnicode_FromStringAndSize(buf, len);
9416 Py_DECREF(str);
9417 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009418}
9419
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420static int
9421formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009422 size_t buflen,
9423 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009425 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009426 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 if (PyUnicode_GET_SIZE(v) == 1) {
9428 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9429 buf[1] = '\0';
9430 return 1;
9431 }
9432#ifndef Py_UNICODE_WIDE
9433 if (PyUnicode_GET_SIZE(v) == 2) {
9434 /* Decode a valid surrogate pair */
9435 int c0 = PyUnicode_AS_UNICODE(v)[0];
9436 int c1 = PyUnicode_AS_UNICODE(v)[1];
9437 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9438 0xDC00 <= c1 && c1 <= 0xDFFF) {
9439 buf[0] = c0;
9440 buf[1] = c1;
9441 buf[2] = '\0';
9442 return 2;
9443 }
9444 }
9445#endif
9446 goto onError;
9447 }
9448 else {
9449 /* Integer input truncated to a character */
9450 long x;
9451 x = PyLong_AsLong(v);
9452 if (x == -1 && PyErr_Occurred())
9453 goto onError;
9454
9455 if (x < 0 || x > 0x10ffff) {
9456 PyErr_SetString(PyExc_OverflowError,
9457 "%c arg not in range(0x110000)");
9458 return -1;
9459 }
9460
9461#ifndef Py_UNICODE_WIDE
9462 if (x > 0xffff) {
9463 x -= 0x10000;
9464 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9465 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9466 return 2;
9467 }
9468#endif
9469 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009470 buf[1] = '\0';
9471 return 1;
9472 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009473
Benjamin Peterson29060642009-01-31 22:14:21 +00009474 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009475 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009477 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478}
9479
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009480/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009481 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009482*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009483#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009484
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009486 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487{
9488 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009489 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 int args_owned = 0;
9491 PyUnicodeObject *result = NULL;
9492 PyObject *dict = NULL;
9493 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009494
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 PyErr_BadInternalCall();
9497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 }
9499 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009500 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 fmt = PyUnicode_AS_UNICODE(uformat);
9503 fmtcnt = PyUnicode_GET_SIZE(uformat);
9504
9505 reslen = rescnt = fmtcnt + 100;
9506 result = _PyUnicode_New(reslen);
9507 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509 res = PyUnicode_AS_UNICODE(result);
9510
9511 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 arglen = PyTuple_Size(args);
9513 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
9515 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 arglen = -1;
9517 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -04009519 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009520 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521
9522 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009523 if (*fmt != '%') {
9524 if (--rescnt < 0) {
9525 rescnt = fmtcnt + 100;
9526 reslen += rescnt;
9527 if (_PyUnicode_Resize(&result, reslen) < 0)
9528 goto onError;
9529 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9530 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009531 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009533 }
9534 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 /* Got a format specifier */
9536 int flags = 0;
9537 Py_ssize_t width = -1;
9538 int prec = -1;
9539 Py_UNICODE c = '\0';
9540 Py_UNICODE fill;
9541 int isnumok;
9542 PyObject *v = NULL;
9543 PyObject *temp = NULL;
9544 Py_UNICODE *pbuf;
9545 Py_UNICODE sign;
9546 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009547 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 fmt++;
9550 if (*fmt == '(') {
9551 Py_UNICODE *keystart;
9552 Py_ssize_t keylen;
9553 PyObject *key;
9554 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009555
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 if (dict == NULL) {
9557 PyErr_SetString(PyExc_TypeError,
9558 "format requires a mapping");
9559 goto onError;
9560 }
9561 ++fmt;
9562 --fmtcnt;
9563 keystart = fmt;
9564 /* Skip over balanced parentheses */
9565 while (pcount > 0 && --fmtcnt >= 0) {
9566 if (*fmt == ')')
9567 --pcount;
9568 else if (*fmt == '(')
9569 ++pcount;
9570 fmt++;
9571 }
9572 keylen = fmt - keystart - 1;
9573 if (fmtcnt < 0 || pcount > 0) {
9574 PyErr_SetString(PyExc_ValueError,
9575 "incomplete format key");
9576 goto onError;
9577 }
9578#if 0
9579 /* keys are converted to strings using UTF-8 and
9580 then looked up since Python uses strings to hold
9581 variables names etc. in its namespaces and we
9582 wouldn't want to break common idioms. */
9583 key = PyUnicode_EncodeUTF8(keystart,
9584 keylen,
9585 NULL);
9586#else
9587 key = PyUnicode_FromUnicode(keystart, keylen);
9588#endif
9589 if (key == NULL)
9590 goto onError;
9591 if (args_owned) {
9592 Py_DECREF(args);
9593 args_owned = 0;
9594 }
9595 args = PyObject_GetItem(dict, key);
9596 Py_DECREF(key);
9597 if (args == NULL) {
9598 goto onError;
9599 }
9600 args_owned = 1;
9601 arglen = -1;
9602 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009603 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 while (--fmtcnt >= 0) {
9605 switch (c = *fmt++) {
9606 case '-': flags |= F_LJUST; continue;
9607 case '+': flags |= F_SIGN; continue;
9608 case ' ': flags |= F_BLANK; continue;
9609 case '#': flags |= F_ALT; continue;
9610 case '0': flags |= F_ZERO; continue;
9611 }
9612 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009613 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009614 if (c == '*') {
9615 v = getnextarg(args, arglen, &argidx);
9616 if (v == NULL)
9617 goto onError;
9618 if (!PyLong_Check(v)) {
9619 PyErr_SetString(PyExc_TypeError,
9620 "* wants int");
9621 goto onError;
9622 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +02009623 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00009624 if (width == -1 && PyErr_Occurred())
9625 goto onError;
9626 if (width < 0) {
9627 flags |= F_LJUST;
9628 width = -width;
9629 }
9630 if (--fmtcnt >= 0)
9631 c = *fmt++;
9632 }
9633 else if (c >= '0' && c <= '9') {
9634 width = c - '0';
9635 while (--fmtcnt >= 0) {
9636 c = *fmt++;
9637 if (c < '0' || c > '9')
9638 break;
Mark Dickinsonfb90c092012-10-28 10:18:03 +00009639 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009640 PyErr_SetString(PyExc_ValueError,
9641 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009642 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009643 }
9644 width = width*10 + (c - '0');
9645 }
9646 }
9647 if (c == '.') {
9648 prec = 0;
9649 if (--fmtcnt >= 0)
9650 c = *fmt++;
9651 if (c == '*') {
9652 v = getnextarg(args, arglen, &argidx);
9653 if (v == NULL)
9654 goto onError;
9655 if (!PyLong_Check(v)) {
9656 PyErr_SetString(PyExc_TypeError,
9657 "* wants int");
9658 goto onError;
9659 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +02009660 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 if (prec == -1 && PyErr_Occurred())
9662 goto onError;
9663 if (prec < 0)
9664 prec = 0;
9665 if (--fmtcnt >= 0)
9666 c = *fmt++;
9667 }
9668 else if (c >= '0' && c <= '9') {
9669 prec = c - '0';
9670 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009671 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009672 if (c < '0' || c > '9')
9673 break;
Mark Dickinsonfb90c092012-10-28 10:18:03 +00009674 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 PyErr_SetString(PyExc_ValueError,
9676 "prec too big");
9677 goto onError;
9678 }
9679 prec = prec*10 + (c - '0');
9680 }
9681 }
9682 } /* prec */
9683 if (fmtcnt >= 0) {
9684 if (c == 'h' || c == 'l' || c == 'L') {
9685 if (--fmtcnt >= 0)
9686 c = *fmt++;
9687 }
9688 }
9689 if (fmtcnt < 0) {
9690 PyErr_SetString(PyExc_ValueError,
9691 "incomplete format");
9692 goto onError;
9693 }
9694 if (c != '%') {
9695 v = getnextarg(args, arglen, &argidx);
9696 if (v == NULL)
9697 goto onError;
9698 }
9699 sign = 0;
9700 fill = ' ';
9701 switch (c) {
9702
9703 case '%':
9704 pbuf = formatbuf;
9705 /* presume that buffer length is at least 1 */
9706 pbuf[0] = '%';
9707 len = 1;
9708 break;
9709
9710 case 's':
9711 case 'r':
9712 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009713 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009714 temp = v;
9715 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716 }
9717 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 if (c == 's')
9719 temp = PyObject_Str(v);
9720 else if (c == 'r')
9721 temp = PyObject_Repr(v);
9722 else
9723 temp = PyObject_ASCII(v);
9724 if (temp == NULL)
9725 goto onError;
9726 if (PyUnicode_Check(temp))
9727 /* nothing to do */;
9728 else {
9729 Py_DECREF(temp);
9730 PyErr_SetString(PyExc_TypeError,
9731 "%s argument has non-string str()");
9732 goto onError;
9733 }
9734 }
9735 pbuf = PyUnicode_AS_UNICODE(temp);
9736 len = PyUnicode_GET_SIZE(temp);
9737 if (prec >= 0 && len > prec)
9738 len = prec;
9739 break;
9740
9741 case 'i':
9742 case 'd':
9743 case 'u':
9744 case 'o':
9745 case 'x':
9746 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +00009747 isnumok = 0;
9748 if (PyNumber_Check(v)) {
9749 PyObject *iobj=NULL;
9750
9751 if (PyLong_Check(v)) {
9752 iobj = v;
9753 Py_INCREF(iobj);
9754 }
9755 else {
9756 iobj = PyNumber_Long(v);
9757 }
9758 if (iobj!=NULL) {
9759 if (PyLong_Check(iobj)) {
9760 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -07009761 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +00009762 Py_DECREF(iobj);
9763 if (!temp)
9764 goto onError;
9765 pbuf = PyUnicode_AS_UNICODE(temp);
9766 len = PyUnicode_GET_SIZE(temp);
9767 sign = 1;
9768 }
9769 else {
9770 Py_DECREF(iobj);
9771 }
9772 }
9773 }
9774 if (!isnumok) {
9775 PyErr_Format(PyExc_TypeError,
9776 "%%%c format: a number is required, "
9777 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9778 goto onError;
9779 }
9780 if (flags & F_ZERO)
9781 fill = '0';
9782 break;
9783
9784 case 'e':
9785 case 'E':
9786 case 'f':
9787 case 'F':
9788 case 'g':
9789 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009790 temp = formatfloat(v, flags, prec, c);
9791 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009792 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009793 pbuf = PyUnicode_AS_UNICODE(temp);
9794 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009795 sign = 1;
9796 if (flags & F_ZERO)
9797 fill = '0';
9798 break;
9799
9800 case 'c':
9801 pbuf = formatbuf;
9802 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9803 if (len < 0)
9804 goto onError;
9805 break;
9806
9807 default:
9808 PyErr_Format(PyExc_ValueError,
9809 "unsupported format character '%c' (0x%x) "
9810 "at index %zd",
9811 (31<=c && c<=126) ? (char)c : '?',
9812 (int)c,
9813 (Py_ssize_t)(fmt - 1 -
9814 PyUnicode_AS_UNICODE(uformat)));
9815 goto onError;
9816 }
9817 if (sign) {
9818 if (*pbuf == '-' || *pbuf == '+') {
9819 sign = *pbuf++;
9820 len--;
9821 }
9822 else if (flags & F_SIGN)
9823 sign = '+';
9824 else if (flags & F_BLANK)
9825 sign = ' ';
9826 else
9827 sign = 0;
9828 }
9829 if (width < len)
9830 width = len;
9831 if (rescnt - (sign != 0) < width) {
9832 reslen -= rescnt;
9833 rescnt = width + fmtcnt + 100;
9834 reslen += rescnt;
9835 if (reslen < 0) {
9836 Py_XDECREF(temp);
9837 PyErr_NoMemory();
9838 goto onError;
9839 }
9840 if (_PyUnicode_Resize(&result, reslen) < 0) {
9841 Py_XDECREF(temp);
9842 goto onError;
9843 }
9844 res = PyUnicode_AS_UNICODE(result)
9845 + reslen - rescnt;
9846 }
9847 if (sign) {
9848 if (fill != ' ')
9849 *res++ = sign;
9850 rescnt--;
9851 if (width > len)
9852 width--;
9853 }
9854 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9855 assert(pbuf[0] == '0');
9856 assert(pbuf[1] == c);
9857 if (fill != ' ') {
9858 *res++ = *pbuf++;
9859 *res++ = *pbuf++;
9860 }
9861 rescnt -= 2;
9862 width -= 2;
9863 if (width < 0)
9864 width = 0;
9865 len -= 2;
9866 }
9867 if (width > len && !(flags & F_LJUST)) {
9868 do {
9869 --rescnt;
9870 *res++ = fill;
9871 } while (--width > len);
9872 }
9873 if (fill == ' ') {
9874 if (sign)
9875 *res++ = sign;
9876 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9877 assert(pbuf[0] == '0');
9878 assert(pbuf[1] == c);
9879 *res++ = *pbuf++;
9880 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881 }
9882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009883 Py_UNICODE_COPY(res, pbuf, len);
9884 res += len;
9885 rescnt -= len;
9886 while (--width >= len) {
9887 --rescnt;
9888 *res++ = ' ';
9889 }
9890 if (dict && (argidx < arglen) && c != '%') {
9891 PyErr_SetString(PyExc_TypeError,
9892 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009893 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009894 goto onError;
9895 }
9896 Py_XDECREF(temp);
9897 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898 } /* until end */
9899 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009900 PyErr_SetString(PyExc_TypeError,
9901 "not all arguments converted during string formatting");
9902 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 }
9904
Thomas Woutersa96affe2006-03-12 00:29:36 +00009905 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009908 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909 }
9910 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911 return (PyObject *)result;
9912
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914 Py_XDECREF(result);
9915 Py_DECREF(uformat);
9916 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918 }
9919 return NULL;
9920}
9921
Jeremy Hylton938ace62002-07-17 16:30:39 +00009922static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009923unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9924
Tim Peters6d6c1a32001-08-02 04:15:00 +00009925static PyObject *
9926unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9927{
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009929 static char *kwlist[] = {"object", "encoding", "errors", 0};
9930 char *encoding = NULL;
9931 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009932
Benjamin Peterson14339b62009-01-31 16:36:08 +00009933 if (type != &PyUnicode_Type)
9934 return unicode_subtype_new(type, args, kwds);
9935 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009937 return NULL;
9938 if (x == NULL)
9939 return (PyObject *)_PyUnicode_New(0);
9940 if (encoding == NULL && errors == NULL)
9941 return PyObject_Str(x);
9942 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009944}
9945
Guido van Rossume023fe02001-08-30 03:12:59 +00009946static PyObject *
9947unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9948{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009949 PyUnicodeObject *tmp, *pnew;
9950 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009951
Benjamin Peterson14339b62009-01-31 16:36:08 +00009952 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9953 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9954 if (tmp == NULL)
9955 return NULL;
9956 assert(PyUnicode_Check(tmp));
9957 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9958 if (pnew == NULL) {
9959 Py_DECREF(tmp);
9960 return NULL;
9961 }
9962 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9963 if (pnew->str == NULL) {
9964 _Py_ForgetReference((PyObject *)pnew);
9965 PyObject_Del(pnew);
9966 Py_DECREF(tmp);
9967 return PyErr_NoMemory();
9968 }
9969 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9970 pnew->length = n;
9971 pnew->hash = tmp->hash;
9972 Py_DECREF(tmp);
9973 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009974}
9975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009976PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -07009977"str(object='') -> str\n\
9978str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009979\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +10009980Create a new string object from the given object. If encoding or\n\
9981errors is specified, then the object must expose a data buffer\n\
9982that will be decoded using the given encoding and error handler.\n\
9983Otherwise, returns the result of object.__str__() (if defined)\n\
9984or repr(object).\n\
9985encoding defaults to sys.getdefaultencoding().\n\
9986errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009987
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009988static PyObject *unicode_iter(PyObject *seq);
9989
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009991 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009992 "str", /* tp_name */
9993 sizeof(PyUnicodeObject), /* tp_size */
9994 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009996 (destructor)unicode_dealloc, /* tp_dealloc */
9997 0, /* tp_print */
9998 0, /* tp_getattr */
9999 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010000 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010001 unicode_repr, /* tp_repr */
10002 &unicode_as_number, /* tp_as_number */
10003 &unicode_as_sequence, /* tp_as_sequence */
10004 &unicode_as_mapping, /* tp_as_mapping */
10005 (hashfunc) unicode_hash, /* tp_hash*/
10006 0, /* tp_call*/
10007 (reprfunc) unicode_str, /* tp_str */
10008 PyObject_GenericGetAttr, /* tp_getattro */
10009 0, /* tp_setattro */
10010 0, /* tp_as_buffer */
10011 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010012 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010013 unicode_doc, /* tp_doc */
10014 0, /* tp_traverse */
10015 0, /* tp_clear */
10016 PyUnicode_RichCompare, /* tp_richcompare */
10017 0, /* tp_weaklistoffset */
10018 unicode_iter, /* tp_iter */
10019 0, /* tp_iternext */
10020 unicode_methods, /* tp_methods */
10021 0, /* tp_members */
10022 0, /* tp_getset */
10023 &PyBaseObject_Type, /* tp_base */
10024 0, /* tp_dict */
10025 0, /* tp_descr_get */
10026 0, /* tp_descr_set */
10027 0, /* tp_dictoffset */
10028 0, /* tp_init */
10029 0, /* tp_alloc */
10030 unicode_new, /* tp_new */
10031 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032};
10033
10034/* Initialize the Unicode implementation */
10035
Thomas Wouters78890102000-07-22 19:25:51 +000010036void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010038 /* XXX - move this array to unicodectype.c ? */
10039 Py_UNICODE linebreak[] = {
10040 0x000A, /* LINE FEED */
10041 0x000D, /* CARRIAGE RETURN */
10042 0x001C, /* FILE SEPARATOR */
10043 0x001D, /* GROUP SEPARATOR */
10044 0x001E, /* RECORD SEPARATOR */
10045 0x0085, /* NEXT LINE */
10046 0x2028, /* LINE SEPARATOR */
10047 0x2029, /* PARAGRAPH SEPARATOR */
10048 };
10049
Fred Drakee4315f52000-05-09 19:53:39 +000010050 /* Init the implementation */
Serhiy Storchaka05997252013-01-26 12:14:02 +020010051 if (!unicode_empty) {
10052 unicode_empty = _PyUnicode_New(0);
10053 if (!unicode_empty)
10054 return;
10055 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010056
Guido van Rossumcacfc072002-05-24 19:01:59 +000010057 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010058 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010059
10060 /* initialize the linebreak bloom filter */
10061 bloom_linebreak = make_bloom_mask(
10062 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10063 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010064
10065 PyType_Ready(&EncodingMapType);
Benjamin Petersonc4311282012-10-30 23:21:10 -040010066
10067 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
10068 Py_FatalError("Can't initialize field name iterator type");
10069
10070 if (PyType_Ready(&PyFormatterIter_Type) < 0)
10071 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010072}
10073
10074/* Finalize the Unicode implementation */
10075
Christian Heimesa156e092008-02-16 07:38:31 +000010076int
10077PyUnicode_ClearFreeList(void)
10078{
10079 int freelist_size = numfree;
10080 PyUnicodeObject *u;
10081
10082 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010083 PyUnicodeObject *v = u;
10084 u = *(PyUnicodeObject **)u;
10085 if (v->str)
10086 PyObject_DEL(v->str);
10087 Py_XDECREF(v->defenc);
10088 PyObject_Del(v);
10089 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010090 }
10091 free_list = NULL;
10092 assert(numfree == 0);
10093 return freelist_size;
10094}
10095
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096void
Thomas Wouters78890102000-07-22 19:25:51 +000010097_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010099 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100
Serhiy Storchaka05997252013-01-26 12:14:02 +020010101 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010102
Serhiy Storchaka05997252013-01-26 12:14:02 +020010103 for (i = 0; i < 256; i++)
10104 Py_CLEAR(unicode_latin1[i]);
10105
Christian Heimesa156e092008-02-16 07:38:31 +000010106 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010108
Walter Dörwald16807132007-05-25 13:52:07 +000010109void
10110PyUnicode_InternInPlace(PyObject **p)
10111{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010112 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10113 PyObject *t;
10114 if (s == NULL || !PyUnicode_Check(s))
10115 Py_FatalError(
10116 "PyUnicode_InternInPlace: unicode strings only please!");
10117 /* If it's a subclass, we don't really know what putting
10118 it in the interned dict might do. */
10119 if (!PyUnicode_CheckExact(s))
10120 return;
10121 if (PyUnicode_CHECK_INTERNED(s))
10122 return;
10123 if (interned == NULL) {
10124 interned = PyDict_New();
10125 if (interned == NULL) {
10126 PyErr_Clear(); /* Don't leave an exception */
10127 return;
10128 }
10129 }
10130 /* It might be that the GetItem call fails even
10131 though the key is present in the dictionary,
10132 namely when this happens during a stack overflow. */
10133 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010134 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010135 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010136
Benjamin Peterson29060642009-01-31 22:14:21 +000010137 if (t) {
10138 Py_INCREF(t);
10139 Py_DECREF(*p);
10140 *p = t;
10141 return;
10142 }
Walter Dörwald16807132007-05-25 13:52:07 +000010143
Benjamin Peterson14339b62009-01-31 16:36:08 +000010144 PyThreadState_GET()->recursion_critical = 1;
10145 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10146 PyErr_Clear();
10147 PyThreadState_GET()->recursion_critical = 0;
10148 return;
10149 }
10150 PyThreadState_GET()->recursion_critical = 0;
10151 /* The two references in interned are not counted by refcnt.
10152 The deallocator will take care of this */
10153 Py_REFCNT(s) -= 2;
10154 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010155}
10156
10157void
10158PyUnicode_InternImmortal(PyObject **p)
10159{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010160 PyUnicode_InternInPlace(p);
10161 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10162 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10163 Py_INCREF(*p);
10164 }
Walter Dörwald16807132007-05-25 13:52:07 +000010165}
10166
10167PyObject *
10168PyUnicode_InternFromString(const char *cp)
10169{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010170 PyObject *s = PyUnicode_FromString(cp);
10171 if (s == NULL)
10172 return NULL;
10173 PyUnicode_InternInPlace(&s);
10174 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010175}
10176
10177void _Py_ReleaseInternedUnicodeStrings(void)
10178{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010179 PyObject *keys;
10180 PyUnicodeObject *s;
10181 Py_ssize_t i, n;
10182 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010183
Benjamin Peterson14339b62009-01-31 16:36:08 +000010184 if (interned == NULL || !PyDict_Check(interned))
10185 return;
10186 keys = PyDict_Keys(interned);
10187 if (keys == NULL || !PyList_Check(keys)) {
10188 PyErr_Clear();
10189 return;
10190 }
Walter Dörwald16807132007-05-25 13:52:07 +000010191
Benjamin Peterson14339b62009-01-31 16:36:08 +000010192 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10193 detector, interned unicode strings are not forcibly deallocated;
10194 rather, we give them their stolen references back, and then clear
10195 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010196
Benjamin Peterson14339b62009-01-31 16:36:08 +000010197 n = PyList_GET_SIZE(keys);
10198 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010199 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010200 for (i = 0; i < n; i++) {
10201 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10202 switch (s->state) {
10203 case SSTATE_NOT_INTERNED:
10204 /* XXX Shouldn't happen */
10205 break;
10206 case SSTATE_INTERNED_IMMORTAL:
10207 Py_REFCNT(s) += 1;
10208 immortal_size += s->length;
10209 break;
10210 case SSTATE_INTERNED_MORTAL:
10211 Py_REFCNT(s) += 2;
10212 mortal_size += s->length;
10213 break;
10214 default:
10215 Py_FatalError("Inconsistent interned string state.");
10216 }
10217 s->state = SSTATE_NOT_INTERNED;
10218 }
10219 fprintf(stderr, "total size of all interned strings: "
10220 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10221 "mortal/immortal\n", mortal_size, immortal_size);
10222 Py_DECREF(keys);
10223 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020010224 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000010225}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010226
10227
10228/********************* Unicode Iterator **************************/
10229
10230typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010231 PyObject_HEAD
10232 Py_ssize_t it_index;
10233 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010234} unicodeiterobject;
10235
10236static void
10237unicodeiter_dealloc(unicodeiterobject *it)
10238{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010239 _PyObject_GC_UNTRACK(it);
10240 Py_XDECREF(it->it_seq);
10241 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010242}
10243
10244static int
10245unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10246{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010247 Py_VISIT(it->it_seq);
10248 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010249}
10250
10251static PyObject *
10252unicodeiter_next(unicodeiterobject *it)
10253{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010254 PyUnicodeObject *seq;
10255 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010256
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 assert(it != NULL);
10258 seq = it->it_seq;
10259 if (seq == NULL)
10260 return NULL;
10261 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010262
Benjamin Peterson14339b62009-01-31 16:36:08 +000010263 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10264 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010265 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010266 if (item != NULL)
10267 ++it->it_index;
10268 return item;
10269 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010270
Benjamin Peterson14339b62009-01-31 16:36:08 +000010271 Py_DECREF(seq);
10272 it->it_seq = NULL;
10273 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010274}
10275
10276static PyObject *
10277unicodeiter_len(unicodeiterobject *it)
10278{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010279 Py_ssize_t len = 0;
10280 if (it->it_seq)
10281 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10282 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010283}
10284
10285PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10286
10287static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010288 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010290 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010291};
10292
10293PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010294 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10295 "str_iterator", /* tp_name */
10296 sizeof(unicodeiterobject), /* tp_basicsize */
10297 0, /* tp_itemsize */
10298 /* methods */
10299 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10300 0, /* tp_print */
10301 0, /* tp_getattr */
10302 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010303 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010304 0, /* tp_repr */
10305 0, /* tp_as_number */
10306 0, /* tp_as_sequence */
10307 0, /* tp_as_mapping */
10308 0, /* tp_hash */
10309 0, /* tp_call */
10310 0, /* tp_str */
10311 PyObject_GenericGetAttr, /* tp_getattro */
10312 0, /* tp_setattro */
10313 0, /* tp_as_buffer */
10314 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10315 0, /* tp_doc */
10316 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10317 0, /* tp_clear */
10318 0, /* tp_richcompare */
10319 0, /* tp_weaklistoffset */
10320 PyObject_SelfIter, /* tp_iter */
10321 (iternextfunc)unicodeiter_next, /* tp_iternext */
10322 unicodeiter_methods, /* tp_methods */
10323 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010324};
10325
10326static PyObject *
10327unicode_iter(PyObject *seq)
10328{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010329 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010330
Benjamin Peterson14339b62009-01-31 16:36:08 +000010331 if (!PyUnicode_Check(seq)) {
10332 PyErr_BadInternalCall();
10333 return NULL;
10334 }
10335 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10336 if (it == NULL)
10337 return NULL;
10338 it->it_index = 0;
10339 Py_INCREF(seq);
10340 it->it_seq = (PyUnicodeObject *)seq;
10341 _PyObject_GC_TRACK(it);
10342 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010343}
10344
/* Return the number of Py_UNICODE elements that precede the terminating
   zero element of u.

   The length is derived from a pointer difference and returned as size_t,
   so it cannot overflow an intermediate int counter on very long
   strings. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end)
        end++;
    return (size_t)(end - u);
}
10353
/* Copy the zero-terminated string s2, including its terminator, into the
   buffer s1.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        const Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
10361
/* Copy at most n elements of the zero-terminated string s2 into s1.

   Copying stops right after the terminating zero has been written, or
   once n elements have been written, whichever comes first.  As with
   strncpy(), the result is NOT zero-terminated when s2 holds n or more
   non-zero elements; unlike strncpy(), the rest of s1 is not padded.
   Returns s1.

   The remaining count is tested before each store, so no more than n
   elements of s1 are ever written (nothing at all when n is 0). */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    while (n-- != 0) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
10371
/* Append the zero-terminated string s2 (terminator included) to the end
   of the zero-terminated string s1.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* Locate the terminator of s1 ... */
    while (*tail)
        tail++;
    /* ... and overwrite from there with s2, up to and including its
       terminator. */
    while ((*tail++ = *s2++) != 0)
        ;
    return s1;
}
10380
/* Compare two zero-terminated strings element by element.
   Returns -1, 0 or +1 when s1 sorts before, equal to or after s2; a
   string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        const Py_UNICODE a = *s1;
        const Py_UNICODE b = *s2;

        if (a == 0 || b == 0) {
            /* At least one string ended: the one with elements left is
               the greater one. */
            if (a != 0)
                return 1;
            if (b != 0)
                return -1;
            return 0;
        }
        if (a != b)
            return (a < b) ? -1 : +1;
    }
}
10394
/* Compare at most the first n elements of two zero-terminated strings.
   Returns -1 or +1 at the first differing element, and 0 when the
   strings agree over the first n elements or up to a common
   terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == 0)
            break;      /* both strings ended at the same position */
    }
    return 0;
}
10411
/* Find the first occurrence of c in the zero-terminated string s.
   Returns a pointer to it, or NULL when c does not occur.  The
   terminator itself is never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
10421
/* Find the last occurrence of c in the zero-terminated string s.
   Returns a pointer to it, or NULL when c does not occur.  The
   terminator itself is never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass that remembers the most recent match. */
    for (; *s != 0; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
10434
Victor Stinner71133ff2010-09-01 23:43:53 +000010435Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010436PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010437{
10438 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10439 Py_UNICODE *copy;
10440 Py_ssize_t size;
10441
10442 /* Ensure we won't overflow the size. */
10443 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10444 PyErr_NoMemory();
10445 return NULL;
10446 }
10447 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10448 size *= sizeof(Py_UNICODE);
10449 copy = PyMem_Malloc(size);
10450 if (copy == NULL) {
10451 PyErr_NoMemory();
10452 return NULL;
10453 }
10454 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10455 return copy;
10456}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010457
Georg Brandl66c221e2010-10-14 07:04:07 +000010458/* A _string module, to export formatter_parser and formatter_field_name_split
10459 to the string.Formatter class implemented in Python. */
10460
10461static PyMethodDef _string_methods[] = {
10462 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10463 METH_O, PyDoc_STR("split the argument as a field name")},
10464 {"formatter_parser", (PyCFunction) formatter_parser,
10465 METH_O, PyDoc_STR("parse the argument as a format string")},
10466 {NULL, NULL}
10467};
10468
10469static struct PyModuleDef _string_module = {
10470 PyModuleDef_HEAD_INIT,
10471 "_string",
10472 PyDoc_STR("string helper module"),
10473 0,
10474 _string_methods,
10475 NULL,
10476 NULL,
10477 NULL,
10478 NULL
10479};
10480
10481PyMODINIT_FUNC
10482PyInit__string(void)
10483{
10484 return PyModule_Create(&_string_module);
10485}
10486
10487
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010488#ifdef __cplusplus
10489}
10490#endif