blob: 92d17771e2adc1a7d66341a73de20a068f70d7cb [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
Serhiy Storchaka05997252013-01-26 12:14:02 +020083NOTE: In the interpreter's initialization phase, some globals are currently
84 initialized dynamically as needed. In the process Unicode objects may
85 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086
87*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
90#ifdef __cplusplus
91extern "C" {
92#endif
93
Walter Dörwald16807132007-05-25 13:52:07 +000094/* This dictionary holds all interned unicode strings. Note that references
95 to strings in this dictionary are *not* counted in the string's ob_refcnt.
96 When the interned string reaches a refcnt of 0 the string deallocation
97 function will delete the reference from this dictionary.
98
99 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000100 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000101*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200102static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000103
Guido van Rossumd57fd912000-03-10 22:53:23 +0000104/* Free list for Unicode objects */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200105static PyUnicodeObject *free_list = NULL;
106static int numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000107
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000108/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200109static PyUnicodeObject *unicode_empty = NULL;
110
/* Return a new reference to the shared empty string from the enclosing
   function, creating the singleton first if it does not exist yet.

   The macro expands to a `return` statement, so it can only be used in
   functions whose return type is PyObject *.  When the singleton has
   just been created it is INCREF'ed once more, so that the static
   pointer keeps its own reference in addition to the one handed to the
   caller.  If creation fails, NULL is returned (the error has been set
   by _PyUnicode_New()). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = _PyUnicode_New(0);          \
            if (unicode_empty != NULL)                  \
                Py_INCREF(unicode_empty);               \
        }                                               \
        return (PyObject *)unicode_empty;               \
    } while (0)

123/* Single character Unicode strings in the Latin-1 range are being
124 shared as well. */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200125static PyUnicodeObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the code point is treated as whitespace and 0 otherwise.  Each
   row below covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x00 - 0x07 */
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,             /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x10 - 0x17 */
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,             /* 0x18 - 0x1F */
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,             /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0              /* 0x78 - 0x7F */
};

Martin v. Löwisdb12d452009-05-02 18:52:14 +0000158static PyObject *unicode_encode_call_errorhandler(const char *errors,
159 PyObject **errorHandler,const char *encoding, const char *reason,
160 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
161 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
162
Victor Stinner31be90b2010-04-22 19:38:16 +0000163static void raise_encode_exception(PyObject **exceptionObject,
164 const char *encoding,
165 const Py_UNICODE *unicode, Py_ssize_t size,
166 Py_ssize_t startpos, Py_ssize_t endpos,
167 const char *reason);
168
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the code point is treated as a line break and 0 otherwise.  The
   table is only ever read, hence const.  Each row below covers eight
   consecutive code points. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x00 - 0x07 */
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,             /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x10 - 0x17 */
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,             /* 0x18 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,             /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0              /* 0x78 - 0x7F */
};


Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000198Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000199PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000200{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000201#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000202 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000203#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000204 /* This is actually an illegal character, so it should
205 not be passed to unichr. */
206 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000207#endif
208}
209
Thomas Wouters477c8d52006-05-27 19:21:47 +0000210/* --- Bloom Filters ----------------------------------------------------- */
211
212/* stuff to implement simple "bloom filters" for Unicode characters.
213 to keep things simple, we use a single bitmask, using the least 5
214 bits from each unicode characters as the bit index. */
215
216/* the linebreak mask is set up by Unicode_Init below */
217
Antoine Pitrouf068f942010-01-13 14:19:12 +0000218#if LONG_BIT >= 128
219#define BLOOM_WIDTH 128
220#elif LONG_BIT >= 64
221#define BLOOM_WIDTH 64
222#elif LONG_BIT >= 32
223#define BLOOM_WIDTH 32
224#else
225#error "LONG_BIT is smaller than 32"
226#endif
227
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228#define BLOOM_MASK unsigned long
229
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231
Antoine Pitrouf068f942010-01-13 14:19:12 +0000232#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
233#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234
Benjamin Peterson29060642009-01-31 22:14:21 +0000235#define BLOOM_LINEBREAK(ch) \
236 ((ch) < 128U ? ascii_linebreak[(ch)] : \
237 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000238
239Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
240{
241 /* calculate simple bloom-style bitmask for a given unicode string */
242
Antoine Pitrouf068f942010-01-13 14:19:12 +0000243 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000244 Py_ssize_t i;
245
246 mask = 0;
247 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000248 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000249
250 return mask;
251}
252
253Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
254{
255 Py_ssize_t i;
256
257 for (i = 0; i < setlen; i++)
258 if (set[i] == chr)
259 return 1;
260
261 return 0;
262}
263
/* Membership test with a bloom pre-filter: the linear scan in
   unicode_member() is only performed when the mask says the character
   may be present.  The expansion is fully parenthesized so that it can
   be combined safely with other operators (e.g. `!` or `||`); `chr` is
   evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))

Guido van Rossumd57fd912000-03-10 22:53:23 +0000267/* --- Unicode Object ----------------------------------------------------- */
268
269static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272{
273 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000274
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000275 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000277 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279 /* Resizing shared object (unicode_empty or single character
280 objects) in-place is not allowed. Use PyUnicode_Resize()
281 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000284 (unicode->length == 1 &&
285 unicode->str[0] < 256U &&
286 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000288 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 return -1;
290 }
291
Thomas Wouters477c8d52006-05-27 19:21:47 +0000292 /* We allocate one more byte to make sure the string is Ux0000 terminated.
293 The overallocation is also used by fastsearch, which assumes that it's
294 safe to look at str[length] (without making any assumptions about what
295 it contains). */
296
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000298 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000301 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 PyErr_NoMemory();
303 return -1;
304 }
305 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307
Benjamin Peterson29060642009-01-31 22:14:21 +0000308 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000310 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000311 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312 }
313 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000314
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315 return 0;
316}
317
318/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000319 Ux0000 terminated; some code (e.g. new_identifier)
320 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000321
322 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000323 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324
325*/
326
327static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000328PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329{
330 register PyUnicodeObject *unicode;
331
Thomas Wouters477c8d52006-05-27 19:21:47 +0000332 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 if (length == 0 && unicode_empty != NULL) {
334 Py_INCREF(unicode_empty);
335 return unicode_empty;
336 }
337
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000338 /* Ensure we won't overflow the size. */
339 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
340 return (PyUnicodeObject *)PyErr_NoMemory();
341 }
342
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000344 if (free_list) {
345 unicode = free_list;
346 free_list = *(PyUnicodeObject **)unicode;
347 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 if (unicode->str) {
349 /* Keep-Alive optimization: we only upsize the buffer,
350 never downsize it. */
351 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000352 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 PyObject_DEL(unicode->str);
354 unicode->str = NULL;
355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000356 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000357 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000360 }
361 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 }
363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000365 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (unicode == NULL)
367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000368 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
369 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000370 }
371
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000372 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000373 PyErr_NoMemory();
374 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000375 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000376 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000377 * the caller fails before initializing str -- unicode_resize()
378 * reads str[0], and the Keep-Alive optimization can keep memory
379 * allocated for str alive across a call to unicode_dealloc(unicode).
380 * We don't want unicode_resize to read uninitialized memory in
381 * that case.
382 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000383 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000385 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000387 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000388 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000390
Benjamin Peterson29060642009-01-31 22:14:21 +0000391 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000392 /* XXX UNREF/NEWREF interface should be more symmetrical */
393 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000394 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000395 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397}
398
399static
Guido van Rossum9475a232001-10-05 20:51:39 +0000400void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401{
Walter Dörwald16807132007-05-25 13:52:07 +0000402 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_NOT_INTERNED:
404 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 case SSTATE_INTERNED_MORTAL:
407 /* revive dead object temporarily for DelItem */
408 Py_REFCNT(unicode) = 3;
409 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
410 Py_FatalError(
411 "deletion of interned string failed");
412 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000413
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 case SSTATE_INTERNED_IMMORTAL:
415 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000416
Benjamin Peterson29060642009-01-31 22:14:21 +0000417 default:
418 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000419 }
420
Guido van Rossum604ddf82001-12-06 20:03:56 +0000421 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000423 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000424 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
425 PyObject_DEL(unicode->str);
426 unicode->str = NULL;
427 unicode->length = 0;
428 }
429 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000430 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000431 }
432 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000433 *(PyUnicodeObject **)unicode = free_list;
434 free_list = unicode;
435 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436 }
437 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyObject_DEL(unicode->str);
439 Py_XDECREF(unicode->defenc);
440 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441 }
442}
443
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444static
445int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446{
447 register PyUnicodeObject *v;
448
449 /* Argument checks */
450 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 PyErr_BadInternalCall();
452 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000453 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000454 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000455 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000456 PyErr_BadInternalCall();
457 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000458 }
459
460 /* Resizing unicode_empty and single character objects is not
461 possible since these are being shared. We simply return a fresh
462 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000463 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000464 (v == unicode_empty || v->length == 1)) {
465 PyUnicodeObject *w = _PyUnicode_New(length);
466 if (w == NULL)
467 return -1;
468 Py_UNICODE_COPY(w->str, v->str,
469 length < v->length ? length : v->length);
470 Py_DECREF(*unicode);
471 *unicode = w;
472 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000473 }
474
475 /* Note that we don't have to modify *unicode for unshared Unicode
476 objects, since we can modify them in-place. */
477 return unicode_resize(v, length);
478}
479
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000480int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
481{
482 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
483}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000484
Guido van Rossumd57fd912000-03-10 22:53:23 +0000485PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000487{
488 PyUnicodeObject *unicode;
489
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000490 /* If the Unicode data is known at construction time, we can apply
491 some optimizations which share commonly used objects. */
492 if (u != NULL) {
493
Benjamin Peterson29060642009-01-31 22:14:21 +0000494 /* Optimization for empty strings */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200495 if (size == 0)
496 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +0000497
498 /* Single character Unicode objects in the Latin-1 range are
499 shared when using this constructor */
500 if (size == 1 && *u < 256) {
501 unicode = unicode_latin1[*u];
502 if (!unicode) {
503 unicode = _PyUnicode_New(1);
504 if (!unicode)
505 return NULL;
506 unicode->str[0] = *u;
507 unicode_latin1[*u] = unicode;
508 }
509 Py_INCREF(unicode);
510 return (PyObject *)unicode;
511 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode = _PyUnicode_New(size);
515 if (!unicode)
516 return NULL;
517
518 /* Copy the Unicode data into the new object */
519 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521
522 return (PyObject *)unicode;
523}
524
Walter Dörwaldd2034312007-05-18 16:29:38 +0000525PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000526{
527 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000528
Benjamin Peterson14339b62009-01-31 16:36:08 +0000529 if (size < 0) {
530 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000531 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000532 return NULL;
533 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000534
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000535 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000536 some optimizations which share commonly used objects.
537 Also, this means the input must be UTF-8, so fall back to the
538 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000539 if (u != NULL) {
540
Benjamin Peterson29060642009-01-31 22:14:21 +0000541 /* Optimization for empty strings */
Serhiy Storchaka05997252013-01-26 12:14:02 +0200542 if (size == 0)
543 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +0000544
545 /* Single characters are shared when using this constructor.
546 Restrict to ASCII, since the input must be UTF-8. */
547 if (size == 1 && Py_CHARMASK(*u) < 128) {
548 unicode = unicode_latin1[Py_CHARMASK(*u)];
549 if (!unicode) {
550 unicode = _PyUnicode_New(1);
551 if (!unicode)
552 return NULL;
553 unicode->str[0] = Py_CHARMASK(*u);
554 unicode_latin1[Py_CHARMASK(*u)] = unicode;
555 }
556 Py_INCREF(unicode);
557 return (PyObject *)unicode;
558 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000559
560 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 }
562
Walter Dörwald55507312007-05-18 13:12:10 +0000563 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 if (!unicode)
565 return NULL;
566
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000567 return (PyObject *)unicode;
568}
569
Walter Dörwaldd2034312007-05-18 16:29:38 +0000570PyObject *PyUnicode_FromString(const char *u)
571{
572 size_t size = strlen(u);
573 if (size > PY_SSIZE_T_MAX) {
574 PyErr_SetString(PyExc_OverflowError, "input too long");
575 return NULL;
576 }
577
578 return PyUnicode_FromStringAndSize(u, size);
579}
580
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581#ifdef HAVE_WCHAR_H
582
Mark Dickinson081dfee2009-03-18 14:47:41 +0000583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594{
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 if (size == 0)
602 return PyUnicode_FromStringAndSize(NULL, 0);
603 PyErr_BadInternalCall();
604 return NULL;
605 }
606
607 if (size == -1) {
608 size = wcslen(w);
609 }
610
611 alloc = size;
612 orig_w = w;
613 for (i = size; i > 0; i--) {
614 if (*w > 0xFFFF)
615 alloc++;
616 w++;
617 }
618 w = orig_w;
619 unicode = _PyUnicode_New(alloc);
620 if (!unicode)
621 return NULL;
622
623 /* Copy the wchar_t data into the new object */
624 {
625 register Py_UNICODE *u;
626 u = PyUnicode_AS_UNICODE(unicode);
627 for (i = size; i > 0; i--) {
628 if (*w > 0xFFFF) {
629 wchar_t ordinal = *w++;
630 ordinal -= 0x10000;
631 *u++ = 0xD800 | (ordinal >> 10);
632 *u++ = 0xDC00 | (ordinal & 0x3FF);
633 }
634 else
635 *u++ = *w++;
636 }
637 }
638 return (PyObject *)unicode;
639}
640
641#else
642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000644 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000645{
646 PyUnicodeObject *unicode;
647
648 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == 0)
650 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_BadInternalCall();
652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 }
654
Martin v. Löwis790465f2008-04-05 20:41:37 +0000655 if (size == -1) {
656 size = wcslen(w);
657 }
658
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 unicode = _PyUnicode_New(size);
660 if (!unicode)
661 return NULL;
662
663 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000664#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000666#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000668 register Py_UNICODE *u;
669 register Py_ssize_t i;
670 u = PyUnicode_AS_UNICODE(unicode);
671 for (i = size; i > 0; i--)
672 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000673 }
674#endif
675
676 return (PyObject *)unicode;
677}
678
Mark Dickinson081dfee2009-03-18 14:47:41 +0000679#endif /* CONVERT_WCHAR_TO_SURROGATES */
680
681#undef CONVERT_WCHAR_TO_SURROGATES
682
Walter Dörwald346737f2007-05-31 10:44:43 +0000683static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000684makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
685 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000686{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000687 *fmt++ = '%';
688 if (width) {
689 if (zeropad)
690 *fmt++ = '0';
691 fmt += sprintf(fmt, "%d", width);
692 }
693 if (precision)
694 fmt += sprintf(fmt, ".%d", precision);
695 if (longflag)
696 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000697 else if (longlongflag) {
698 /* longlongflag should only ever be nonzero on machines with
699 HAVE_LONG_LONG defined */
700#ifdef HAVE_LONG_LONG
701 char *f = PY_FORMAT_LONG_LONG;
702 while (*f)
703 *fmt++ = *f++;
704#else
705 /* we shouldn't ever get here */
706 assert(0);
707 *fmt++ = 'l';
708#endif
709 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000710 else if (size_tflag) {
711 char *f = PY_FORMAT_SIZE_T;
712 while (*f)
713 *fmt++ = *f++;
714 }
715 *fmt++ = c;
716 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000717}
718
Walter Dörwaldd2034312007-05-18 16:29:38 +0000719#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
720
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000721/* size of fixed-size buffer for formatting single arguments */
722#define ITEM_BUFFER_LEN 21
723/* maximum number of characters required for output of %ld. 21 characters
724 allows for 64-bit integers (in decimal) and an optional sign. */
725#define MAX_LONG_CHARS 21
726/* maximum number of characters required for output of %lld.
727 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
728 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
729#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
730
Walter Dörwaldd2034312007-05-18 16:29:38 +0000731PyObject *
732PyUnicode_FromFormatV(const char *format, va_list vargs)
733{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000734 va_list count;
735 Py_ssize_t callcount = 0;
736 PyObject **callresults = NULL;
737 PyObject **callresult = NULL;
738 Py_ssize_t n = 0;
739 int width = 0;
740 int precision = 0;
741 int zeropad;
742 const char* f;
743 Py_UNICODE *s;
744 PyObject *string;
745 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 /* use abuffer instead of buffer, if we need more space
748 * (which can happen if there's a format specifier with width). */
749 char *abuffer = NULL;
750 char *realbuffer;
751 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000752 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000753 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000754
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000755 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000756 /* step 1: count the number of %S/%R/%A/%s format specifications
757 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
758 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
759 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000760 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000761 if (*f == '%') {
762 if (*(f+1)=='%')
763 continue;
Victor Stinner2b574a22011-03-01 22:48:49 +0000764 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000765 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000766 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000767 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000768 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 ;
770 if (*f == 's')
771 ++callcount;
772 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 else if (128 <= (unsigned char)*f) {
774 PyErr_Format(PyExc_ValueError,
775 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000776 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000777 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000778 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000780 }
781 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000782 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000783 if (callcount) {
784 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
785 if (!callresults) {
786 PyErr_NoMemory();
787 return NULL;
788 }
789 callresult = callresults;
790 }
791 /* step 3: figure out how large a buffer we need */
792 for (f = format; *f; f++) {
793 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000794#ifdef HAVE_LONG_LONG
795 int longlongflag = 0;
796#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 const char* p = f;
798 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000799 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000801 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000802 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000803
Benjamin Peterson14339b62009-01-31 16:36:08 +0000804 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
805 * they don't affect the amount of space we reserve.
806 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000807 if (*f == 'l') {
808 if (f[1] == 'd' || f[1] == 'u') {
809 ++f;
810 }
811#ifdef HAVE_LONG_LONG
812 else if (f[1] == 'l' &&
813 (f[2] == 'd' || f[2] == 'u')) {
814 longlongflag = 1;
815 f += 2;
816 }
817#endif
818 }
819 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000820 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000821 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000822
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 switch (*f) {
824 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +0000825 {
826#ifndef Py_UNICODE_WIDE
827 int ordinal = va_arg(count, int);
828 if (ordinal > 0xffff)
829 n += 2;
830 else
831 n++;
832#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000833 (void)va_arg(count, int);
Victor Stinner659eb842011-02-23 12:14:22 +0000834 n++;
835#endif
836 break;
837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000838 case '%':
839 n++;
840 break;
841 case 'd': case 'u': case 'i': case 'x':
842 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000843#ifdef HAVE_LONG_LONG
844 if (longlongflag) {
845 if (width < MAX_LONG_LONG_CHARS)
846 width = MAX_LONG_LONG_CHARS;
847 }
848 else
849#endif
850 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
851 including sign. Decimal takes the most space. This
852 isn't enough for octal. If a width is specified we
853 need more (which we allocate later). */
854 if (width < MAX_LONG_CHARS)
855 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000857 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000858 if (abuffersize < width)
859 abuffersize = width;
860 break;
861 case 's':
862 {
863 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000864 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000865 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
866 if (!str)
867 goto fail;
868 n += PyUnicode_GET_SIZE(str);
869 /* Remember the str and switch to the next slot */
870 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000871 break;
872 }
873 case 'U':
874 {
875 PyObject *obj = va_arg(count, PyObject *);
876 assert(obj && PyUnicode_Check(obj));
877 n += PyUnicode_GET_SIZE(obj);
878 break;
879 }
880 case 'V':
881 {
882 PyObject *obj = va_arg(count, PyObject *);
883 const char *str = va_arg(count, const char *);
Victor Stinner2b574a22011-03-01 22:48:49 +0000884 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000885 assert(obj || str);
886 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2b574a22011-03-01 22:48:49 +0000887 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000888 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2b574a22011-03-01 22:48:49 +0000889 *callresult++ = NULL;
890 }
891 else {
892 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
893 if (!str_obj)
894 goto fail;
895 n += PyUnicode_GET_SIZE(str_obj);
896 *callresult++ = str_obj;
897 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000898 break;
899 }
900 case 'S':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *str;
904 assert(obj);
905 str = PyObject_Str(obj);
906 if (!str)
907 goto fail;
908 n += PyUnicode_GET_SIZE(str);
909 /* Remember the str and switch to the next slot */
910 *callresult++ = str;
911 break;
912 }
913 case 'R':
914 {
915 PyObject *obj = va_arg(count, PyObject *);
916 PyObject *repr;
917 assert(obj);
918 repr = PyObject_Repr(obj);
919 if (!repr)
920 goto fail;
921 n += PyUnicode_GET_SIZE(repr);
922 /* Remember the repr and switch to the next slot */
923 *callresult++ = repr;
924 break;
925 }
926 case 'A':
927 {
928 PyObject *obj = va_arg(count, PyObject *);
929 PyObject *ascii;
930 assert(obj);
931 ascii = PyObject_ASCII(obj);
932 if (!ascii)
933 goto fail;
934 n += PyUnicode_GET_SIZE(ascii);
935 /* Remember the repr and switch to the next slot */
936 *callresult++ = ascii;
937 break;
938 }
939 case 'p':
940 (void) va_arg(count, int);
941 /* maximum 64-bit pointer representation:
942 * 0xffffffffffffffff
943 * so 19 characters is enough.
944 * XXX I count 18 -- what's the extra for?
945 */
946 n += 19;
947 break;
948 default:
949 /* if we stumble upon an unknown
950 formatting code, copy the rest of
951 the format string to the output
952 string. (we cannot just skip the
953 code, since there's no way to know
954 what's in the argument list) */
955 n += strlen(p);
956 goto expand;
957 }
958 } else
959 n++;
960 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000961 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000962 if (abuffersize > ITEM_BUFFER_LEN) {
963 /* add 1 for sprintf's trailing null byte */
964 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000965 if (!abuffer) {
966 PyErr_NoMemory();
967 goto fail;
968 }
969 realbuffer = abuffer;
970 }
971 else
972 realbuffer = buffer;
973 /* step 4: fill the buffer */
974 /* Since we've analyzed how much space we need for the worst case,
975 we don't have to resize the string.
976 There can be no errors beyond this point. */
977 string = PyUnicode_FromUnicode(NULL, n);
978 if (!string)
979 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000980
Benjamin Peterson14339b62009-01-31 16:36:08 +0000981 s = PyUnicode_AS_UNICODE(string);
982 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000983
Benjamin Peterson14339b62009-01-31 16:36:08 +0000984 for (f = format; *f; f++) {
985 if (*f == '%') {
986 const char* p = f++;
987 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000988 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 int size_tflag = 0;
990 zeropad = (*f == '0');
991 /* parse the width.precision part */
992 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000993 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000994 width = (width*10) + *f++ - '0';
995 precision = 0;
996 if (*f == '.') {
997 f++;
David Malcolm96960882010-11-05 17:23:41 +0000998 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000999 precision = (precision*10) + *f++ - '0';
1000 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 /* Handle %ld, %lu, %lld and %llu. */
1002 if (*f == 'l') {
1003 if (f[1] == 'd' || f[1] == 'u') {
1004 longflag = 1;
1005 ++f;
1006 }
1007#ifdef HAVE_LONG_LONG
1008 else if (f[1] == 'l' &&
1009 (f[2] == 'd' || f[2] == 'u')) {
1010 longlongflag = 1;
1011 f += 2;
1012 }
1013#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001014 }
1015 /* handle the size_t flag. */
1016 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1017 size_tflag = 1;
1018 ++f;
1019 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 switch (*f) {
1022 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +00001023 {
1024 int ordinal = va_arg(vargs, int);
1025#ifndef Py_UNICODE_WIDE
1026 if (ordinal > 0xffff) {
1027 ordinal -= 0x10000;
1028 *s++ = 0xD800 | (ordinal >> 10);
1029 *s++ = 0xDC00 | (ordinal & 0x3FF);
1030 } else
1031#endif
1032 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 break;
Victor Stinner659eb842011-02-23 12:14:22 +00001034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001036 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1037 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 if (longflag)
1039 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001040#ifdef HAVE_LONG_LONG
1041 else if (longlongflag)
1042 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1043#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 else if (size_tflag)
1045 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1046 else
1047 sprintf(realbuffer, fmt, va_arg(vargs, int));
1048 appendstring(realbuffer);
1049 break;
1050 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001051 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1052 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001053 if (longflag)
1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001055#ifdef HAVE_LONG_LONG
1056 else if (longlongflag)
1057 sprintf(realbuffer, fmt, va_arg(vargs,
1058 unsigned PY_LONG_LONG));
1059#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001060 else if (size_tflag)
1061 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1062 else
1063 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1064 appendstring(realbuffer);
1065 break;
1066 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001067 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001068 sprintf(realbuffer, fmt, va_arg(vargs, int));
1069 appendstring(realbuffer);
1070 break;
1071 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001072 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 sprintf(realbuffer, fmt, va_arg(vargs, int));
1074 appendstring(realbuffer);
1075 break;
1076 case 's':
1077 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001078 /* unused, since we already have the result */
1079 (void) va_arg(vargs, char *);
1080 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1081 PyUnicode_GET_SIZE(*callresult));
1082 s += PyUnicode_GET_SIZE(*callresult);
1083 /* We're done with the unicode()/repr() => forget it */
1084 Py_DECREF(*callresult);
1085 /* switch to next unicode()/repr() result */
1086 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001087 break;
1088 }
1089 case 'U':
1090 {
1091 PyObject *obj = va_arg(vargs, PyObject *);
1092 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1093 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1094 s += size;
1095 break;
1096 }
1097 case 'V':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2b574a22011-03-01 22:48:49 +00001100 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001101 if (obj) {
1102 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1103 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1104 s += size;
1105 } else {
Victor Stinner2b574a22011-03-01 22:48:49 +00001106 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1107 PyUnicode_GET_SIZE(*callresult));
1108 s += PyUnicode_GET_SIZE(*callresult);
1109 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001110 }
Victor Stinner2b574a22011-03-01 22:48:49 +00001111 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001112 break;
1113 }
1114 case 'S':
1115 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001116 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001117 {
1118 Py_UNICODE *ucopy;
1119 Py_ssize_t usize;
1120 Py_ssize_t upos;
1121 /* unused, since we already have the result */
1122 (void) va_arg(vargs, PyObject *);
1123 ucopy = PyUnicode_AS_UNICODE(*callresult);
1124 usize = PyUnicode_GET_SIZE(*callresult);
1125 for (upos = 0; upos<usize;)
1126 *s++ = ucopy[upos++];
1127 /* We're done with the unicode()/repr() => forget it */
1128 Py_DECREF(*callresult);
1129 /* switch to next unicode()/repr() result */
1130 ++callresult;
1131 break;
1132 }
1133 case 'p':
1134 sprintf(buffer, "%p", va_arg(vargs, void*));
1135 /* %p is ill-defined: ensure leading 0x. */
1136 if (buffer[1] == 'X')
1137 buffer[1] = 'x';
1138 else if (buffer[1] != 'x') {
1139 memmove(buffer+2, buffer, strlen(buffer)+1);
1140 buffer[0] = '0';
1141 buffer[1] = 'x';
1142 }
1143 appendstring(buffer);
1144 break;
1145 case '%':
1146 *s++ = '%';
1147 break;
1148 default:
1149 appendstring(p);
1150 goto end;
1151 }
Victor Stinner1205f272010-09-11 00:54:47 +00001152 }
Victor Stinner1205f272010-09-11 00:54:47 +00001153 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001154 *s++ = *f;
1155 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001156
Benjamin Peterson29060642009-01-31 22:14:21 +00001157 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001158 if (callresults)
1159 PyObject_Free(callresults);
1160 if (abuffer)
1161 PyObject_Free(abuffer);
1162 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1163 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001164 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001165 if (callresults) {
1166 PyObject **callresult2 = callresults;
1167 while (callresult2 < callresult) {
Victor Stinner2b574a22011-03-01 22:48:49 +00001168 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001169 ++callresult2;
1170 }
1171 PyObject_Free(callresults);
1172 }
1173 if (abuffer)
1174 PyObject_Free(abuffer);
1175 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001176}
1177
1178#undef appendstring
1179
1180PyObject *
1181PyUnicode_FromFormat(const char *format, ...)
1182{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001183 PyObject* ret;
1184 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001185
1186#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001187 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001188#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001189 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001190#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 ret = PyUnicode_FromFormatV(format, vargs);
1192 va_end(vargs);
1193 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001194}
1195
Victor Stinner5593d8a2010-10-02 11:11:27 +00001196/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1197 convert a Unicode object to a wide character string.
1198
Victor Stinnerd88d9832011-09-06 02:00:05 +02001199 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001200 character) required to convert the unicode object. Ignore size argument.
1201
Victor Stinnerd88d9832011-09-06 02:00:05 +02001202 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001203 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001204 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001205static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001206unicode_aswidechar(PyUnicodeObject *unicode,
1207 wchar_t *w,
1208 Py_ssize_t size)
1209{
1210#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001211 Py_ssize_t res;
1212 if (w != NULL) {
1213 res = PyUnicode_GET_SIZE(unicode);
1214 if (size > res)
1215 size = res + 1;
1216 else
1217 res = size;
1218 memcpy(w, unicode->str, size * sizeof(wchar_t));
1219 return res;
1220 }
1221 else
1222 return PyUnicode_GET_SIZE(unicode) + 1;
1223#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1224 register const Py_UNICODE *u;
1225 const Py_UNICODE *uend;
1226 const wchar_t *worig, *wend;
1227 Py_ssize_t nchar;
1228
Victor Stinner137c34c2010-09-29 10:25:54 +00001229 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001230 uend = u + PyUnicode_GET_SIZE(unicode);
1231 if (w != NULL) {
1232 worig = w;
1233 wend = w + size;
1234 while (u != uend && w != wend) {
1235 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1236 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1237 {
1238 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1239 u += 2;
1240 }
1241 else {
1242 *w = *u;
1243 u++;
1244 }
1245 w++;
1246 }
1247 if (w != wend)
1248 *w = L'\0';
1249 return w - worig;
1250 }
1251 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001252 nchar = 1; /* null character at the end */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001253 while (u != uend) {
1254 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1255 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1256 u += 2;
1257 else
1258 u++;
1259 nchar++;
1260 }
1261 }
1262 return nchar;
1263#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1264 register Py_UNICODE *u, *uend, ordinal;
1265 register Py_ssize_t i;
1266 wchar_t *worig, *wend;
1267 Py_ssize_t nchar;
1268
1269 u = PyUnicode_AS_UNICODE(unicode);
1270 uend = u + PyUnicode_GET_SIZE(u);
1271 if (w != NULL) {
1272 worig = w;
1273 wend = w + size;
1274 while (u != uend && w != wend) {
1275 ordinal = *u;
1276 if (ordinal > 0xffff) {
1277 ordinal -= 0x10000;
1278 *w++ = 0xD800 | (ordinal >> 10);
1279 *w++ = 0xDC00 | (ordinal & 0x3FF);
1280 }
1281 else
1282 *w++ = ordinal;
1283 u++;
1284 }
1285 if (w != wend)
1286 *w = 0;
1287 return w - worig;
1288 }
1289 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001290 nchar = 1; /* null character */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001291 while (u != uend) {
1292 if (*u > 0xffff)
1293 nchar += 2;
1294 else
1295 nchar++;
1296 u++;
1297 }
1298 return nchar;
1299 }
1300#else
1301# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001302#endif
1303}
1304
1305Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001306PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001307 wchar_t *w,
1308 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001309{
1310 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001311 PyErr_BadInternalCall();
1312 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001313 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315}
1316
Victor Stinner137c34c2010-09-29 10:25:54 +00001317wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001318PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001319 Py_ssize_t *size)
1320{
1321 wchar_t* buffer;
1322 Py_ssize_t buflen;
1323
1324 if (unicode == NULL) {
1325 PyErr_BadInternalCall();
1326 return NULL;
1327 }
1328
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001329 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001330 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001331 PyErr_NoMemory();
1332 return NULL;
1333 }
1334
Victor Stinner137c34c2010-09-29 10:25:54 +00001335 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1336 if (buffer == NULL) {
1337 PyErr_NoMemory();
1338 return NULL;
1339 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001340 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001341 if (size != NULL)
1342 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 return buffer;
1344}
1345
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346#endif
1347
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001348PyObject *PyUnicode_FromOrdinal(int ordinal)
1349{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001350 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001351
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001352 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 PyErr_SetString(PyExc_ValueError,
1354 "chr() arg not in range(0x110000)");
1355 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001356 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001357
1358#ifndef Py_UNICODE_WIDE
1359 if (ordinal > 0xffff) {
1360 ordinal -= 0x10000;
1361 s[0] = 0xD800 | (ordinal >> 10);
1362 s[1] = 0xDC00 | (ordinal & 0x3FF);
1363 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001364 }
1365#endif
1366
Hye-Shik Chang40574832004-04-06 07:24:51 +00001367 s[0] = (Py_UNICODE)ordinal;
1368 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001369}
1370
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371PyObject *PyUnicode_FromObject(register PyObject *obj)
1372{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001373 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 Py_INCREF(obj);
1377 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001378 }
1379 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001380 /* For a Unicode subtype that's not a Unicode object,
1381 return a true Unicode object with the same data. */
1382 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1383 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001384 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001385 PyErr_Format(PyExc_TypeError,
1386 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001387 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001388 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001389}
1390
1391PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001392 const char *encoding,
1393 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001394{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001395 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001396 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001397
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001399 PyErr_BadInternalCall();
1400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001402
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001403 /* Decoding bytes objects is the most common case and should be fast */
1404 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02001405 if (PyBytes_GET_SIZE(obj) == 0)
1406 _Py_RETURN_UNICODE_EMPTY();
1407 v = PyUnicode_Decode(
1408 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1409 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001410 return v;
1411 }
1412
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001413 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001414 PyErr_SetString(PyExc_TypeError,
1415 "decoding str is not supported");
1416 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001417 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001418
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001419 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1420 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1421 PyErr_Format(PyExc_TypeError,
1422 "coercing to str: need bytes, bytearray "
1423 "or buffer-like object, %.80s found",
1424 Py_TYPE(obj)->tp_name);
1425 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001426 }
Tim Petersced69f82003-09-16 20:30:58 +00001427
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001428 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02001429 PyBuffer_Release(&buffer);
1430 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001432
Serhiy Storchaka05997252013-01-26 12:14:02 +02001433 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001435 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436}
1437
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The normalized name is written to lower, a buffer of
   lower_len bytes, and null-terminated.

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is
   0), 1 on success.  On error the content of lower is unspecified and may
   lack a terminator. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without this guard lower_len - 1 would wrap around and the bound
       check below would never trigger. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* last usable slot, reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
1470
1471PyObject *PyUnicode_Decode(const char *s,
1472 Py_ssize_t size,
1473 const char *encoding,
1474 const char *errors)
1475{
1476 PyObject *buffer = NULL, *unicode;
1477 Py_buffer info;
1478 char lower[11]; /* Enough for any encoding shortcut */
1479
1480 if (encoding == NULL)
1481 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001482
1483 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01001484 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Victor Stinner37296e82010-06-10 13:36:23 +00001485 if (strcmp(lower, "utf-8") == 0)
1486 return PyUnicode_DecodeUTF8(s, size, errors);
1487 else if ((strcmp(lower, "latin-1") == 0) ||
1488 (strcmp(lower, "iso-8859-1") == 0))
1489 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001490#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001491 else if (strcmp(lower, "mbcs") == 0)
1492 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001493#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001494 else if (strcmp(lower, "ascii") == 0)
1495 return PyUnicode_DecodeASCII(s, size, errors);
1496 else if (strcmp(lower, "utf-16") == 0)
1497 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1498 else if (strcmp(lower, "utf-32") == 0)
1499 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1500 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501
1502 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001503 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001504 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001505 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001506 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 if (buffer == NULL)
1508 goto onError;
1509 unicode = PyCodec_Decode(buffer, encoding, errors);
1510 if (unicode == NULL)
1511 goto onError;
1512 if (!PyUnicode_Check(unicode)) {
1513 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001514 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001515 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 Py_DECREF(unicode);
1517 goto onError;
1518 }
1519 Py_DECREF(buffer);
1520 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001521
Benjamin Peterson29060642009-01-31 22:14:21 +00001522 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523 Py_XDECREF(buffer);
1524 return NULL;
1525}
1526
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001527PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1528 const char *encoding,
1529 const char *errors)
1530{
1531 PyObject *v;
1532
1533 if (!PyUnicode_Check(unicode)) {
1534 PyErr_BadArgument();
1535 goto onError;
1536 }
1537
1538 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001539 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001540
1541 /* Decode via the codec registry */
1542 v = PyCodec_Decode(unicode, encoding, errors);
1543 if (v == NULL)
1544 goto onError;
1545 return v;
1546
Benjamin Peterson29060642009-01-31 22:14:21 +00001547 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001548 return NULL;
1549}
1550
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1552 const char *encoding,
1553 const char *errors)
1554{
1555 PyObject *v;
1556
1557 if (!PyUnicode_Check(unicode)) {
1558 PyErr_BadArgument();
1559 goto onError;
1560 }
1561
1562 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001563 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001564
1565 /* Decode via the codec registry */
1566 v = PyCodec_Decode(unicode, encoding, errors);
1567 if (v == NULL)
1568 goto onError;
1569 if (!PyUnicode_Check(v)) {
1570 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001571 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001572 Py_TYPE(v)->tp_name);
1573 Py_DECREF(v);
1574 goto onError;
1575 }
1576 return v;
1577
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001579 return NULL;
1580}
1581
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001583 Py_ssize_t size,
1584 const char *encoding,
1585 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586{
1587 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001588
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 unicode = PyUnicode_FromUnicode(s, size);
1590 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1593 Py_DECREF(unicode);
1594 return v;
1595}
1596
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001597PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1598 const char *encoding,
1599 const char *errors)
1600{
1601 PyObject *v;
1602
1603 if (!PyUnicode_Check(unicode)) {
1604 PyErr_BadArgument();
1605 goto onError;
1606 }
1607
1608 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001609 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001610
1611 /* Encode via the codec registry */
1612 v = PyCodec_Encode(unicode, encoding, errors);
1613 if (v == NULL)
1614 goto onError;
1615 return v;
1616
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001618 return NULL;
1619}
1620
Victor Stinnerad158722010-10-27 00:25:46 +00001621PyObject *
1622PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001623{
Victor Stinner313a1202010-06-11 23:56:51 +00001624#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001625 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1626 PyUnicode_GET_SIZE(unicode),
1627 NULL);
1628#elif defined(__APPLE__)
1629 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1630 PyUnicode_GET_SIZE(unicode),
1631 "surrogateescape");
1632#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001633 PyInterpreterState *interp = PyThreadState_GET()->interp;
1634 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1635 cannot use it to encode and decode filenames before it is loaded. Load
1636 the Python codec requires to encode at least its own filename. Use the C
1637 version of the locale codec until the codec registry is initialized and
1638 the Python codec is loaded.
1639
1640 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1641 cannot only rely on it: check also interp->fscodec_initialized for
1642 subinterpreters. */
1643 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001644 return PyUnicode_AsEncodedString(unicode,
1645 Py_FileSystemDefaultEncoding,
1646 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001647 }
1648 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001649 /* locale encoding with surrogateescape */
1650 wchar_t *wchar;
1651 char *bytes;
1652 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001653 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001654
1655 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1656 if (wchar == NULL)
1657 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001658 bytes = _Py_wchar2char(wchar, &error_pos);
1659 if (bytes == NULL) {
1660 if (error_pos != (size_t)-1) {
1661 char *errmsg = strerror(errno);
1662 PyObject *exc = NULL;
1663 if (errmsg == NULL)
1664 errmsg = "Py_wchar2char() failed";
1665 raise_encode_exception(&exc,
1666 "filesystemencoding",
1667 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1668 error_pos, error_pos+1,
1669 errmsg);
1670 Py_XDECREF(exc);
1671 }
1672 else
1673 PyErr_NoMemory();
1674 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001675 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001676 }
1677 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001678
1679 bytes_obj = PyBytes_FromString(bytes);
1680 PyMem_Free(bytes);
1681 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001682 }
Victor Stinnerad158722010-10-27 00:25:46 +00001683#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001684}
1685
Guido van Rossumd57fd912000-03-10 22:53:23 +00001686PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1687 const char *encoding,
1688 const char *errors)
1689{
1690 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001691 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001692
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 if (!PyUnicode_Check(unicode)) {
1694 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001695 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 }
Fred Drakee4315f52000-05-09 19:53:39 +00001697
Tim Petersced69f82003-09-16 20:30:58 +00001698 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001699 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001700
1701 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01001702 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Victor Stinner37296e82010-06-10 13:36:23 +00001703 if (strcmp(lower, "utf-8") == 0)
1704 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1705 PyUnicode_GET_SIZE(unicode),
1706 errors);
1707 else if ((strcmp(lower, "latin-1") == 0) ||
1708 (strcmp(lower, "iso-8859-1") == 0))
1709 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1710 PyUnicode_GET_SIZE(unicode),
1711 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001712#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001713 else if (strcmp(lower, "mbcs") == 0)
1714 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1715 PyUnicode_GET_SIZE(unicode),
1716 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001717#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001718 else if (strcmp(lower, "ascii") == 0)
1719 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1720 PyUnicode_GET_SIZE(unicode),
1721 errors);
1722 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001723 /* During bootstrap, we may need to find the encodings
1724 package, to load the file system encoding, and require the
1725 file system encoding in order to load the encodings
1726 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001727
Victor Stinner59e62db2010-05-15 13:14:32 +00001728 Break out of this dependency by assuming that the path to
1729 the encodings module is ASCII-only. XXX could try wcstombs
1730 instead, if the file system encoding is the locale's
1731 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001732 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001733 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1734 !PyThreadState_GET()->interp->codecs_initialized)
1735 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1736 PyUnicode_GET_SIZE(unicode),
1737 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738
1739 /* Encode via the codec registry */
1740 v = PyCodec_Encode(unicode, encoding, errors);
1741 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001742 return NULL;
1743
1744 /* The normal path */
1745 if (PyBytes_Check(v))
1746 return v;
1747
1748 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001749 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001750 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001751 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001752
1753 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1754 "encoder %s returned bytearray instead of bytes",
1755 encoding);
1756 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001757 Py_DECREF(v);
1758 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001759 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001760
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001761 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1762 Py_DECREF(v);
1763 return b;
1764 }
1765
1766 PyErr_Format(PyExc_TypeError,
1767 "encoder did not return a bytes object (type=%.400s)",
1768 Py_TYPE(v)->tp_name);
1769 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001770 return NULL;
1771}
1772
1773PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1774 const char *encoding,
1775 const char *errors)
1776{
1777 PyObject *v;
1778
1779 if (!PyUnicode_Check(unicode)) {
1780 PyErr_BadArgument();
1781 goto onError;
1782 }
1783
1784 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001785 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001786
1787 /* Encode via the codec registry */
1788 v = PyCodec_Encode(unicode, encoding, errors);
1789 if (v == NULL)
1790 goto onError;
1791 if (!PyUnicode_Check(v)) {
1792 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001793 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001794 Py_TYPE(v)->tp_name);
1795 Py_DECREF(v);
1796 goto onError;
1797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001799
Benjamin Peterson29060642009-01-31 22:14:21 +00001800 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 return NULL;
1802}
1803
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001804PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001805 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001806{
1807 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001808 if (v)
1809 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001810 if (errors != NULL)
1811 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001812 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001813 PyUnicode_GET_SIZE(unicode),
1814 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001815 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001816 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001817 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001818 return v;
1819}
1820
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001821PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001822PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001823 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001824 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1825}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001826
Christian Heimes5894ba72007-11-04 11:43:14 +00001827PyObject*
1828PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1829{
Victor Stinnerad158722010-10-27 00:25:46 +00001830#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1831 return PyUnicode_DecodeMBCS(s, size, NULL);
1832#elif defined(__APPLE__)
1833 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1834#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001835 PyInterpreterState *interp = PyThreadState_GET()->interp;
1836 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1837 cannot use it to encode and decode filenames before it is loaded. Load
1838 the Python codec requires to encode at least its own filename. Use the C
1839 version of the locale codec until the codec registry is initialized and
1840 the Python codec is loaded.
1841
1842 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1843 cannot only rely on it: check also interp->fscodec_initialized for
1844 subinterpreters. */
1845 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001846 return PyUnicode_Decode(s, size,
1847 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001848 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001849 }
1850 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001851 /* locale encoding with surrogateescape */
1852 wchar_t *wchar;
1853 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001854 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001855
1856 if (s[size] != '\0' || size != strlen(s)) {
1857 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1858 return NULL;
1859 }
1860
Victor Stinner168e1172010-10-16 23:16:16 +00001861 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001862 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001863 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001864
Victor Stinner168e1172010-10-16 23:16:16 +00001865 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001866 PyMem_Free(wchar);
1867 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001868 }
Victor Stinnerad158722010-10-27 00:25:46 +00001869#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001870}
1871
Martin v. Löwis011e8422009-05-05 04:43:17 +00001872
1873int
Antoine Pitrou13348842012-01-29 18:36:34 +01001874_PyUnicode_HasNULChars(PyObject* s)
1875{
1876 static PyObject *nul = NULL;
1877
1878 if (nul == NULL)
1879 nul = PyUnicode_FromStringAndSize("\0", 1);
1880 if (nul == NULL)
1881 return -1;
1882 return PyUnicode_Contains(s, nul);
1883}
1884
1885
1886int
Martin v. Löwis011e8422009-05-05 04:43:17 +00001887PyUnicode_FSConverter(PyObject* arg, void* addr)
1888{
1889 PyObject *output = NULL;
1890 Py_ssize_t size;
1891 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001892 if (arg == NULL) {
1893 Py_DECREF(*(PyObject**)addr);
1894 return 1;
1895 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001896 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001897 output = arg;
1898 Py_INCREF(output);
1899 }
1900 else {
1901 arg = PyUnicode_FromObject(arg);
1902 if (!arg)
1903 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001904 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001905 Py_DECREF(arg);
1906 if (!output)
1907 return 0;
1908 if (!PyBytes_Check(output)) {
1909 Py_DECREF(output);
1910 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1911 return 0;
1912 }
1913 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001914 size = PyBytes_GET_SIZE(output);
1915 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001916 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05001917 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001918 Py_DECREF(output);
1919 return 0;
1920 }
1921 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001922 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001923}
1924
1925
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001926int
1927PyUnicode_FSDecoder(PyObject* arg, void* addr)
1928{
1929 PyObject *output = NULL;
1930 Py_ssize_t size;
1931 void *data;
1932 if (arg == NULL) {
1933 Py_DECREF(*(PyObject**)addr);
1934 return 1;
1935 }
1936 if (PyUnicode_Check(arg)) {
1937 output = arg;
1938 Py_INCREF(output);
1939 }
1940 else {
1941 arg = PyBytes_FromObject(arg);
1942 if (!arg)
1943 return 0;
1944 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1945 PyBytes_GET_SIZE(arg));
1946 Py_DECREF(arg);
1947 if (!output)
1948 return 0;
1949 if (!PyUnicode_Check(output)) {
1950 Py_DECREF(output);
1951 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1952 return 0;
1953 }
1954 }
1955 size = PyUnicode_GET_SIZE(output);
1956 data = PyUnicode_AS_UNICODE(output);
1957 if (size != Py_UNICODE_strlen(data)) {
1958 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1959 Py_DECREF(output);
1960 return 0;
1961 }
1962 *(PyObject**)addr = output;
1963 return Py_CLEANUP_SUPPORTED;
1964}
1965
1966
Martin v. Löwis5b222132007-06-10 09:51:05 +00001967char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001968_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001969{
Christian Heimesf3863112007-11-22 07:46:41 +00001970 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001971 if (!PyUnicode_Check(unicode)) {
1972 PyErr_BadArgument();
1973 return NULL;
1974 }
Christian Heimesf3863112007-11-22 07:46:41 +00001975 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1976 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001977 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001978 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001979 *psize = PyBytes_GET_SIZE(bytes);
1980 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001981}
1982
1983char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001984_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001985{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001986 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001987}
1988
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1990{
1991 if (!PyUnicode_Check(unicode)) {
1992 PyErr_BadArgument();
1993 goto onError;
1994 }
1995 return PyUnicode_AS_UNICODE(unicode);
1996
Benjamin Peterson29060642009-01-31 22:14:21 +00001997 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998 return NULL;
1999}
2000
Martin v. Löwis18e16552006-02-15 17:27:45 +00002001Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002002{
2003 if (!PyUnicode_Check(unicode)) {
2004 PyErr_BadArgument();
2005 goto onError;
2006 }
2007 return PyUnicode_GET_SIZE(unicode);
2008
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 return -1;
2011}
2012
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2017
Victor Stinner554f3f02010-06-16 23:33:54 +00002018/* create or adjust a UnicodeDecodeError */
2019static void
2020make_decode_exception(PyObject **exceptionObject,
2021 const char *encoding,
2022 const char *input, Py_ssize_t length,
2023 Py_ssize_t startpos, Py_ssize_t endpos,
2024 const char *reason)
2025{
2026 if (*exceptionObject == NULL) {
2027 *exceptionObject = PyUnicodeDecodeError_Create(
2028 encoding, input, length, startpos, endpos, reason);
2029 }
2030 else {
2031 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2032 goto onError;
2033 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2034 goto onError;
2035 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2036 goto onError;
2037 }
2038 return;
2039
2040onError:
2041 Py_DECREF(*exceptionObject);
2042 *exceptionObject = NULL;
2043}
2044
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002045/* error handling callback helper:
2046 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002047 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 and adjust various state variables.
2049 return 0 on success, -1 on error
2050*/
2051
2052static
2053int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 const char *encoding, const char *reason,
2055 const char **input, const char **inend, Py_ssize_t *startinpos,
2056 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2057 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002059 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060
2061 PyObject *restuple = NULL;
2062 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002063 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002064 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002065 Py_ssize_t requiredsize;
2066 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002068 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002069 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 int res = -1;
2071
2072 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002073 *errorHandler = PyCodec_LookupError(errors);
2074 if (*errorHandler == NULL)
2075 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002076 }
2077
Victor Stinner554f3f02010-06-16 23:33:54 +00002078 make_decode_exception(exceptionObject,
2079 encoding,
2080 *input, *inend - *input,
2081 *startinpos, *endinpos,
2082 reason);
2083 if (*exceptionObject == NULL)
2084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085
2086 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2087 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002090 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002092 }
2093 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002094 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002095
2096 /* Copy back the bytes variables, which might have been modified by the
2097 callback */
2098 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2099 if (!inputobj)
2100 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002101 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002102 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002103 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002104 *input = PyBytes_AS_STRING(inputobj);
2105 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002106 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002107 /* we can DECREF safely, as the exception has another reference,
2108 so the object won't go away. */
2109 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002110
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002112 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002113 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002114 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2115 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002116 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117
2118 /* need more space? (at least enough for what we
2119 have+the replacement+the rest of the string (starting
2120 at the new input position), so we won't have to check space
2121 when there are no errors in the rest of the string) */
2122 repptr = PyUnicode_AS_UNICODE(repunicode);
2123 repsize = PyUnicode_GET_SIZE(repunicode);
2124 requiredsize = *outpos + repsize + insize-newpos;
2125 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002126 if (requiredsize<2*outsize)
2127 requiredsize = 2*outsize;
2128 if (_PyUnicode_Resize(output, requiredsize) < 0)
2129 goto onError;
2130 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002131 }
2132 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002133 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 Py_UNICODE_COPY(*outptr, repptr, repsize);
2135 *outptr += repsize;
2136 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002137
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 /* we made it! */
2139 res = 0;
2140
Benjamin Peterson29060642009-01-31 22:14:21 +00002141 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 Py_XDECREF(restuple);
2143 return res;
2144}
2145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002146/* --- UTF-7 Codec -------------------------------------------------------- */
2147
Antoine Pitrou244651a2009-05-04 18:56:13 +00002148/* See RFC2152 for details. We encode conservatively and decode liberally. */
2149
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* Is c a base-64 character?  (letters, digits, '+' and '/') */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   and anything else (i.e. '/') -> 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                                       \
     "abcdefghijklmnopqrstuvwxyz"                                       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
2180
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be free of side effects (it is evaluated several times).  The
 * range test comes first so the table is only indexed with 1..127.
 * directO and directWS are parenthesized so that compound expressions
 * such as `a || b` can be passed without changing the result. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002227PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002228 Py_ssize_t size,
2229 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002230{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002231 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2232}
2233
Antoine Pitrou244651a2009-05-04 18:56:13 +00002234/* The decoder. The only state we preserve is our read position,
2235 * i.e. how many characters we have consumed. So if we end in the
2236 * middle of a shift sequence we have to back off the read position
2237 * and the output to the beginning of the sequence, otherwise we lose
2238 * all the shift state (seen bits, number of bits seen, high
2239 * surrogate). */
2240
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002241PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002242 Py_ssize_t size,
2243 const char *errors,
2244 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002245{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002247 Py_ssize_t startinpos;
2248 Py_ssize_t endinpos;
2249 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002250 const char *e;
2251 PyUnicodeObject *unicode;
2252 Py_UNICODE *p;
2253 const char *errmsg = "";
2254 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002255 Py_UNICODE *shiftOutStart;
2256 unsigned int base64bits = 0;
2257 unsigned long base64buffer = 0;
2258 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 PyObject *errorHandler = NULL;
2260 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002261
2262 unicode = _PyUnicode_New(size);
2263 if (!unicode)
2264 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002265 if (size == 0) {
2266 if (consumed)
2267 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002269 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002270
2271 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002272 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002273 e = s + size;
2274
2275 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002276 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002277 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002278 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002279
Antoine Pitrou244651a2009-05-04 18:56:13 +00002280 if (inShift) { /* in a base-64 section */
2281 if (IS_BASE64(ch)) { /* consume a base-64 character */
2282 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2283 base64bits += 6;
2284 s++;
2285 if (base64bits >= 16) {
2286 /* we have enough bits for a UTF-16 value */
2287 Py_UNICODE outCh = (Py_UNICODE)
2288 (base64buffer >> (base64bits-16));
2289 base64bits -= 16;
2290 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2291 if (surrogate) {
2292 /* expecting a second surrogate */
2293 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2294#ifdef Py_UNICODE_WIDE
2295 *p++ = (((surrogate & 0x3FF)<<10)
2296 | (outCh & 0x3FF)) + 0x10000;
2297#else
2298 *p++ = surrogate;
2299 *p++ = outCh;
2300#endif
2301 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002302 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002303 }
2304 else {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002305 *p++ = surrogate;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002306 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002307 }
2308 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002309 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002310 /* first surrogate */
2311 surrogate = outCh;
2312 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002313 else {
2314 *p++ = outCh;
2315 }
2316 }
2317 }
2318 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002319 inShift = 0;
2320 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002321 if (surrogate) {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002322 *p++ = surrogate;
2323 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002324 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002325 if (base64bits > 0) { /* left-over bits */
2326 if (base64bits >= 6) {
2327 /* We've seen at least one base-64 character */
2328 errmsg = "partial character in shift sequence";
2329 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002330 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002331 else {
2332 /* Some bits remain; they should be zero */
2333 if (base64buffer != 0) {
2334 errmsg = "non-zero padding bits in shift sequence";
2335 goto utf7Error;
2336 }
2337 }
2338 }
2339 if (ch != '-') {
2340 /* '-' is absorbed; other terminating
2341 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002342 *p++ = ch;
2343 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 }
2345 }
2346 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 s++; /* consume '+' */
2349 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002350 s++;
2351 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002352 }
2353 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002354 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002355 shiftOutStart = p;
2356 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002357 }
2358 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002359 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002360 *p++ = ch;
2361 s++;
2362 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002363 else {
2364 startinpos = s-starts;
2365 s++;
2366 errmsg = "unexpected special character";
2367 goto utf7Error;
2368 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002369 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002370utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 outpos = p-PyUnicode_AS_UNICODE(unicode);
2372 endinpos = s-starts;
2373 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002374 errors, &errorHandler,
2375 "utf7", errmsg,
2376 &starts, &e, &startinpos, &endinpos, &exc, &s,
2377 &unicode, &outpos, &p))
2378 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002379 }
2380
Antoine Pitrou244651a2009-05-04 18:56:13 +00002381 /* end of string */
2382
2383 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2384 /* if we're in an inconsistent state, that's an error */
2385 if (surrogate ||
2386 (base64bits >= 6) ||
2387 (base64bits > 0 && base64buffer != 0)) {
2388 outpos = p-PyUnicode_AS_UNICODE(unicode);
2389 endinpos = size;
2390 if (unicode_decode_call_errorhandler(
2391 errors, &errorHandler,
2392 "utf7", "unterminated shift sequence",
2393 &starts, &e, &startinpos, &endinpos, &exc, &s,
2394 &unicode, &outpos, &p))
2395 goto onError;
2396 if (s < e)
2397 goto restart;
2398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002400
2401 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002402 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002403 if (inShift) {
2404 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002405 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002406 }
2407 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002408 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002409 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002410 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002411
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002412 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002413 goto onError;
2414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417 return (PyObject *)unicode;
2418
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002420 Py_XDECREF(errorHandler);
2421 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002422 Py_DECREF(unicode);
2423 return NULL;
2424}
2425
2426
2427PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002428 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002429 int base64SetO,
2430 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002431 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002432{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002433 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002434 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002435 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002436 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002437 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002438 unsigned int base64bits = 0;
2439 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002440 char * out;
2441 char * start;
2442
2443 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002444 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002445
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002446 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002447 return PyErr_NoMemory();
2448
Antoine Pitrou244651a2009-05-04 18:56:13 +00002449 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002450 if (v == NULL)
2451 return NULL;
2452
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002453 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002454 for (;i < size; ++i) {
2455 Py_UNICODE ch = s[i];
2456
Antoine Pitrou244651a2009-05-04 18:56:13 +00002457 if (inShift) {
2458 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2459 /* shifting out */
2460 if (base64bits) { /* output remaining bits */
2461 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2462 base64buffer = 0;
2463 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002464 }
2465 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002466 /* Characters not in the BASE64 set implicitly unshift the sequence
2467 so no '-' is required, except if the character is itself a '-' */
2468 if (IS_BASE64(ch) || ch == '-') {
2469 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002471 *out++ = (char) ch;
2472 }
2473 else {
2474 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002475 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002476 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002477 else { /* not in a shift sequence */
2478 if (ch == '+') {
2479 *out++ = '+';
2480 *out++ = '-';
2481 }
2482 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2483 *out++ = (char) ch;
2484 }
2485 else {
2486 *out++ = '+';
2487 inShift = 1;
2488 goto encode_char;
2489 }
2490 }
2491 continue;
2492encode_char:
2493#ifdef Py_UNICODE_WIDE
2494 if (ch >= 0x10000) {
2495 /* code first surrogate */
2496 base64bits += 16;
2497 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2498 while (base64bits >= 6) {
2499 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2500 base64bits -= 6;
2501 }
2502 /* prepare second surrogate */
2503 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2504 }
2505#endif
2506 base64bits += 16;
2507 base64buffer = (base64buffer << 16) | ch;
2508 while (base64bits >= 6) {
2509 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2510 base64bits -= 6;
2511 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002512 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002513 if (base64bits)
2514 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2515 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002516 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002517 if (_PyBytes_Resize(&v, out - start) < 0)
2518 return NULL;
2519 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002520}
2521
Antoine Pitrou244651a2009-05-04 18:56:13 +00002522#undef IS_BASE64
2523#undef FROM_BASE64
2524#undef TO_BASE64
2525#undef DECODE_DIRECT
2526#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002527
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is read-only, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2551
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002553 Py_ssize_t size,
2554 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555{
Walter Dörwald69652032004-09-07 20:24:22 +00002556 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2557}
2558
Antoine Pitrouab868312009-01-10 15:40:25 +00002559/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2560#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2561
2562/* Mask to quickly check whether a C 'long' contains a
2563 non-ASCII, UTF8-encoded char. */
2564#if (SIZEOF_LONG == 8)
2565# define ASCII_CHAR_MASK 0x8080808080808080L
2566#elif (SIZEOF_LONG == 4)
2567# define ASCII_CHAR_MASK 0x80808080L
2568#else
2569# error C 'long' size should be either 4 or 8!
2570#endif
2571
Walter Dörwald69652032004-09-07 20:24:22 +00002572PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002573 Py_ssize_t size,
2574 const char *errors,
2575 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002579 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002580 Py_ssize_t startinpos;
2581 Py_ssize_t endinpos;
2582 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002583 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 PyUnicodeObject *unicode;
2585 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002586 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002587 PyObject *errorHandler = NULL;
2588 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589
2590 /* Note: size will always be longer than the resulting Unicode
2591 character count */
2592 unicode = _PyUnicode_New(size);
2593 if (!unicode)
2594 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002595 if (size == 0) {
2596 if (consumed)
2597 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600
2601 /* Unpack UTF-8 encoded data */
2602 p = unicode->str;
2603 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002604 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
2606 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002607 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608
2609 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002610 /* Fast path for runs of ASCII characters. Given that common UTF-8
2611 input will consist of an overwhelming majority of ASCII
2612 characters, we try to optimize for this case by checking
2613 as many characters as a C 'long' can contain.
2614 First, check if we can do an aligned read, as most CPUs have
2615 a penalty for unaligned reads.
2616 */
2617 if (!((size_t) s & LONG_PTR_MASK)) {
2618 /* Help register allocation */
2619 register const char *_s = s;
2620 register Py_UNICODE *_p = p;
2621 while (_s < aligned_end) {
2622 /* Read a whole long at a time (either 4 or 8 bytes),
2623 and do a fast unrolled copy if it only contains ASCII
2624 characters. */
2625 unsigned long data = *(unsigned long *) _s;
2626 if (data & ASCII_CHAR_MASK)
2627 break;
2628 _p[0] = (unsigned char) _s[0];
2629 _p[1] = (unsigned char) _s[1];
2630 _p[2] = (unsigned char) _s[2];
2631 _p[3] = (unsigned char) _s[3];
2632#if (SIZEOF_LONG == 8)
2633 _p[4] = (unsigned char) _s[4];
2634 _p[5] = (unsigned char) _s[5];
2635 _p[6] = (unsigned char) _s[6];
2636 _p[7] = (unsigned char) _s[7];
2637#endif
2638 _s += SIZEOF_LONG;
2639 _p += SIZEOF_LONG;
2640 }
2641 s = _s;
2642 p = _p;
2643 if (s == e)
2644 break;
2645 ch = (unsigned char)*s;
2646 }
2647 }
2648
2649 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002650 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 s++;
2652 continue;
2653 }
2654
2655 n = utf8_code_length[ch];
2656
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002657 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002658 if (consumed)
2659 break;
2660 else {
2661 errmsg = "unexpected end of data";
2662 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002663 endinpos = startinpos+1;
2664 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2665 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002666 goto utf8Error;
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002669
2670 switch (n) {
2671
2672 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002673 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002674 startinpos = s-starts;
2675 endinpos = startinpos+1;
2676 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677
2678 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002679 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 startinpos = s-starts;
2681 endinpos = startinpos+1;
2682 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683
2684 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002685 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002686 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002688 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 goto utf8Error;
2690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002692 assert ((ch > 0x007F) && (ch <= 0x07FF));
2693 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 break;
2695
2696 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002697 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2698 will result in surrogates in range d800-dfff. Surrogates are
2699 not valid UTF-8 so they are rejected.
2700 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2701 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002702 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002703 (s[2] & 0xc0) != 0x80 ||
2704 ((unsigned char)s[0] == 0xE0 &&
2705 (unsigned char)s[1] < 0xA0) ||
2706 ((unsigned char)s[0] == 0xED &&
2707 (unsigned char)s[1] > 0x9F)) {
2708 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002710 endinpos = startinpos + 1;
2711
2712 /* if s[1] first two bits are 1 and 0, then the invalid
2713 continuation byte is s[2], so increment endinpos by 1,
2714 if not, s[1] is invalid and endinpos doesn't need to
2715 be incremented. */
2716 if ((s[1] & 0xC0) == 0x80)
2717 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 goto utf8Error;
2719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002721 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2722 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002723 break;
2724
2725 case 4:
2726 if ((s[1] & 0xc0) != 0x80 ||
2727 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002728 (s[3] & 0xc0) != 0x80 ||
2729 ((unsigned char)s[0] == 0xF0 &&
2730 (unsigned char)s[1] < 0x90) ||
2731 ((unsigned char)s[0] == 0xF4 &&
2732 (unsigned char)s[1] > 0x8F)) {
2733 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002734 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002735 endinpos = startinpos + 1;
2736 if ((s[1] & 0xC0) == 0x80) {
2737 endinpos++;
2738 if ((s[2] & 0xC0) == 0x80)
2739 endinpos++;
2740 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002741 goto utf8Error;
2742 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002743 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002744 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2745 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2746
Fredrik Lundh8f455852001-06-27 18:59:43 +00002747#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002749#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002750 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002751
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002752 /* translate from 10000..10FFFF to 0..FFFF */
2753 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002754
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002755 /* high surrogate = top 10 bits added to D800 */
2756 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002757
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002758 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002759 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002760#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
2763 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002765
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 utf8Error:
2767 outpos = p-PyUnicode_AS_UNICODE(unicode);
2768 if (unicode_decode_call_errorhandler(
2769 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01002770 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 &starts, &e, &startinpos, &endinpos, &exc, &s,
2772 &unicode, &outpos, &p))
2773 goto onError;
2774 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 }
Walter Dörwald69652032004-09-07 20:24:22 +00002776 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002777 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778
2779 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002780 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 goto onError;
2782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002783 Py_XDECREF(errorHandler);
2784 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 return (PyObject *)unicode;
2786
Benjamin Peterson29060642009-01-31 22:14:21 +00002787 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788 Py_XDECREF(errorHandler);
2789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 Py_DECREF(unicode);
2791 return NULL;
2792}
2793
Antoine Pitrouab868312009-01-10 15:40:25 +00002794#undef ASCII_CHAR_MASK
2795
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded is emitted as 0xDC00 + byte and
   decoding resumes at the following byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error
   or if `size` is negative or too large to allocate. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Room is needed for size + 1 elements; the test is written without
       computing size + 1 so it cannot itself overflow. */
    if (size < 0 || (size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* Compare lengths rather than forming s + n, which could point
           further than one past the end of the buffer. */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the output buffer is wchar_t, like every other store here */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* `ch` still holds the undecodable lead byte here: it is only
           overwritten after a sequence has been validated. */
        *p++ = (wchar_t)(0xDC00 + ch);
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002911
Tim Peters602f7402002-04-27 18:03:26 +00002912/* Allocation strategy: if the string is short, convert into a stack buffer
2913 and allocate exactly as much space needed at the end. Else allocate the
2914 maximum possible needed (4 result bytes per Unicode character), and return
2915 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002916*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002917PyObject *
2918PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 Py_ssize_t size,
2920 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921{
Tim Peters602f7402002-04-27 18:03:26 +00002922#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002923
Guido van Rossum98297ee2007-11-06 21:34:58 +00002924 Py_ssize_t i; /* index into s of next input byte */
2925 PyObject *result; /* result string object */
2926 char *p; /* next free byte in output buffer */
2927 Py_ssize_t nallocated; /* number of result bytes allocated */
2928 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002929 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002930 PyObject *errorHandler = NULL;
2931 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002932
Tim Peters602f7402002-04-27 18:03:26 +00002933 assert(s != NULL);
2934 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935
Tim Peters602f7402002-04-27 18:03:26 +00002936 if (size <= MAX_SHORT_UNICHARS) {
2937 /* Write into the stack buffer; nallocated can't overflow.
2938 * At the end, we'll allocate exactly as much heap space as it
2939 * turns out we need.
2940 */
2941 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002942 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002943 p = stackbuf;
2944 }
2945 else {
2946 /* Overallocate on the heap, and give the excess back at the end. */
2947 nallocated = size * 4;
2948 if (nallocated / 4 != size) /* overflow! */
2949 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002950 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002951 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002952 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002953 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002954 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002955
Tim Peters602f7402002-04-27 18:03:26 +00002956 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002957 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002958
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002959 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002960 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002962
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002964 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002965 *p++ = (char)(0xc0 | (ch >> 6));
2966 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002967 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002968#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002969 /* Special case: check for high and low surrogate */
2970 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2971 Py_UCS4 ch2 = s[i];
2972 /* Combine the two surrogates to form a UCS4 value */
2973 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2974 i++;
2975
2976 /* Encode UCS4 Unicode ordinals */
2977 *p++ = (char)(0xf0 | (ch >> 18));
2978 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002979 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2980 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002981 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002982#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002983 Py_ssize_t newpos;
2984 PyObject *rep;
2985 Py_ssize_t repsize, k;
2986 rep = unicode_encode_call_errorhandler
2987 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2988 s, size, &exc, i-1, i, &newpos);
2989 if (!rep)
2990 goto error;
2991
2992 if (PyBytes_Check(rep))
2993 repsize = PyBytes_GET_SIZE(rep);
2994 else
2995 repsize = PyUnicode_GET_SIZE(rep);
2996
2997 if (repsize > 4) {
2998 Py_ssize_t offset;
2999
3000 if (result == NULL)
3001 offset = p - stackbuf;
3002 else
3003 offset = p - PyBytes_AS_STRING(result);
3004
3005 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3006 /* integer overflow */
3007 PyErr_NoMemory();
3008 goto error;
3009 }
3010 nallocated += repsize - 4;
3011 if (result != NULL) {
3012 if (_PyBytes_Resize(&result, nallocated) < 0)
3013 goto error;
3014 } else {
3015 result = PyBytes_FromStringAndSize(NULL, nallocated);
3016 if (result == NULL)
3017 goto error;
3018 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3019 }
3020 p = PyBytes_AS_STRING(result) + offset;
3021 }
3022
3023 if (PyBytes_Check(rep)) {
3024 char *prep = PyBytes_AS_STRING(rep);
3025 for(k = repsize; k > 0; k--)
3026 *p++ = *prep++;
3027 } else /* rep is unicode */ {
3028 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3029 Py_UNICODE c;
3030
3031 for(k=0; k<repsize; k++) {
3032 c = prep[k];
3033 if (0x80 <= c) {
3034 raise_encode_exception(&exc, "utf-8", s, size,
3035 i-1, i, "surrogates not allowed");
3036 goto error;
3037 }
3038 *p++ = (char)prep[k];
3039 }
3040 }
3041 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003042#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003043 }
Victor Stinner445a6232010-04-22 20:01:57 +00003044#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003045 } else if (ch < 0x10000) {
3046 *p++ = (char)(0xe0 | (ch >> 12));
3047 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3048 *p++ = (char)(0x80 | (ch & 0x3f));
3049 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003050 /* Encode UCS4 Unicode ordinals */
3051 *p++ = (char)(0xf0 | (ch >> 18));
3052 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3053 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3054 *p++ = (char)(0x80 | (ch & 0x3f));
3055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003057
Guido van Rossum98297ee2007-11-06 21:34:58 +00003058 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003059 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003060 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003061 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003062 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003063 }
3064 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003065 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003066 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003067 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003068 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003069 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003070 Py_XDECREF(errorHandler);
3071 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003072 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003073 error:
3074 Py_XDECREF(errorHandler);
3075 Py_XDECREF(exc);
3076 Py_XDECREF(result);
3077 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003078
Tim Peters602f7402002-04-27 18:03:26 +00003079#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080}
3081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3083{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 if (!PyUnicode_Check(unicode)) {
3085 PyErr_BadArgument();
3086 return NULL;
3087 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003088 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 PyUnicode_GET_SIZE(unicode),
3090 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091}
3092
Walter Dörwald41980ca2007-08-16 21:55:45 +00003093/* --- UTF-32 Codec ------------------------------------------------------- */
3094
3095PyObject *
3096PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 Py_ssize_t size,
3098 const char *errors,
3099 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003100{
3101 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3102}
3103
3104PyObject *
3105PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003106 Py_ssize_t size,
3107 const char *errors,
3108 int *byteorder,
3109 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003110{
3111 const char *starts = s;
3112 Py_ssize_t startinpos;
3113 Py_ssize_t endinpos;
3114 Py_ssize_t outpos;
3115 PyUnicodeObject *unicode;
3116 Py_UNICODE *p;
3117#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003118 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003119 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003120#else
3121 const int pairs = 0;
3122#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003123 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003124 int bo = 0; /* assume native ordering by default */
3125 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003126 /* Offsets from q for retrieving bytes in the right order. */
3127#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3128 int iorder[] = {0, 1, 2, 3};
3129#else
3130 int iorder[] = {3, 2, 1, 0};
3131#endif
3132 PyObject *errorHandler = NULL;
3133 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003134
Walter Dörwald41980ca2007-08-16 21:55:45 +00003135 q = (unsigned char *)s;
3136 e = q + size;
3137
3138 if (byteorder)
3139 bo = *byteorder;
3140
3141 /* Check for BOM marks (U+FEFF) in the input and adjust current
3142 byte order setting accordingly. In native mode, the leading BOM
3143 mark is skipped, in all other modes, it is copied to the output
3144 stream as-is (giving a ZWNBSP character). */
3145 if (bo == 0) {
3146 if (size >= 4) {
3147 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003148 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003149#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 if (bom == 0x0000FEFF) {
3151 q += 4;
3152 bo = -1;
3153 }
3154 else if (bom == 0xFFFE0000) {
3155 q += 4;
3156 bo = 1;
3157 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003158#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003159 if (bom == 0x0000FEFF) {
3160 q += 4;
3161 bo = 1;
3162 }
3163 else if (bom == 0xFFFE0000) {
3164 q += 4;
3165 bo = -1;
3166 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003167#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003169 }
3170
3171 if (bo == -1) {
3172 /* force LE */
3173 iorder[0] = 0;
3174 iorder[1] = 1;
3175 iorder[2] = 2;
3176 iorder[3] = 3;
3177 }
3178 else if (bo == 1) {
3179 /* force BE */
3180 iorder[0] = 3;
3181 iorder[1] = 2;
3182 iorder[2] = 1;
3183 iorder[3] = 0;
3184 }
3185
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003186 /* On narrow builds we split characters outside the BMP into two
3187 codepoints => count how much extra space we need. */
3188#ifndef Py_UNICODE_WIDE
Serhiy Storchakadec798e2013-01-08 22:45:42 +02003189 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003190 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3191 pairs++;
3192#endif
3193
3194 /* This might be one to much, because of a BOM */
3195 unicode = _PyUnicode_New((size+3)/4+pairs);
3196 if (!unicode)
3197 return NULL;
3198 if (size == 0)
3199 return (PyObject *)unicode;
3200
3201 /* Unpack UTF-32 encoded data */
3202 p = unicode->str;
3203
Walter Dörwald41980ca2007-08-16 21:55:45 +00003204 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 Py_UCS4 ch;
3206 /* remaining bytes at the end? (size should be divisible by 4) */
3207 if (e-q<4) {
3208 if (consumed)
3209 break;
3210 errmsg = "truncated data";
3211 startinpos = ((const char *)q)-starts;
3212 endinpos = ((const char *)e)-starts;
3213 goto utf32Error;
3214 /* The remaining input chars are ignored if the callback
3215 chooses to skip the input */
3216 }
3217 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3218 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003219
Benjamin Peterson29060642009-01-31 22:14:21 +00003220 if (ch >= 0x110000)
3221 {
3222 errmsg = "codepoint not in range(0x110000)";
3223 startinpos = ((const char *)q)-starts;
3224 endinpos = startinpos+4;
3225 goto utf32Error;
3226 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003227#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 if (ch >= 0x10000)
3229 {
3230 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3231 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3232 }
3233 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003234#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 *p++ = ch;
3236 q += 4;
3237 continue;
3238 utf32Error:
3239 outpos = p-PyUnicode_AS_UNICODE(unicode);
3240 if (unicode_decode_call_errorhandler(
3241 errors, &errorHandler,
3242 "utf32", errmsg,
3243 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3244 &unicode, &outpos, &p))
3245 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003246 }
3247
3248 if (byteorder)
3249 *byteorder = bo;
3250
3251 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003252 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003253
3254 /* Adjust length */
3255 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3256 goto onError;
3257
3258 Py_XDECREF(errorHandler);
3259 Py_XDECREF(exc);
3260 return (PyObject *)unicode;
3261
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003263 Py_DECREF(unicode);
3264 Py_XDECREF(errorHandler);
3265 Py_XDECREF(exc);
3266 return NULL;
3267}
3268
3269PyObject *
3270PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003271 Py_ssize_t size,
3272 const char *errors,
3273 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003274{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003275 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003276 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003277 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003278#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003279 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003280#else
3281 const int pairs = 0;
3282#endif
3283 /* Offsets from p for storing byte pairs in the right order. */
3284#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3285 int iorder[] = {0, 1, 2, 3};
3286#else
3287 int iorder[] = {3, 2, 1, 0};
3288#endif
3289
Benjamin Peterson29060642009-01-31 22:14:21 +00003290#define STORECHAR(CH) \
3291 do { \
3292 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3293 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3294 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3295 p[iorder[0]] = (CH) & 0xff; \
3296 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297 } while(0)
3298
3299 /* In narrow builds we can output surrogate pairs as one codepoint,
3300 so we need less space. */
3301#ifndef Py_UNICODE_WIDE
3302 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3304 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3305 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003306#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003307 nsize = (size - pairs + (byteorder == 0));
3308 bytesize = nsize * 4;
3309 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003311 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003312 if (v == NULL)
3313 return NULL;
3314
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003315 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003316 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003317 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003318 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003319 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320
3321 if (byteorder == -1) {
3322 /* force LE */
3323 iorder[0] = 0;
3324 iorder[1] = 1;
3325 iorder[2] = 2;
3326 iorder[3] = 3;
3327 }
3328 else if (byteorder == 1) {
3329 /* force BE */
3330 iorder[0] = 3;
3331 iorder[1] = 2;
3332 iorder[2] = 1;
3333 iorder[3] = 0;
3334 }
3335
3336 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003338#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003339 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3340 Py_UCS4 ch2 = *s;
3341 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3342 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3343 s++;
3344 size--;
3345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003346 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003347#endif
3348 STORECHAR(ch);
3349 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003350
3351 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003352 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003353#undef STORECHAR
3354}
3355
3356PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3357{
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
3362 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 PyUnicode_GET_SIZE(unicode),
3364 NULL,
3365 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003366}
3367
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368/* --- UTF-16 Codec ------------------------------------------------------- */
3369
Tim Peters772747b2001-08-09 22:21:55 +00003370PyObject *
3371PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 Py_ssize_t size,
3373 const char *errors,
3374 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375{
Walter Dörwald69652032004-09-07 20:24:22 +00003376 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3377}
3378
Antoine Pitrouab868312009-01-10 15:40:25 +00003379/* Two masks for fast checking of whether a C 'long' may contain
3380 UTF16-encoded surrogate characters. This is an efficient heuristic,
3381 assuming that non-surrogate characters with a code point >= 0x8000 are
3382 rare in most input.
3383 FAST_CHAR_MASK is used when the input is in native byte ordering,
3384 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003385*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003386#if (SIZEOF_LONG == 8)
3387# define FAST_CHAR_MASK 0x8000800080008000L
3388# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3389#elif (SIZEOF_LONG == 4)
3390# define FAST_CHAR_MASK 0x80008000L
3391# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3392#else
3393# error C 'long' size should be either 4 or 8!
3394#endif
3395
Walter Dörwald69652032004-09-07 20:24:22 +00003396PyObject *
3397PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003398 Py_ssize_t size,
3399 const char *errors,
3400 int *byteorder,
3401 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003402{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003404 Py_ssize_t startinpos;
3405 Py_ssize_t endinpos;
3406 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 PyUnicodeObject *unicode;
3408 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003409 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003410 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003411 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003412 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003413 /* Offsets from q for retrieving byte pairs in the right order. */
3414#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3415 int ihi = 1, ilo = 0;
3416#else
3417 int ihi = 0, ilo = 1;
3418#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 PyObject *errorHandler = NULL;
3420 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421
3422 /* Note: size will always be longer than the resulting Unicode
3423 character count */
3424 unicode = _PyUnicode_New(size);
3425 if (!unicode)
3426 return NULL;
3427 if (size == 0)
3428 return (PyObject *)unicode;
3429
3430 /* Unpack UTF-16 encoded data */
3431 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003432 q = (unsigned char *)s;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003433 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434
3435 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003436 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003438 /* Check for BOM marks (U+FEFF) in the input and adjust current
3439 byte order setting accordingly. In native mode, the leading BOM
3440 mark is skipped, in all other modes, it is copied to the output
3441 stream as-is (giving a ZWNBSP character). */
3442 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003443 if (size >= 2) {
3444 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 if (bom == 0xFEFF) {
3447 q += 2;
3448 bo = -1;
3449 }
3450 else if (bom == 0xFFFE) {
3451 q += 2;
3452 bo = 1;
3453 }
Tim Petersced69f82003-09-16 20:30:58 +00003454#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 if (bom == 0xFEFF) {
3456 q += 2;
3457 bo = 1;
3458 }
3459 else if (bom == 0xFFFE) {
3460 q += 2;
3461 bo = -1;
3462 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003463#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466
Tim Peters772747b2001-08-09 22:21:55 +00003467 if (bo == -1) {
3468 /* force LE */
3469 ihi = 1;
3470 ilo = 0;
3471 }
3472 else if (bo == 1) {
3473 /* force BE */
3474 ihi = 0;
3475 ilo = 1;
3476 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003477#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3478 native_ordering = ilo < ihi;
3479#else
3480 native_ordering = ilo > ihi;
3481#endif
Tim Peters772747b2001-08-09 22:21:55 +00003482
Antoine Pitrouab868312009-01-10 15:40:25 +00003483 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003484 while (1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003485 Py_UNICODE ch;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003486 if (e - q < 2) {
3487 /* remaining byte at the end? (size should be even) */
3488 if (q == e || consumed)
3489 break;
3490 errmsg = "truncated data";
3491 startinpos = ((const char *)q) - starts;
3492 endinpos = ((const char *)e) - starts;
3493 outpos = p - PyUnicode_AS_UNICODE(unicode);
3494 goto utf16Error;
3495 /* The remaining input chars are ignored if the callback
3496 chooses to skip the input */
3497 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003498 /* First check for possible aligned read of a C 'long'. Unaligned
3499 reads are more expensive, better to defer to another iteration. */
3500 if (!((size_t) q & LONG_PTR_MASK)) {
3501 /* Fast path for runs of non-surrogate chars. */
3502 register const unsigned char *_q = q;
3503 Py_UNICODE *_p = p;
3504 if (native_ordering) {
3505 /* Native ordering is simple: as long as the input cannot
3506 possibly contain a surrogate char, do an unrolled copy
3507 of several 16-bit code points to the target object.
3508 The non-surrogate check is done on several input bytes
3509 at a time (as many as a C 'long' can contain). */
3510 while (_q < aligned_end) {
3511 unsigned long data = * (unsigned long *) _q;
3512 if (data & FAST_CHAR_MASK)
3513 break;
3514 _p[0] = ((unsigned short *) _q)[0];
3515 _p[1] = ((unsigned short *) _q)[1];
3516#if (SIZEOF_LONG == 8)
3517 _p[2] = ((unsigned short *) _q)[2];
3518 _p[3] = ((unsigned short *) _q)[3];
3519#endif
3520 _q += SIZEOF_LONG;
3521 _p += SIZEOF_LONG / 2;
3522 }
3523 }
3524 else {
3525 /* Byteswapped ordering is similar, but we must decompose
3526 the copy bytewise, and take care of zero'ing out the
3527 upper bytes if the target object is in 32-bit units
3528 (that is, in UCS-4 builds). */
3529 while (_q < aligned_end) {
3530 unsigned long data = * (unsigned long *) _q;
3531 if (data & SWAPPED_FAST_CHAR_MASK)
3532 break;
3533 /* Zero upper bytes in UCS-4 builds */
3534#if (Py_UNICODE_SIZE > 2)
3535 _p[0] = 0;
3536 _p[1] = 0;
3537#if (SIZEOF_LONG == 8)
3538 _p[2] = 0;
3539 _p[3] = 0;
3540#endif
3541#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003542 /* Issue #4916; UCS-4 builds on big endian machines must
3543 fill the two last bytes of each 4-byte unit. */
3544#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3545# define OFF 2
3546#else
3547# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003548#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003549 ((unsigned char *) _p)[OFF + 1] = _q[0];
3550 ((unsigned char *) _p)[OFF + 0] = _q[1];
3551 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3552 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3553#if (SIZEOF_LONG == 8)
3554 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3555 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3556 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3557 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3558#endif
3559#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003560 _q += SIZEOF_LONG;
3561 _p += SIZEOF_LONG / 2;
3562 }
3563 }
3564 p = _p;
3565 q = _q;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003566 if (e - q < 2)
3567 continue;
Antoine Pitrouab868312009-01-10 15:40:25 +00003568 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570
Benjamin Peterson14339b62009-01-31 16:36:08 +00003571 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003572
3573 if (ch < 0xD800 || ch > 0xDFFF) {
3574 *p++ = ch;
3575 continue;
3576 }
3577
3578 /* UTF-16 code pair: */
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003579 if (e - q < 2) {
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02003580 q -= 2;
3581 if (consumed)
3582 break;
Benjamin Peterson29060642009-01-31 22:14:21 +00003583 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02003584 startinpos = ((const char *)q) - starts;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003585 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00003586 goto utf16Error;
3587 }
3588 if (0xD800 <= ch && ch <= 0xDBFF) {
3589 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3590 q += 2;
3591 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003592#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003593 *p++ = ch;
3594 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003595#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003597#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 continue;
3599 }
3600 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003601 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 startinpos = (((const char *)q)-4)-starts;
3603 endinpos = startinpos+2;
3604 goto utf16Error;
3605 }
3606
Benjamin Peterson14339b62009-01-31 16:36:08 +00003607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003608 errmsg = "illegal encoding";
3609 startinpos = (((const char *)q)-2)-starts;
3610 endinpos = startinpos+2;
3611 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003612
Benjamin Peterson29060642009-01-31 22:14:21 +00003613 utf16Error:
3614 outpos = p - PyUnicode_AS_UNICODE(unicode);
3615 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003616 errors,
3617 &errorHandler,
3618 "utf16", errmsg,
3619 &starts,
3620 (const char **)&e,
3621 &startinpos,
3622 &endinpos,
3623 &exc,
3624 (const char **)&q,
3625 &unicode,
3626 &outpos,
3627 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 goto onError;
Antoine Pitroub4bbee22012-07-21 00:45:14 +02003629 /* Update data because unicode_decode_call_errorhandler might have
3630 changed the input object. */
3631 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Antoine Pitrouab868312009-01-10 15:40:25 +00003632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633
3634 if (byteorder)
3635 *byteorder = bo;
3636
Walter Dörwald69652032004-09-07 20:24:22 +00003637 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003639
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003641 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 goto onError;
3643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 Py_XDECREF(errorHandler);
3645 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 return (PyObject *)unicode;
3647
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 Py_XDECREF(errorHandler);
3651 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 return NULL;
3653}
3654
Antoine Pitrouab868312009-01-10 15:40:25 +00003655#undef FAST_CHAR_MASK
3656#undef SWAPPED_FAST_CHAR_MASK
3657
Tim Peters772747b2001-08-09 22:21:55 +00003658PyObject *
3659PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 Py_ssize_t size,
3661 const char *errors,
3662 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003664 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003665 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003666 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003667#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003668 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003669#else
3670 const int pairs = 0;
3671#endif
Tim Peters772747b2001-08-09 22:21:55 +00003672 /* Offsets from p for storing byte pairs in the right order. */
3673#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3674 int ihi = 1, ilo = 0;
3675#else
3676 int ihi = 0, ilo = 1;
3677#endif
3678
Benjamin Peterson29060642009-01-31 22:14:21 +00003679#define STORECHAR(CH) \
3680 do { \
3681 p[ihi] = ((CH) >> 8) & 0xff; \
3682 p[ilo] = (CH) & 0xff; \
3683 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003684 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003686#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003687 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 if (s[i] >= 0x10000)
3689 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003690#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003691 /* 2 * (size + pairs + (byteorder == 0)) */
3692 if (size > PY_SSIZE_T_MAX ||
3693 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003695 nsize = size + pairs + (byteorder == 0);
3696 bytesize = nsize * 2;
3697 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003699 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 if (v == NULL)
3701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003703 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003706 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003707 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003708
3709 if (byteorder == -1) {
3710 /* force LE */
3711 ihi = 1;
3712 ilo = 0;
3713 }
3714 else if (byteorder == 1) {
3715 /* force BE */
3716 ihi = 0;
3717 ilo = 1;
3718 }
3719
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003720 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 Py_UNICODE ch = *s++;
3722 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003723#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003724 if (ch >= 0x10000) {
3725 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3726 ch = 0xD800 | ((ch-0x10000) >> 10);
3727 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003728#endif
Tim Peters772747b2001-08-09 22:21:55 +00003729 STORECHAR(ch);
3730 if (ch2)
3731 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003732 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003733
3734 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003735 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003736#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737}
3738
3739PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3740{
3741 if (!PyUnicode_Check(unicode)) {
3742 PyErr_BadArgument();
3743 return NULL;
3744 }
3745 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 PyUnicode_GET_SIZE(unicode),
3747 NULL,
3748 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749}
3750
3751/* --- Unicode Escape Codec ----------------------------------------------- */
3752
Fredrik Lundh06d12682001-01-24 07:59:11 +00003753static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003754
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 Py_ssize_t size,
3757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003760 Py_ssize_t startinpos;
3761 Py_ssize_t endinpos;
3762 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003767 char* message;
3768 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 PyObject *errorHandler = NULL;
3770 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003771
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 /* Escaped strings will always be longer than the resulting
3773 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 length after conversion to the true value.
3775 (but if the error callback returns a long replacement string
3776 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 v = _PyUnicode_New(size);
3778 if (v == NULL)
3779 goto onError;
3780 if (size == 0)
3781 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003785
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786 while (s < end) {
3787 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003788 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003789 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790
3791 /* Non-escape characters are interpreted as Unicode ordinals */
3792 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003793 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794 continue;
3795 }
3796
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 /* \ - Escapes */
3799 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003800 c = *s++;
3801 if (s > end)
3802 c = '\0'; /* Invalid after \ */
3803 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 case '\n': break;
3807 case '\\': *p++ = '\\'; break;
3808 case '\'': *p++ = '\''; break;
3809 case '\"': *p++ = '\"'; break;
3810 case 'b': *p++ = '\b'; break;
3811 case 'f': *p++ = '\014'; break; /* FF */
3812 case 't': *p++ = '\t'; break;
3813 case 'n': *p++ = '\n'; break;
3814 case 'r': *p++ = '\r'; break;
3815 case 'v': *p++ = '\013'; break; /* VT */
3816 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3817
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 case '0': case '1': case '2': case '3':
3820 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003821 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003822 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003823 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003824 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003825 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003827 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 break;
3829
Benjamin Peterson29060642009-01-31 22:14:21 +00003830 /* hex escapes */
3831 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003833 digits = 2;
3834 message = "truncated \\xXX escape";
3835 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003839 digits = 4;
3840 message = "truncated \\uXXXX escape";
3841 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003844 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003845 digits = 8;
3846 message = "truncated \\UXXXXXXXX escape";
3847 hexescape:
3848 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 outpos = p-PyUnicode_AS_UNICODE(v);
3850 if (s+digits>end) {
3851 endinpos = size;
3852 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 errors, &errorHandler,
3854 "unicodeescape", "end of string in escape sequence",
3855 &starts, &end, &startinpos, &endinpos, &exc, &s,
3856 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 goto onError;
3858 goto nextByte;
3859 }
3860 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003861 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003862 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003863 endinpos = (s+i+1)-starts;
3864 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 errors, &errorHandler,
3866 "unicodeescape", message,
3867 &starts, &end, &startinpos, &endinpos, &exc, &s,
3868 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003869 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003871 }
3872 chr = (chr<<4) & ~0xF;
3873 if (c >= '0' && c <= '9')
3874 chr += c - '0';
3875 else if (c >= 'a' && c <= 'f')
3876 chr += 10 + c - 'a';
3877 else
3878 chr += 10 + c - 'A';
3879 }
3880 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003881 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 /* _decoding_error will have already written into the
3883 target buffer. */
3884 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003885 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003886 /* when we get here, chr is a 32-bit unicode character */
3887 if (chr <= 0xffff)
3888 /* UCS-2 character */
3889 *p++ = (Py_UNICODE) chr;
3890 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003891 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003892 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003893#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003894 *p++ = chr;
3895#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003896 chr -= 0x10000L;
3897 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003898 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003899#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003900 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 endinpos = s-starts;
3902 outpos = p-PyUnicode_AS_UNICODE(v);
3903 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003904 errors, &errorHandler,
3905 "unicodeescape", "illegal Unicode character",
3906 &starts, &end, &startinpos, &endinpos, &exc, &s,
3907 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003908 goto onError;
3909 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003910 break;
3911
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003913 case 'N':
3914 message = "malformed \\N character escape";
3915 if (ucnhash_CAPI == NULL) {
3916 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003917 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003918 if (ucnhash_CAPI == NULL)
3919 goto ucnhashError;
3920 }
3921 if (*s == '{') {
3922 const char *start = s+1;
3923 /* look for the closing brace */
3924 while (*s != '}' && s < end)
3925 s++;
3926 if (s > start && s < end && *s == '}') {
3927 /* found a name. look it up in the unicode database */
3928 message = "unknown Unicode character name";
3929 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02003930 if (s - start - 1 <= INT_MAX &&
3931 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003932 goto store;
3933 }
3934 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 endinpos = s-starts;
3936 outpos = p-PyUnicode_AS_UNICODE(v);
3937 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003938 errors, &errorHandler,
3939 "unicodeescape", message,
3940 &starts, &end, &startinpos, &endinpos, &exc, &s,
3941 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003942 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003943 break;
3944
3945 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003946 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 message = "\\ at end of string";
3948 s--;
3949 endinpos = s-starts;
3950 outpos = p-PyUnicode_AS_UNICODE(v);
3951 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003952 errors, &errorHandler,
3953 "unicodeescape", message,
3954 &starts, &end, &startinpos, &endinpos, &exc, &s,
3955 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003956 goto onError;
3957 }
3958 else {
3959 *p++ = '\\';
3960 *p++ = (unsigned char)s[-1];
3961 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003962 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003967 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003969 Py_XDECREF(errorHandler);
3970 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003972
Benjamin Peterson29060642009-01-31 22:14:21 +00003973 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003974 PyErr_SetString(
3975 PyExc_UnicodeError,
3976 "\\N escapes not supported (can't load unicodedata module)"
3977 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003978 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 Py_XDECREF(errorHandler);
3980 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003981 return NULL;
3982
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 Py_XDECREF(errorHandler);
3986 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 return NULL;
3988}
3989
3990/* Return a Unicode-Escape string version of the Unicode object.
3991
3992 If quotes is true, the string is enclosed in u"" or u'' quotes as
3993 appropriate.
3994
3995*/
3996
Thomas Wouters477c8d52006-05-27 19:21:47 +00003997Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 Py_ssize_t size,
3999 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004000{
4001 /* like wcschr, but doesn't stop at NULL characters */
4002
4003 while (size-- > 0) {
4004 if (*s == ch)
4005 return s;
4006 s++;
4007 }
4008
4009 return NULL;
4010}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004011
Walter Dörwald79e913e2007-05-12 11:08:06 +00004012static const char *hexdigits = "0123456789abcdef";
4013
4014PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004017 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004020#ifdef Py_UNICODE_WIDE
4021 const Py_ssize_t expandsize = 10;
4022#else
4023 const Py_ssize_t expandsize = 6;
4024#endif
4025
Thomas Wouters89f507f2006-12-13 04:49:30 +00004026 /* XXX(nnorwitz): rather than over-allocating, it would be
4027 better to choose a different scheme. Perhaps scan the
4028 first N-chars of the string and allocate based on that size.
4029 */
4030 /* Initial allocation is based on the longest-possible unichr
4031 escape.
4032
4033 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4034 unichr, so in this case it's the longest unichr escape. In
4035 narrow (UTF-16) builds this is five chars per source unichr
4036 since there are two unichrs in the surrogate pair, so in narrow
4037 (UTF-16) builds it's not the longest unichr escape.
4038
4039 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4040 so in the narrow (UTF-16) build case it's the longest unichr
4041 escape.
4042 */
4043
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004044 if (size == 0)
4045 return PyBytes_FromStringAndSize(NULL, 0);
4046
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004047 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004048 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004049
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004050 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004051 2
4052 + expandsize*size
4053 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054 if (repr == NULL)
4055 return NULL;
4056
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004057 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 while (size-- > 0) {
4060 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004061
Walter Dörwald79e913e2007-05-12 11:08:06 +00004062 /* Escape backslashes */
4063 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 *p++ = '\\';
4065 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004066 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004067 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004068
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004069#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004070 /* Map 21-bit characters to '\U00xxxxxx' */
4071 else if (ch >= 0x10000) {
4072 *p++ = '\\';
4073 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004074 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4075 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4076 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4077 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4078 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4079 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4080 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4081 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004083 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004084#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004085 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4086 else if (ch >= 0xD800 && ch < 0xDC00) {
4087 Py_UNICODE ch2;
4088 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004089
Benjamin Peterson29060642009-01-31 22:14:21 +00004090 ch2 = *s++;
4091 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004092 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004093 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4094 *p++ = '\\';
4095 *p++ = 'U';
4096 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4097 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4098 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4099 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4100 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4101 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4102 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4103 *p++ = hexdigits[ucs & 0x0000000F];
4104 continue;
4105 }
4106 /* Fall through: isolated surrogates are copied as-is */
4107 s--;
4108 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004109 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004110#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004113 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114 *p++ = '\\';
4115 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004116 *p++ = hexdigits[(ch >> 12) & 0x000F];
4117 *p++ = hexdigits[(ch >> 8) & 0x000F];
4118 *p++ = hexdigits[(ch >> 4) & 0x000F];
4119 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004121
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004122 /* Map special whitespace to '\t', \n', '\r' */
4123 else if (ch == '\t') {
4124 *p++ = '\\';
4125 *p++ = 't';
4126 }
4127 else if (ch == '\n') {
4128 *p++ = '\\';
4129 *p++ = 'n';
4130 }
4131 else if (ch == '\r') {
4132 *p++ = '\\';
4133 *p++ = 'r';
4134 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004135
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004136 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004137 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004139 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004140 *p++ = hexdigits[(ch >> 4) & 0x000F];
4141 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004142 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004143
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 /* Copy everything else as-is */
4145 else
4146 *p++ = (char) ch;
4147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004149 assert(p - PyBytes_AS_STRING(repr) > 0);
4150 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4151 return NULL;
4152 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153}
4154
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004155PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004157 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158 if (!PyUnicode_Check(unicode)) {
4159 PyErr_BadArgument();
4160 return NULL;
4161 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004162 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4163 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004164 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165}
4166
4167/* --- Raw Unicode Escape Codec ------------------------------------------- */
4168
4169PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004170 Py_ssize_t size,
4171 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004174 Py_ssize_t startinpos;
4175 Py_ssize_t endinpos;
4176 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 const char *end;
4180 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 PyObject *errorHandler = NULL;
4182 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004183
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184 /* Escaped strings will always be longer than the resulting
4185 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186 length after conversion to the true value. (But decoding error
4187 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188 v = _PyUnicode_New(size);
4189 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 end = s + size;
4195 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 unsigned char c;
4197 Py_UCS4 x;
4198 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004199 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 /* Non-escape characters are interpreted as Unicode ordinals */
4202 if (*s != '\\') {
4203 *p++ = (unsigned char)*s++;
4204 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004205 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 startinpos = s-starts;
4207
4208 /* \u-escapes are only interpreted iff the number of leading
4209 backslashes if odd */
4210 bs = s;
4211 for (;s < end;) {
4212 if (*s != '\\')
4213 break;
4214 *p++ = (unsigned char)*s++;
4215 }
4216 if (((s - bs) & 1) == 0 ||
4217 s >= end ||
4218 (*s != 'u' && *s != 'U')) {
4219 continue;
4220 }
4221 p--;
4222 count = *s=='u' ? 4 : 8;
4223 s++;
4224
4225 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4226 outpos = p-PyUnicode_AS_UNICODE(v);
4227 for (x = 0, i = 0; i < count; ++i, ++s) {
4228 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004229 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 endinpos = s-starts;
4231 if (unicode_decode_call_errorhandler(
4232 errors, &errorHandler,
4233 "rawunicodeescape", "truncated \\uXXXX",
4234 &starts, &end, &startinpos, &endinpos, &exc, &s,
4235 &v, &outpos, &p))
4236 goto onError;
4237 goto nextByte;
4238 }
4239 x = (x<<4) & ~0xF;
4240 if (c >= '0' && c <= '9')
4241 x += c - '0';
4242 else if (c >= 'a' && c <= 'f')
4243 x += 10 + c - 'a';
4244 else
4245 x += 10 + c - 'A';
4246 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004247 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004248 /* UCS-2 character */
4249 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004250 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 /* UCS-4 character. Either store directly, or as
4252 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004253#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004255#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 x -= 0x10000L;
4257 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4258 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004259#endif
4260 } else {
4261 endinpos = s-starts;
4262 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004263 if (unicode_decode_call_errorhandler(
4264 errors, &errorHandler,
4265 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 &starts, &end, &startinpos, &endinpos, &exc, &s,
4267 &v, &outpos, &p))
4268 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004269 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004270 nextByte:
4271 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004273 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 Py_XDECREF(errorHandler);
4276 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004278
Benjamin Peterson29060642009-01-31 22:14:21 +00004279 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281 Py_XDECREF(errorHandler);
4282 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 return NULL;
4284}
4285
4286PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004289 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290 char *p;
4291 char *q;
4292
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004293#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004294 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004295#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004296 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004297#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004298
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004299 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004301
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004302 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 if (repr == NULL)
4304 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004305 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004306 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004308 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309 while (size-- > 0) {
4310 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004311#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 /* Map 32-bit characters to '\Uxxxxxxxx' */
4313 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004314 *p++ = '\\';
4315 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004316 *p++ = hexdigits[(ch >> 28) & 0xf];
4317 *p++ = hexdigits[(ch >> 24) & 0xf];
4318 *p++ = hexdigits[(ch >> 20) & 0xf];
4319 *p++ = hexdigits[(ch >> 16) & 0xf];
4320 *p++ = hexdigits[(ch >> 12) & 0xf];
4321 *p++ = hexdigits[(ch >> 8) & 0xf];
4322 *p++ = hexdigits[(ch >> 4) & 0xf];
4323 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004324 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004325 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004326#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004327 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4328 if (ch >= 0xD800 && ch < 0xDC00) {
4329 Py_UNICODE ch2;
4330 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004331
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 ch2 = *s++;
4333 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004334 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004335 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4336 *p++ = '\\';
4337 *p++ = 'U';
4338 *p++ = hexdigits[(ucs >> 28) & 0xf];
4339 *p++ = hexdigits[(ucs >> 24) & 0xf];
4340 *p++ = hexdigits[(ucs >> 20) & 0xf];
4341 *p++ = hexdigits[(ucs >> 16) & 0xf];
4342 *p++ = hexdigits[(ucs >> 12) & 0xf];
4343 *p++ = hexdigits[(ucs >> 8) & 0xf];
4344 *p++ = hexdigits[(ucs >> 4) & 0xf];
4345 *p++ = hexdigits[ucs & 0xf];
4346 continue;
4347 }
4348 /* Fall through: isolated surrogates are copied as-is */
4349 s--;
4350 size++;
4351 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004352#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 /* Map 16-bit characters to '\uxxxx' */
4354 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355 *p++ = '\\';
4356 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004357 *p++ = hexdigits[(ch >> 12) & 0xf];
4358 *p++ = hexdigits[(ch >> 8) & 0xf];
4359 *p++ = hexdigits[(ch >> 4) & 0xf];
4360 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004362 /* Copy everything else as-is */
4363 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 *p++ = (char) ch;
4365 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004366 size = p - q;
4367
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004368 assert(size > 0);
4369 if (_PyBytes_Resize(&repr, size) < 0)
4370 return NULL;
4371 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372}
4373
4374PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4375{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004376 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004378 PyErr_BadArgument();
4379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004381 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4382 PyUnicode_GET_SIZE(unicode));
4383
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004384 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385}
4386
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004387/* --- Unicode Internal Codec ------------------------------------------- */
4388
4389PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 Py_ssize_t size,
4391 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004392{
4393 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004394 Py_ssize_t startinpos;
4395 Py_ssize_t endinpos;
4396 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004397 PyUnicodeObject *v;
4398 Py_UNICODE *p;
4399 const char *end;
4400 const char *reason;
4401 PyObject *errorHandler = NULL;
4402 PyObject *exc = NULL;
4403
Neal Norwitzd43069c2006-01-08 01:12:10 +00004404#ifdef Py_UNICODE_WIDE
4405 Py_UNICODE unimax = PyUnicode_GetMax();
4406#endif
4407
Thomas Wouters89f507f2006-12-13 04:49:30 +00004408 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004409 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4410 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004412 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004414 p = PyUnicode_AS_UNICODE(v);
4415 end = s + size;
4416
4417 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004418 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004419 /* We have to sanity check the raw data, otherwise doom looms for
4420 some malformed UCS-4 data. */
4421 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004422#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004423 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004424#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004425 end-s < Py_UNICODE_SIZE
4426 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004428 startinpos = s - starts;
4429 if (end-s < Py_UNICODE_SIZE) {
4430 endinpos = end-starts;
4431 reason = "truncated input";
4432 }
4433 else {
4434 endinpos = s - starts + Py_UNICODE_SIZE;
4435 reason = "illegal code point (> 0x10FFFF)";
4436 }
4437 outpos = p - PyUnicode_AS_UNICODE(v);
4438 if (unicode_decode_call_errorhandler(
4439 errors, &errorHandler,
4440 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004441 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004442 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004443 goto onError;
4444 }
4445 }
4446 else {
4447 p++;
4448 s += Py_UNICODE_SIZE;
4449 }
4450 }
4451
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004452 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004453 goto onError;
4454 Py_XDECREF(errorHandler);
4455 Py_XDECREF(exc);
4456 return (PyObject *)v;
4457
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004459 Py_XDECREF(v);
4460 Py_XDECREF(errorHandler);
4461 Py_XDECREF(exc);
4462 return NULL;
4463}
4464
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465/* --- Latin-1 Codec ------------------------------------------------------ */
4466
4467PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004468 Py_ssize_t size,
4469 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470{
4471 PyUnicodeObject *v;
4472 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004473 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004474
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004476 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 Py_UNICODE r = *(unsigned char*)s;
4478 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004479 }
4480
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 v = _PyUnicode_New(size);
4482 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004487 e = s + size;
4488 /* Unrolling the copy makes it much faster by reducing the looping
4489 overhead. This is similar to what many memcpy() implementations do. */
4490 unrolled_end = e - 4;
4491 while (s < unrolled_end) {
4492 p[0] = (unsigned char) s[0];
4493 p[1] = (unsigned char) s[1];
4494 p[2] = (unsigned char) s[2];
4495 p[3] = (unsigned char) s[3];
4496 s += 4;
4497 p += 4;
4498 }
4499 while (s < e)
4500 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004502
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 Py_XDECREF(v);
4505 return NULL;
4506}
4507
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508/* create or adjust a UnicodeEncodeError */
4509static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004510 const char *encoding,
4511 const Py_UNICODE *unicode, Py_ssize_t size,
4512 Py_ssize_t startpos, Py_ssize_t endpos,
4513 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 *exceptionObject = PyUnicodeEncodeError_Create(
4517 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
4519 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4521 goto onError;
4522 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4523 goto onError;
4524 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4525 goto onError;
4526 return;
4527 onError:
4528 Py_DECREF(*exceptionObject);
4529 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 }
4531}
4532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533/* raises a UnicodeEncodeError */
4534static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 const char *encoding,
4536 const Py_UNICODE *unicode, Py_ssize_t size,
4537 Py_ssize_t startpos, Py_ssize_t endpos,
4538 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539{
4540 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544}
4545
4546/* error handling callback helper:
4547 build arguments, call the callback and check the arguments,
4548 put the result into newpos and return the replacement string, which
4549 has to be freed by the caller */
4550static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 PyObject **errorHandler,
4552 const char *encoding, const char *reason,
4553 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4554 Py_ssize_t startpos, Py_ssize_t endpos,
4555 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004557 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558
4559 PyObject *restuple;
4560 PyObject *resunicode;
4561
4562 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004565 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 }
4567
4568 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004571 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572
4573 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004578 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 Py_DECREF(restuple);
4580 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004582 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 &resunicode, newpos)) {
4584 Py_DECREF(restuple);
4585 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004587 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4588 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4589 Py_DECREF(restuple);
4590 return NULL;
4591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004594 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4596 Py_DECREF(restuple);
4597 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 Py_INCREF(resunicode);
4600 Py_DECREF(restuple);
4601 return resunicode;
4602}
4603
4604static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 Py_ssize_t size,
4606 const char *errors,
4607 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608{
4609 /* output object */
4610 PyObject *res;
4611 /* pointers to the beginning and end+1 of input */
4612 const Py_UNICODE *startp = p;
4613 const Py_UNICODE *endp = p + size;
4614 /* pointer to the beginning of the unencodable characters */
4615 /* const Py_UNICODE *badp = NULL; */
4616 /* pointer into the output */
4617 char *str;
4618 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004619 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004620 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4621 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004622 PyObject *errorHandler = NULL;
4623 PyObject *exc = NULL;
4624 /* the following variable is used for caching string comparisons
4625 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4626 int known_errorHandler = -1;
4627
4628 /* allocate enough for a simple encoding without
4629 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004630 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004631 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004632 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004634 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004635 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 ressize = size;
4637
4638 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004639 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004640
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 /* can we encode this? */
4642 if (c<limit) {
4643 /* no overflow check, because we know that the space is enough */
4644 *str++ = (char)c;
4645 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004646 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 else {
4648 Py_ssize_t unicodepos = p-startp;
4649 Py_ssize_t requiredsize;
4650 PyObject *repunicode;
4651 Py_ssize_t repsize;
4652 Py_ssize_t newpos;
4653 Py_ssize_t respos;
4654 Py_UNICODE *uni2;
4655 /* startpos for collecting unencodable chars */
4656 const Py_UNICODE *collstart = p;
4657 const Py_UNICODE *collend = p;
4658 /* find all unecodable characters */
4659 while ((collend < endp) && ((*collend)>=limit))
4660 ++collend;
4661 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4662 if (known_errorHandler==-1) {
4663 if ((errors==NULL) || (!strcmp(errors, "strict")))
4664 known_errorHandler = 1;
4665 else if (!strcmp(errors, "replace"))
4666 known_errorHandler = 2;
4667 else if (!strcmp(errors, "ignore"))
4668 known_errorHandler = 3;
4669 else if (!strcmp(errors, "xmlcharrefreplace"))
4670 known_errorHandler = 4;
4671 else
4672 known_errorHandler = 0;
4673 }
4674 switch (known_errorHandler) {
4675 case 1: /* strict */
4676 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4677 goto onError;
4678 case 2: /* replace */
4679 while (collstart++<collend)
4680 *str++ = '?'; /* fall through */
4681 case 3: /* ignore */
4682 p = collend;
4683 break;
4684 case 4: /* xmlcharrefreplace */
4685 respos = str - PyBytes_AS_STRING(res);
4686 /* determine replacement size (temporarily (mis)uses p) */
4687 for (p = collstart, repsize = 0; p < collend; ++p) {
4688 if (*p<10)
4689 repsize += 2+1+1;
4690 else if (*p<100)
4691 repsize += 2+2+1;
4692 else if (*p<1000)
4693 repsize += 2+3+1;
4694 else if (*p<10000)
4695 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004696#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 else
4698 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004699#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 else if (*p<100000)
4701 repsize += 2+5+1;
4702 else if (*p<1000000)
4703 repsize += 2+6+1;
4704 else
4705 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004706#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004707 }
4708 requiredsize = respos+repsize+(endp-collend);
4709 if (requiredsize > ressize) {
4710 if (requiredsize<2*ressize)
4711 requiredsize = 2*ressize;
4712 if (_PyBytes_Resize(&res, requiredsize))
4713 goto onError;
4714 str = PyBytes_AS_STRING(res) + respos;
4715 ressize = requiredsize;
4716 }
4717 /* generate replacement (temporarily (mis)uses p) */
4718 for (p = collstart; p < collend; ++p) {
4719 str += sprintf(str, "&#%d;", (int)*p);
4720 }
4721 p = collend;
4722 break;
4723 default:
4724 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4725 encoding, reason, startp, size, &exc,
4726 collstart-startp, collend-startp, &newpos);
4727 if (repunicode == NULL)
4728 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004729 if (PyBytes_Check(repunicode)) {
4730 /* Directly copy bytes result to output. */
4731 repsize = PyBytes_Size(repunicode);
4732 if (repsize > 1) {
4733 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004734 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004735 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4736 Py_DECREF(repunicode);
4737 goto onError;
4738 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004739 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004740 ressize += repsize-1;
4741 }
4742 memcpy(str, PyBytes_AsString(repunicode), repsize);
4743 str += repsize;
4744 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004745 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004746 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004747 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004748 /* need more space? (at least enough for what we
4749 have+the replacement+the rest of the string, so
4750 we won't have to check space for encodable characters) */
4751 respos = str - PyBytes_AS_STRING(res);
4752 repsize = PyUnicode_GET_SIZE(repunicode);
4753 requiredsize = respos+repsize+(endp-collend);
4754 if (requiredsize > ressize) {
4755 if (requiredsize<2*ressize)
4756 requiredsize = 2*ressize;
4757 if (_PyBytes_Resize(&res, requiredsize)) {
4758 Py_DECREF(repunicode);
4759 goto onError;
4760 }
4761 str = PyBytes_AS_STRING(res) + respos;
4762 ressize = requiredsize;
4763 }
4764 /* check if there is anything unencodable in the replacement
4765 and copy it to the output */
4766 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4767 c = *uni2;
4768 if (c >= limit) {
4769 raise_encode_exception(&exc, encoding, startp, size,
4770 unicodepos, unicodepos+1, reason);
4771 Py_DECREF(repunicode);
4772 goto onError;
4773 }
4774 *str = (char)c;
4775 }
4776 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004777 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004778 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004779 }
4780 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004781 /* Resize if we allocated to much */
4782 size = str - PyBytes_AS_STRING(res);
4783 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004784 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004785 if (_PyBytes_Resize(&res, size) < 0)
4786 goto onError;
4787 }
4788
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 Py_XDECREF(errorHandler);
4790 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004791 return res;
4792
4793 onError:
4794 Py_XDECREF(res);
4795 Py_XDECREF(errorHandler);
4796 Py_XDECREF(exc);
4797 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798}
4799
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004801 Py_ssize_t size,
4802 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004804 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805}
4806
4807PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4808{
4809 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 PyErr_BadArgument();
4811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 }
4813 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004814 PyUnicode_GET_SIZE(unicode),
4815 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816}
4817
4818/* --- 7-bit ASCII Codec -------------------------------------------------- */
4819
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 Py_ssize_t size,
4822 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825 PyUnicodeObject *v;
4826 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004827 Py_ssize_t startinpos;
4828 Py_ssize_t endinpos;
4829 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 const char *e;
4831 PyObject *errorHandler = NULL;
4832 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004833
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004835 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 Py_UNICODE r = *(unsigned char*)s;
4837 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004838 }
Tim Petersced69f82003-09-16 20:30:58 +00004839
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 v = _PyUnicode_New(size);
4841 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 e = s + size;
4847 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 register unsigned char c = (unsigned char)*s;
4849 if (c < 128) {
4850 *p++ = c;
4851 ++s;
4852 }
4853 else {
4854 startinpos = s-starts;
4855 endinpos = startinpos + 1;
4856 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4857 if (unicode_decode_call_errorhandler(
4858 errors, &errorHandler,
4859 "ascii", "ordinal not in range(128)",
4860 &starts, &e, &startinpos, &endinpos, &exc, &s,
4861 &v, &outpos, &p))
4862 goto onError;
4863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004865 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4867 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 Py_XDECREF(errorHandler);
4869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004871
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874 Py_XDECREF(errorHandler);
4875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 return NULL;
4877}
4878
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 Py_ssize_t size,
4881 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884}
4885
4886PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4887{
4888 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004889 PyErr_BadArgument();
4890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 }
4892 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 PyUnicode_GET_SIZE(unicode),
4894 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895}
4896
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004897#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004898
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004899/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004900
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004901#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004902#define NEED_RETRY
4903#endif
4904
4905/* XXX This code is limited to "true" double-byte encodings, as
4906 a) it assumes an incomplete character consists of a single byte, and
4907 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004909
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that starts
   a character, as opposed to a lead-byte value appearing as the second
   byte of the preceding character.  The preceding character is located
   with CharPrev(). */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
4920
4921/*
4922 * Decode MBCS string into unicode object. If 'final' is set, converts
4923 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4924 */
4925static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 const char *s, /* MBCS string */
4927 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004928 int final,
4929 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004930{
4931 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004932 Py_ssize_t n;
4933 DWORD usize;
4934 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004935
4936 assert(size >= 0);
4937
Victor Stinner554f3f02010-06-16 23:33:54 +00004938 /* check and handle 'errors' arg */
4939 if (errors==NULL || strcmp(errors, "strict")==0)
4940 flags = MB_ERR_INVALID_CHARS;
4941 else if (strcmp(errors, "ignore")==0)
4942 flags = 0;
4943 else {
4944 PyErr_Format(PyExc_ValueError,
4945 "mbcs encoding does not support errors='%s'",
4946 errors);
4947 return -1;
4948 }
4949
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004950 /* Skip trailing lead-byte unless 'final' is set */
4951 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004953
4954 /* First get the size of the result */
4955 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004956 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4957 if (usize==0)
4958 goto mbcs_decode_error;
4959 } else
4960 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004961
4962 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 /* Create unicode object */
4964 *v = _PyUnicode_New(usize);
4965 if (*v == NULL)
4966 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004967 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004968 }
4969 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 /* Extend unicode object */
4971 n = PyUnicode_GET_SIZE(*v);
4972 if (_PyUnicode_Resize(v, n + usize) < 0)
4973 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004974 }
4975
4976 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004977 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004979 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4980 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004982 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004983 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004984
4985mbcs_decode_error:
4986 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4987 we raise a UnicodeDecodeError - else it is a 'generic'
4988 windows error
4989 */
4990 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4991 /* Ideally, we should get reason from FormatMessage - this
4992 is the Windows 2000 English version of the message
4993 */
4994 PyObject *exc = NULL;
4995 const char *reason = "No mapping for the Unicode character exists "
4996 "in the target multi-byte code page.";
4997 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4998 if (exc != NULL) {
4999 PyCodec_StrictErrors(exc);
5000 Py_DECREF(exc);
5001 }
5002 } else {
5003 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5004 }
5005 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005006}
5007
5008PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 Py_ssize_t size,
5010 const char *errors,
5011 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005012{
5013 PyUnicodeObject *v = NULL;
5014 int done;
5015
5016 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018
5019#ifdef NEED_RETRY
5020 retry:
5021 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005022 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005023 else
5024#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005025 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005026
5027 if (done < 0) {
5028 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005030 }
5031
5032 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005034
5035#ifdef NEED_RETRY
5036 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 s += done;
5038 size -= done;
5039 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005040 }
5041#endif
5042
5043 return (PyObject *)v;
5044}
5045
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005046PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 Py_ssize_t size,
5048 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005049{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005050 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5051}
5052
5053/*
5054 * Convert unicode into string object (MBCS).
5055 * Returns 0 if succeed, -1 otherwise.
5056 */
5057static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005059 int size, /* size of unicode */
5060 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005061{
Victor Stinner554f3f02010-06-16 23:33:54 +00005062 BOOL usedDefaultChar = FALSE;
5063 BOOL *pusedDefaultChar;
5064 int mbcssize;
5065 Py_ssize_t n;
5066 PyObject *exc = NULL;
5067 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005068
5069 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005070
Victor Stinner554f3f02010-06-16 23:33:54 +00005071 /* check and handle 'errors' arg */
5072 if (errors==NULL || strcmp(errors, "strict")==0) {
5073 flags = WC_NO_BEST_FIT_CHARS;
5074 pusedDefaultChar = &usedDefaultChar;
5075 } else if (strcmp(errors, "replace")==0) {
5076 flags = 0;
5077 pusedDefaultChar = NULL;
5078 } else {
5079 PyErr_Format(PyExc_ValueError,
5080 "mbcs encoding does not support errors='%s'",
5081 errors);
5082 return -1;
5083 }
5084
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005085 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005086 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005087 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5088 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 if (mbcssize == 0) {
5090 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5091 return -1;
5092 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005093 /* If we used a default char, then we failed! */
5094 if (pusedDefaultChar && *pusedDefaultChar)
5095 goto mbcs_encode_error;
5096 } else {
5097 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005098 }
5099
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005100 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 /* Create string object */
5102 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5103 if (*repr == NULL)
5104 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005105 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005106 }
5107 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 /* Extend string object */
5109 n = PyBytes_Size(*repr);
5110 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5111 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005112 }
5113
5114 /* Do the conversion */
5115 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005117 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5118 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5120 return -1;
5121 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005122 if (pusedDefaultChar && *pusedDefaultChar)
5123 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005124 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005125 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005126
5127mbcs_encode_error:
5128 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5129 Py_XDECREF(exc);
5130 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005131}
5132
5133PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 Py_ssize_t size,
5135 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005136{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005137 PyObject *repr = NULL;
5138 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005139
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005140#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005142 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005143 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005144 else
5145#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005146 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005147
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005148 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 Py_XDECREF(repr);
5150 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005151 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005152
5153#ifdef NEED_RETRY
5154 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 p += INT_MAX;
5156 size -= INT_MAX;
5157 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005158 }
5159#endif
5160
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005161 return repr;
5162}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005163
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005164PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5165{
5166 if (!PyUnicode_Check(unicode)) {
5167 PyErr_BadArgument();
5168 return NULL;
5169 }
5170 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 PyUnicode_GET_SIZE(unicode),
5172 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005173}
5174
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005175#undef NEED_RETRY
5176
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005177#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179/* --- Character Mapping Codec -------------------------------------------- */
5180
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 Py_ssize_t size,
5183 PyObject *mapping,
5184 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005186 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005187 Py_ssize_t startinpos;
5188 Py_ssize_t endinpos;
5189 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005190 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191 PyUnicodeObject *v;
5192 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005193 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005194 PyObject *errorHandler = NULL;
5195 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005196 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005197 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 /* Default to Latin-1 */
5200 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202
5203 v = _PyUnicode_New(size);
5204 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005210 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 mapstring = PyUnicode_AS_UNICODE(mapping);
5212 maplen = PyUnicode_GET_SIZE(mapping);
5213 while (s < e) {
5214 unsigned char ch = *s;
5215 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 if (ch < maplen)
5218 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 if (x == 0xfffe) {
5221 /* undefined mapping */
5222 outpos = p-PyUnicode_AS_UNICODE(v);
5223 startinpos = s-starts;
5224 endinpos = startinpos+1;
5225 if (unicode_decode_call_errorhandler(
5226 errors, &errorHandler,
5227 "charmap", "character maps to <undefined>",
5228 &starts, &e, &startinpos, &endinpos, &exc, &s,
5229 &v, &outpos, &p)) {
5230 goto onError;
5231 }
5232 continue;
5233 }
5234 *p++ = x;
5235 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005236 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005237 }
5238 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 while (s < e) {
5240 unsigned char ch = *s;
5241 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005242
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5244 w = PyLong_FromLong((long)ch);
5245 if (w == NULL)
5246 goto onError;
5247 x = PyObject_GetItem(mapping, w);
5248 Py_DECREF(w);
5249 if (x == NULL) {
5250 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5251 /* No mapping found means: mapping is undefined. */
5252 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005253 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 } else
5255 goto onError;
5256 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005257
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005259 if (x == Py_None)
5260 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 if (PyLong_Check(x)) {
5262 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005263 if (value == 0xFFFE)
5264 goto Undefined;
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005265 if (value < 0 || value > 0x10FFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 PyErr_SetString(PyExc_TypeError,
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005267 "character mapping must be in range(0x110000)");
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 Py_DECREF(x);
5269 goto onError;
5270 }
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005271
5272#ifndef Py_UNICODE_WIDE
5273 if (value > 0xFFFF) {
5274 /* see the code for 1-n mapping below */
5275 if (extrachars < 2) {
5276 /* resize first */
5277 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5278 Py_ssize_t needed = 10 - extrachars;
5279 extrachars += needed;
5280 /* XXX overflow detection missing */
5281 if (_PyUnicode_Resize(&v,
5282 PyUnicode_GET_SIZE(v) + needed) < 0) {
5283 Py_DECREF(x);
5284 goto onError;
5285 }
5286 p = PyUnicode_AS_UNICODE(v) + oldpos;
5287 }
5288 value -= 0x10000;
5289 *p++ = 0xD800 | (value >> 10);
5290 *p++ = 0xDC00 | (value & 0x3FF);
5291 extrachars -= 2;
5292 }
5293 else
5294#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 *p++ = (Py_UNICODE)value;
5296 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 else if (PyUnicode_Check(x)) {
5298 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005299
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005300 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 /* 1-1 mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005302 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
5303 if (value == 0xFFFE)
5304 goto Undefined;
5305 *p++ = value;
5306 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 else if (targetsize > 1) {
5308 /* 1-n mapping */
5309 if (targetsize > extrachars) {
5310 /* resize first */
5311 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5312 Py_ssize_t needed = (targetsize - extrachars) + \
5313 (targetsize << 2);
5314 extrachars += needed;
5315 /* XXX overflow detection missing */
5316 if (_PyUnicode_Resize(&v,
5317 PyUnicode_GET_SIZE(v) + needed) < 0) {
5318 Py_DECREF(x);
5319 goto onError;
5320 }
5321 p = PyUnicode_AS_UNICODE(v) + oldpos;
5322 }
5323 Py_UNICODE_COPY(p,
5324 PyUnicode_AS_UNICODE(x),
5325 targetsize);
5326 p += targetsize;
5327 extrachars -= targetsize;
5328 }
5329 /* 1-0 mapping: skip the character */
5330 }
5331 else {
5332 /* wrong return value */
5333 PyErr_SetString(PyExc_TypeError,
5334 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005335 Py_DECREF(x);
5336 goto onError;
5337 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 Py_DECREF(x);
5339 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005340 continue;
5341Undefined:
5342 /* undefined mapping */
5343 Py_XDECREF(x);
5344 outpos = p-PyUnicode_AS_UNICODE(v);
5345 startinpos = s-starts;
5346 endinpos = startinpos+1;
5347 if (unicode_decode_call_errorhandler(
5348 errors, &errorHandler,
5349 "charmap", "character maps to <undefined>",
5350 &starts, &e, &startinpos, &endinpos, &exc, &s,
5351 &v, &outpos, &p)) {
5352 goto onError;
5353 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005354 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 }
5356 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5358 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 Py_XDECREF(errorHandler);
5360 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005362
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 Py_XDECREF(errorHandler);
5365 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 Py_XDECREF(v);
5367 return NULL;
5368}
5369
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005370/* Charmap encoding: the lookup table */
5371
5372struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 PyObject_HEAD
5374 unsigned char level1[32];
5375 int count2, count3;
5376 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005377};
5378
5379static PyObject*
5380encoding_map_size(PyObject *obj, PyObject* args)
5381{
5382 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005383 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005385}
5386
5387static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005388 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 PyDoc_STR("Return the size (in bytes) of this object") },
5390 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005391};
5392
5393static void
5394encoding_map_dealloc(PyObject* o)
5395{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005396 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005397}
5398
5399static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005400 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 "EncodingMap", /*tp_name*/
5402 sizeof(struct encoding_map), /*tp_basicsize*/
5403 0, /*tp_itemsize*/
5404 /* methods */
5405 encoding_map_dealloc, /*tp_dealloc*/
5406 0, /*tp_print*/
5407 0, /*tp_getattr*/
5408 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005409 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 0, /*tp_repr*/
5411 0, /*tp_as_number*/
5412 0, /*tp_as_sequence*/
5413 0, /*tp_as_mapping*/
5414 0, /*tp_hash*/
5415 0, /*tp_call*/
5416 0, /*tp_str*/
5417 0, /*tp_getattro*/
5418 0, /*tp_setattro*/
5419 0, /*tp_as_buffer*/
5420 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5421 0, /*tp_doc*/
5422 0, /*tp_traverse*/
5423 0, /*tp_clear*/
5424 0, /*tp_richcompare*/
5425 0, /*tp_weaklistoffset*/
5426 0, /*tp_iter*/
5427 0, /*tp_iternext*/
5428 encoding_map_methods, /*tp_methods*/
5429 0, /*tp_members*/
5430 0, /*tp_getset*/
5431 0, /*tp_base*/
5432 0, /*tp_dict*/
5433 0, /*tp_descr_get*/
5434 0, /*tp_descr_set*/
5435 0, /*tp_dictoffset*/
5436 0, /*tp_init*/
5437 0, /*tp_alloc*/
5438 0, /*tp_new*/
5439 0, /*tp_free*/
5440 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005441};
5442
5443PyObject*
5444PyUnicode_BuildEncodingMap(PyObject* string)
5445{
5446 Py_UNICODE *decode;
5447 PyObject *result;
5448 struct encoding_map *mresult;
5449 int i;
5450 int need_dict = 0;
5451 unsigned char level1[32];
5452 unsigned char level2[512];
5453 unsigned char *mlevel1, *mlevel2, *mlevel3;
5454 int count2 = 0, count3 = 0;
5455
5456 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5457 PyErr_BadArgument();
5458 return NULL;
5459 }
5460 decode = PyUnicode_AS_UNICODE(string);
5461 memset(level1, 0xFF, sizeof level1);
5462 memset(level2, 0xFF, sizeof level2);
5463
5464 /* If there isn't a one-to-one mapping of NULL to \0,
5465 or if there are non-BMP characters, we need to use
5466 a mapping dictionary. */
5467 if (decode[0] != 0)
5468 need_dict = 1;
5469 for (i = 1; i < 256; i++) {
5470 int l1, l2;
5471 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005472#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005473 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005474#endif
5475 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005476 need_dict = 1;
5477 break;
5478 }
5479 if (decode[i] == 0xFFFE)
5480 /* unmapped character */
5481 continue;
5482 l1 = decode[i] >> 11;
5483 l2 = decode[i] >> 7;
5484 if (level1[l1] == 0xFF)
5485 level1[l1] = count2++;
5486 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005487 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005488 }
5489
5490 if (count2 >= 0xFF || count3 >= 0xFF)
5491 need_dict = 1;
5492
5493 if (need_dict) {
5494 PyObject *result = PyDict_New();
5495 PyObject *key, *value;
5496 if (!result)
5497 return NULL;
5498 for (i = 0; i < 256; i++) {
5499 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005500 key = PyLong_FromLong(decode[i]);
5501 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005502 if (!key || !value)
5503 goto failed1;
5504 if (PyDict_SetItem(result, key, value) == -1)
5505 goto failed1;
5506 Py_DECREF(key);
5507 Py_DECREF(value);
5508 }
5509 return result;
5510 failed1:
5511 Py_XDECREF(key);
5512 Py_XDECREF(value);
5513 Py_DECREF(result);
5514 return NULL;
5515 }
5516
5517 /* Create a three-level trie */
5518 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5519 16*count2 + 128*count3 - 1);
5520 if (!result)
5521 return PyErr_NoMemory();
5522 PyObject_Init(result, &EncodingMapType);
5523 mresult = (struct encoding_map*)result;
5524 mresult->count2 = count2;
5525 mresult->count3 = count3;
5526 mlevel1 = mresult->level1;
5527 mlevel2 = mresult->level23;
5528 mlevel3 = mresult->level23 + 16*count2;
5529 memcpy(mlevel1, level1, 32);
5530 memset(mlevel2, 0xFF, 16*count2);
5531 memset(mlevel3, 0, 128*count3);
5532 count3 = 0;
5533 for (i = 1; i < 256; i++) {
5534 int o1, o2, o3, i2, i3;
5535 if (decode[i] == 0xFFFE)
5536 /* unmapped character */
5537 continue;
5538 o1 = decode[i]>>11;
5539 o2 = (decode[i]>>7) & 0xF;
5540 i2 = 16*mlevel1[o1] + o2;
5541 if (mlevel2[i2] == 0xFF)
5542 mlevel2[i2] = count3++;
5543 o3 = decode[i] & 0x7F;
5544 i3 = 128*mlevel2[i2] + o3;
5545 mlevel3[i3] = i;
5546 }
5547 return result;
5548}
5549
5550static int
5551encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5552{
5553 struct encoding_map *map = (struct encoding_map*)mapping;
5554 int l1 = c>>11;
5555 int l2 = (c>>7) & 0xF;
5556 int l3 = c & 0x7F;
5557 int i;
5558
5559#ifdef Py_UNICODE_WIDE
5560 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005562 }
5563#endif
5564 if (c == 0)
5565 return 0;
5566 /* level 1*/
5567 i = map->level1[l1];
5568 if (i == 0xFF) {
5569 return -1;
5570 }
5571 /* level 2*/
5572 i = map->level23[16*i+l2];
5573 if (i == 0xFF) {
5574 return -1;
5575 }
5576 /* level 3 */
5577 i = map->level23[16*map->count2 + 128*i + l3];
5578 if (i == 0) {
5579 return -1;
5580 }
5581 return i;
5582}
5583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005584/* Lookup the character ch in the mapping. If the character
5585 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005586 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588{
Christian Heimes217cfd12007-12-02 14:31:20 +00005589 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590 PyObject *x;
5591
5592 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594 x = PyObject_GetItem(mapping, w);
5595 Py_DECREF(w);
5596 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5598 /* No mapping found means: mapping is undefined. */
5599 PyErr_Clear();
5600 x = Py_None;
5601 Py_INCREF(x);
5602 return x;
5603 } else
5604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005606 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005608 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 long value = PyLong_AS_LONG(x);
5610 if (value < 0 || value > 255) {
5611 PyErr_SetString(PyExc_TypeError,
5612 "character mapping must be in range(256)");
5613 Py_DECREF(x);
5614 return NULL;
5615 }
5616 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005618 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 /* wrong return value */
5622 PyErr_Format(PyExc_TypeError,
5623 "character mapping must return integer, bytes or None, not %.400s",
5624 x->ob_type->tp_name);
5625 Py_DECREF(x);
5626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 }
5628}
5629
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005630static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005631charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005632{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005633 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5634 /* exponentially overallocate to minimize reallocations */
5635 if (requiredsize < 2*outsize)
5636 requiredsize = 2*outsize;
5637 if (_PyBytes_Resize(outobj, requiredsize))
5638 return -1;
5639 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005640}
5641
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for this character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005646 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 space is available. Return a new reference to the object that
5648 was put in the output buffer, or Py_None, if the mapping was undefined
5649 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005650 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005652charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005655 PyObject *rep;
5656 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005657 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658
Christian Heimes90aa7642007-12-19 02:45:37 +00005659 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005660 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005662 if (res == -1)
5663 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 if (outsize<requiredsize)
5665 if (charmapencode_resize(outobj, outpos, requiredsize))
5666 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005667 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 outstart[(*outpos)++] = (char)res;
5669 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005670 }
5671
5672 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005675 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 Py_DECREF(rep);
5677 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005678 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 if (PyLong_Check(rep)) {
5680 Py_ssize_t requiredsize = *outpos+1;
5681 if (outsize<requiredsize)
5682 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5683 Py_DECREF(rep);
5684 return enc_EXCEPTION;
5685 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005686 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005688 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 else {
5690 const char *repchars = PyBytes_AS_STRING(rep);
5691 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5692 Py_ssize_t requiredsize = *outpos+repsize;
5693 if (outsize<requiredsize)
5694 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5695 Py_DECREF(rep);
5696 return enc_EXCEPTION;
5697 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005698 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 memcpy(outstart + *outpos, repchars, repsize);
5700 *outpos += repsize;
5701 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005703 Py_DECREF(rep);
5704 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705}
5706
5707/* handle an error in PyUnicode_EncodeCharmap
5708 Return 0 on success, -1 on error */
5709static
5710int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005711 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005713 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005714 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715{
5716 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005717 Py_ssize_t repsize;
5718 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719 Py_UNICODE *uni2;
5720 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005721 Py_ssize_t collstartpos = *inpos;
5722 Py_ssize_t collendpos = *inpos+1;
5723 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 char *encoding = "charmap";
5725 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005726 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728 /* find all unencodable characters */
5729 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005730 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005731 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005732 int res = encoding_map_lookup(p[collendpos], mapping);
5733 if (res != -1)
5734 break;
5735 ++collendpos;
5736 continue;
5737 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 rep = charmapencode_lookup(p[collendpos], mapping);
5740 if (rep==NULL)
5741 return -1;
5742 else if (rep!=Py_None) {
5743 Py_DECREF(rep);
5744 break;
5745 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005746 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748 }
5749 /* cache callback name lookup
5750 * (if not done yet, i.e. it's the first error) */
5751 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 if ((errors==NULL) || (!strcmp(errors, "strict")))
5753 *known_errorHandler = 1;
5754 else if (!strcmp(errors, "replace"))
5755 *known_errorHandler = 2;
5756 else if (!strcmp(errors, "ignore"))
5757 *known_errorHandler = 3;
5758 else if (!strcmp(errors, "xmlcharrefreplace"))
5759 *known_errorHandler = 4;
5760 else
5761 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 }
5763 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005764 case 1: /* strict */
5765 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5766 return -1;
5767 case 2: /* replace */
5768 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 x = charmapencode_output('?', mapping, res, respos);
5770 if (x==enc_EXCEPTION) {
5771 return -1;
5772 }
5773 else if (x==enc_FAILED) {
5774 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5775 return -1;
5776 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005777 }
5778 /* fall through */
5779 case 3: /* ignore */
5780 *inpos = collendpos;
5781 break;
5782 case 4: /* xmlcharrefreplace */
5783 /* generate replacement (temporarily (mis)uses p) */
5784 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 char buffer[2+29+1+1];
5786 char *cp;
5787 sprintf(buffer, "&#%d;", (int)p[collpos]);
5788 for (cp = buffer; *cp; ++cp) {
5789 x = charmapencode_output(*cp, mapping, res, respos);
5790 if (x==enc_EXCEPTION)
5791 return -1;
5792 else if (x==enc_FAILED) {
5793 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5794 return -1;
5795 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005796 }
5797 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005798 *inpos = collendpos;
5799 break;
5800 default:
5801 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 encoding, reason, p, size, exceptionObject,
5803 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005804 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005806 if (PyBytes_Check(repunicode)) {
5807 /* Directly copy bytes result to output. */
5808 Py_ssize_t outsize = PyBytes_Size(*res);
5809 Py_ssize_t requiredsize;
5810 repsize = PyBytes_Size(repunicode);
5811 requiredsize = *respos + repsize;
5812 if (requiredsize > outsize)
5813 /* Make room for all additional bytes. */
5814 if (charmapencode_resize(res, respos, requiredsize)) {
5815 Py_DECREF(repunicode);
5816 return -1;
5817 }
5818 memcpy(PyBytes_AsString(*res) + *respos,
5819 PyBytes_AsString(repunicode), repsize);
5820 *respos += repsize;
5821 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005822 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005823 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005824 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005825 /* generate replacement */
5826 repsize = PyUnicode_GET_SIZE(repunicode);
5827 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 x = charmapencode_output(*uni2, mapping, res, respos);
5829 if (x==enc_EXCEPTION) {
5830 return -1;
5831 }
5832 else if (x==enc_FAILED) {
5833 Py_DECREF(repunicode);
5834 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5835 return -1;
5836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005837 }
5838 *inpos = newpos;
5839 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840 }
5841 return 0;
5842}
5843
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 Py_ssize_t size,
5846 PyObject *mapping,
5847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 /* output object */
5850 PyObject *res = NULL;
5851 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005852 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005854 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 PyObject *errorHandler = NULL;
5856 PyObject *exc = NULL;
5857 /* the following variable is used for caching string comparisons
5858 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5859 * 3=ignore, 4=xmlcharrefreplace */
5860 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861
5862 /* Default to Latin-1 */
5863 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 /* allocate enough for a simple encoding without
5867 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005868 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 if (res == NULL)
5870 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005871 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 /* try to encode it */
5876 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5877 if (x==enc_EXCEPTION) /* error */
5878 goto onError;
5879 if (x==enc_FAILED) { /* unencodable character */
5880 if (charmap_encoding_error(p, size, &inpos, mapping,
5881 &exc,
5882 &known_errorHandler, &errorHandler, errors,
5883 &res, &respos)) {
5884 goto onError;
5885 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005886 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 else
5888 /* done with this character => adjust input position */
5889 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005892 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005893 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005894 if (_PyBytes_Resize(&res, respos) < 0)
5895 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005896
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 Py_XDECREF(exc);
5898 Py_XDECREF(errorHandler);
5899 return res;
5900
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 Py_XDECREF(res);
5903 Py_XDECREF(exc);
5904 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 return NULL;
5906}
5907
5908PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910{
5911 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 PyErr_BadArgument();
5913 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 }
5915 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 PyUnicode_GET_SIZE(unicode),
5917 mapping,
5918 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919}
5920
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005921/* create or adjust a UnicodeTranslateError */
5922static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 const Py_UNICODE *unicode, Py_ssize_t size,
5924 Py_ssize_t startpos, Py_ssize_t endpos,
5925 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005928 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 }
5931 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5933 goto onError;
5934 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5935 goto onError;
5936 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5937 goto onError;
5938 return;
5939 onError:
5940 Py_DECREF(*exceptionObject);
5941 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 }
5943}
5944
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005945/* raises a UnicodeTranslateError */
5946static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 const Py_UNICODE *unicode, Py_ssize_t size,
5948 Py_ssize_t startpos, Py_ssize_t endpos,
5949 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005950{
5951 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005953 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005955}
5956
5957/* error handling callback helper:
5958 build arguments, call the callback and check the arguments,
5959 put the result into newpos and return the replacement string, which
5960 has to be freed by the caller */
5961static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 PyObject **errorHandler,
5963 const char *reason,
5964 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5965 Py_ssize_t startpos, Py_ssize_t endpos,
5966 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005968 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005969
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005970 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005971 PyObject *restuple;
5972 PyObject *resunicode;
5973
5974 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005978 }
5979
5980 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005982 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984
5985 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005990 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 Py_DECREF(restuple);
5992 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005993 }
5994 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 &resunicode, &i_newpos)) {
5996 Py_DECREF(restuple);
5997 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005999 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006001 else
6002 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006003 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6005 Py_DECREF(restuple);
6006 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006007 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 Py_INCREF(resunicode);
6009 Py_DECREF(restuple);
6010 return resunicode;
6011}
6012
6013/* Lookup the character ch in the mapping and put the result in result,
6014 which must be decrefed by the caller.
6015 Return 0 on success, -1 on error */
6016static
6017int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
6018{
Christian Heimes217cfd12007-12-02 14:31:20 +00006019 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 PyObject *x;
6021
6022 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024 x = PyObject_GetItem(mapping, w);
6025 Py_DECREF(w);
6026 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6028 /* No mapping found means: use 1:1 mapping. */
6029 PyErr_Clear();
6030 *result = NULL;
6031 return 0;
6032 } else
6033 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006034 }
6035 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 *result = x;
6037 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006039 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 long value = PyLong_AS_LONG(x);
6041 long max = PyUnicode_GetMax();
6042 if (value < 0 || value > max) {
6043 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006044 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 Py_DECREF(x);
6046 return -1;
6047 }
6048 *result = x;
6049 return 0;
6050 }
6051 else if (PyUnicode_Check(x)) {
6052 *result = x;
6053 return 0;
6054 }
6055 else {
6056 /* wrong return value */
6057 PyErr_SetString(PyExc_TypeError,
6058 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006059 Py_DECREF(x);
6060 return -1;
6061 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006062}
6063/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 if not reallocate and adjust various state variables.
6065 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066static
Walter Dörwald4894c302003-10-24 14:25:28 +00006067int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006070 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006071 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 /* remember old output position */
6073 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6074 /* exponentially overallocate to minimize reallocations */
6075 if (requiredsize < 2 * oldsize)
6076 requiredsize = 2 * oldsize;
6077 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6078 return -1;
6079 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 }
6081 return 0;
6082}
6083/* lookup the character, put the result in the output string and adjust
6084 various state variables. Return a new reference to the object that
6085 was put in the output buffer in *result, or Py_None, if the mapping was
6086 undefined (in which case no character was written).
6087 The called must decref result.
6088 Return 0 on success, -1 on error. */
6089static
Walter Dörwald4894c302003-10-24 14:25:28 +00006090int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6092 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093{
Walter Dörwald4894c302003-10-24 14:25:28 +00006094 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006096 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 /* not found => default to 1:1 mapping */
6098 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099 }
6100 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006102 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 /* no overflow check, because we know that the space is enough */
6104 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006105 }
6106 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6108 if (repsize==1) {
6109 /* no overflow check, because we know that the space is enough */
6110 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6111 }
6112 else if (repsize!=0) {
6113 /* more than one character */
6114 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6115 (insize - (curinp-startinp)) +
6116 repsize - 1;
6117 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6118 return -1;
6119 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6120 *outp += repsize;
6121 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 }
6123 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006125 return 0;
6126}
6127
6128PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 Py_ssize_t size,
6130 PyObject *mapping,
6131 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133 /* output object */
6134 PyObject *res = NULL;
6135 /* pointers to the beginning and end+1 of input */
6136 const Py_UNICODE *startp = p;
6137 const Py_UNICODE *endp = p + size;
6138 /* pointer into the output */
6139 Py_UNICODE *str;
6140 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006141 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006142 char *reason = "character maps to <undefined>";
6143 PyObject *errorHandler = NULL;
6144 PyObject *exc = NULL;
6145 /* the following variable is used for caching string comparisons
6146 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6147 * 3=ignore, 4=xmlcharrefreplace */
6148 int known_errorHandler = -1;
6149
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 PyErr_BadArgument();
6152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154
6155 /* allocate enough for a simple 1:1 translation without
6156 replacements, if we need more, we'll resize */
6157 res = PyUnicode_FromUnicode(NULL, size);
6158 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006162 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 /* try to encode it */
6166 PyObject *x = NULL;
6167 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6168 Py_XDECREF(x);
6169 goto onError;
6170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006171 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 if (x!=Py_None) /* it worked => adjust input pointer */
6173 ++p;
6174 else { /* untranslatable character */
6175 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6176 Py_ssize_t repsize;
6177 Py_ssize_t newpos;
6178 Py_UNICODE *uni2;
6179 /* startpos for collecting untranslatable chars */
6180 const Py_UNICODE *collstart = p;
6181 const Py_UNICODE *collend = p+1;
6182 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 /* find all untranslatable characters */
6185 while (collend < endp) {
6186 if (charmaptranslate_lookup(*collend, mapping, &x))
6187 goto onError;
6188 Py_XDECREF(x);
6189 if (x!=Py_None)
6190 break;
6191 ++collend;
6192 }
6193 /* cache callback name lookup
6194 * (if not done yet, i.e. it's the first error) */
6195 if (known_errorHandler==-1) {
6196 if ((errors==NULL) || (!strcmp(errors, "strict")))
6197 known_errorHandler = 1;
6198 else if (!strcmp(errors, "replace"))
6199 known_errorHandler = 2;
6200 else if (!strcmp(errors, "ignore"))
6201 known_errorHandler = 3;
6202 else if (!strcmp(errors, "xmlcharrefreplace"))
6203 known_errorHandler = 4;
6204 else
6205 known_errorHandler = 0;
6206 }
6207 switch (known_errorHandler) {
6208 case 1: /* strict */
6209 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006210 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 case 2: /* replace */
6212 /* No need to check for space, this is a 1:1 replacement */
6213 for (coll = collstart; coll<collend; ++coll)
6214 *str++ = '?';
6215 /* fall through */
6216 case 3: /* ignore */
6217 p = collend;
6218 break;
6219 case 4: /* xmlcharrefreplace */
6220 /* generate replacement (temporarily (mis)uses p) */
6221 for (p = collstart; p < collend; ++p) {
6222 char buffer[2+29+1+1];
6223 char *cp;
6224 sprintf(buffer, "&#%d;", (int)*p);
6225 if (charmaptranslate_makespace(&res, &str,
6226 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6227 goto onError;
6228 for (cp = buffer; *cp; ++cp)
6229 *str++ = *cp;
6230 }
6231 p = collend;
6232 break;
6233 default:
6234 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6235 reason, startp, size, &exc,
6236 collstart-startp, collend-startp, &newpos);
6237 if (repunicode == NULL)
6238 goto onError;
6239 /* generate replacement */
6240 repsize = PyUnicode_GET_SIZE(repunicode);
6241 if (charmaptranslate_makespace(&res, &str,
6242 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6243 Py_DECREF(repunicode);
6244 goto onError;
6245 }
6246 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6247 *str++ = *uni2;
6248 p = startp + newpos;
6249 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006250 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006251 }
6252 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006253 /* Resize if we allocated to much */
6254 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006255 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 if (PyUnicode_Resize(&res, respos) < 0)
6257 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258 }
6259 Py_XDECREF(exc);
6260 Py_XDECREF(errorHandler);
6261 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006264 Py_XDECREF(res);
6265 Py_XDECREF(exc);
6266 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 return NULL;
6268}
6269
6270PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 PyObject *mapping,
6272 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273{
6274 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006275
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 str = PyUnicode_FromObject(str);
6277 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 PyUnicode_GET_SIZE(str),
6281 mapping,
6282 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 Py_DECREF(str);
6284 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006285
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 Py_XDECREF(str);
6288 return NULL;
6289}
Tim Petersced69f82003-09-16 20:30:58 +00006290
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006291PyObject *
6292PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6293 Py_ssize_t length)
6294{
6295 PyObject *result;
6296 Py_UNICODE *p; /* write pointer into result */
6297 Py_ssize_t i;
6298 /* Copy to a new string */
6299 result = (PyObject *)_PyUnicode_New(length);
6300 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6301 if (result == NULL)
6302 return result;
6303 p = PyUnicode_AS_UNICODE(result);
6304 /* Iterate over code points */
6305 for (i = 0; i < length; i++) {
6306 Py_UNICODE ch =s[i];
6307 if (ch > 127) {
6308 int decimal = Py_UNICODE_TODECIMAL(ch);
6309 if (decimal >= 0)
6310 p[i] = '0' + decimal;
6311 }
6312 }
6313 return result;
6314}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006315/* --- Decimal Encoder ---------------------------------------------------- */
6316
6317int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 Py_ssize_t length,
6319 char *output,
6320 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006321{
6322 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006323 PyObject *errorHandler = NULL;
6324 PyObject *exc = NULL;
6325 const char *encoding = "decimal";
6326 const char *reason = "invalid decimal Unicode string";
6327 /* the following variable is used for caching string comparisons
6328 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6329 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006330
6331 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 PyErr_BadArgument();
6333 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006334 }
6335
6336 p = s;
6337 end = s + length;
6338 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 register Py_UNICODE ch = *p;
6340 int decimal;
6341 PyObject *repunicode;
6342 Py_ssize_t repsize;
6343 Py_ssize_t newpos;
6344 Py_UNICODE *uni2;
6345 Py_UNICODE *collstart;
6346 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006347
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006349 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 ++p;
6351 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006352 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 decimal = Py_UNICODE_TODECIMAL(ch);
6354 if (decimal >= 0) {
6355 *output++ = '0' + decimal;
6356 ++p;
6357 continue;
6358 }
6359 if (0 < ch && ch < 256) {
6360 *output++ = (char)ch;
6361 ++p;
6362 continue;
6363 }
6364 /* All other characters are considered unencodable */
6365 collstart = p;
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006366 for (collend = p+1; collend < end; collend++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 if ((0 < *collend && *collend < 256) ||
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006368 Py_UNICODE_ISSPACE(*collend) ||
6369 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 break;
6371 }
6372 /* cache callback name lookup
6373 * (if not done yet, i.e. it's the first error) */
6374 if (known_errorHandler==-1) {
6375 if ((errors==NULL) || (!strcmp(errors, "strict")))
6376 known_errorHandler = 1;
6377 else if (!strcmp(errors, "replace"))
6378 known_errorHandler = 2;
6379 else if (!strcmp(errors, "ignore"))
6380 known_errorHandler = 3;
6381 else if (!strcmp(errors, "xmlcharrefreplace"))
6382 known_errorHandler = 4;
6383 else
6384 known_errorHandler = 0;
6385 }
6386 switch (known_errorHandler) {
6387 case 1: /* strict */
6388 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6389 goto onError;
6390 case 2: /* replace */
6391 for (p = collstart; p < collend; ++p)
6392 *output++ = '?';
6393 /* fall through */
6394 case 3: /* ignore */
6395 p = collend;
6396 break;
6397 case 4: /* xmlcharrefreplace */
6398 /* generate replacement (temporarily (mis)uses p) */
6399 for (p = collstart; p < collend; ++p)
6400 output += sprintf(output, "&#%d;", (int)*p);
6401 p = collend;
6402 break;
6403 default:
6404 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6405 encoding, reason, s, length, &exc,
6406 collstart-s, collend-s, &newpos);
6407 if (repunicode == NULL)
6408 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006409 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006410 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006411 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6412 Py_DECREF(repunicode);
6413 goto onError;
6414 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 /* generate replacement */
6416 repsize = PyUnicode_GET_SIZE(repunicode);
6417 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6418 Py_UNICODE ch = *uni2;
6419 if (Py_UNICODE_ISSPACE(ch))
6420 *output++ = ' ';
6421 else {
6422 decimal = Py_UNICODE_TODECIMAL(ch);
6423 if (decimal >= 0)
6424 *output++ = '0' + decimal;
6425 else if (0 < ch && ch < 256)
6426 *output++ = (char)ch;
6427 else {
6428 Py_DECREF(repunicode);
6429 raise_encode_exception(&exc, encoding,
6430 s, length, collstart-s, collend-s, reason);
6431 goto onError;
6432 }
6433 }
6434 }
6435 p = s + newpos;
6436 Py_DECREF(repunicode);
6437 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006438 }
6439 /* 0-terminate the output string */
6440 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 Py_XDECREF(exc);
6442 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006443 return 0;
6444
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 Py_XDECREF(exc);
6447 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006448 return -1;
6449}
6450
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451/* --- Helpers ------------------------------------------------------------ */
6452
Eric Smith8c663262007-08-25 02:26:07 +00006453#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006454#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006455
Thomas Wouters477c8d52006-05-27 19:21:47 +00006456#include "stringlib/count.h"
6457#include "stringlib/find.h"
6458#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006459#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006460
Eric Smith5807c412008-05-11 21:00:57 +00006461#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006462#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006463#include "stringlib/localeutil.h"
6464
/* Helper macro to fix up start/end slice values against a length.

   `end` is clamped to `len` when larger; a negative `end` has `len`
   added and is then clamped to 0.  A negative `start` has `len` added
   and is then clamped to 0.  `start` and `end` must be modifiable
   lvalues; every argument is evaluated more than once.

   The expansion is two consecutive statements (not wrapped in
   do { } while (0)), so it must not be used as the unbraced body of an
   if/else or loop.  All parameters are parenthesized so that expression
   arguments (for example a conditional expression for `len`) are
   evaluated as a unit. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006479
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * to by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to the next char, so it might have
 * side-effects (especially if used with other macros).
 *
 * NOTE(review): the narrow-build form tests (ptr) < (end) and then reads
 * (ptr)[1]; whether 'end' is meant to be the last element or one past it
 * cannot be determined from this macro alone -- confirm against callers.
 */

/* helper macros used by _Py_UNICODE_NEXT; the argument is parenthesized so
   that expression arguments are evaluated as a unit */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
6510
Martin v. Löwis18e16552006-02-15 17:27:45 +00006511Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512 PyObject *substr,
6513 Py_ssize_t start,
6514 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006516 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006517 PyUnicodeObject* str_obj;
6518 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006519
Thomas Wouters477c8d52006-05-27 19:21:47 +00006520 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6521 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006523 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6524 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 Py_DECREF(str_obj);
6526 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 }
Tim Petersced69f82003-09-16 20:30:58 +00006528
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006529 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006531 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6532 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006533 );
6534
6535 Py_DECREF(sub_obj);
6536 Py_DECREF(str_obj);
6537
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 return result;
6539}
6540
Martin v. Löwis18e16552006-02-15 17:27:45 +00006541Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006542 PyObject *sub,
6543 Py_ssize_t start,
6544 Py_ssize_t end,
6545 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006547 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006548
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006550 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006552 sub = PyUnicode_FromObject(sub);
6553 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 Py_DECREF(str);
6555 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 }
Tim Petersced69f82003-09-16 20:30:58 +00006557
Thomas Wouters477c8d52006-05-27 19:21:47 +00006558 if (direction > 0)
6559 result = stringlib_find_slice(
6560 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6561 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6562 start, end
6563 );
6564 else
6565 result = stringlib_rfind_slice(
6566 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6567 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6568 start, end
6569 );
6570
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006572 Py_DECREF(sub);
6573
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 return result;
6575}
6576
Tim Petersced69f82003-09-16 20:30:58 +00006577static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 PyUnicodeObject *substring,
6580 Py_ssize_t start,
6581 Py_ssize_t end,
6582 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 if (substring->length == 0)
6585 return 1;
6586
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006587 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 end -= substring->length;
6589 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591
6592 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 if (Py_UNICODE_MATCH(self, end, substring))
6594 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 } else {
6596 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 }
6599
6600 return 0;
6601}
6602
Martin v. Löwis18e16552006-02-15 17:27:45 +00006603Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 PyObject *substr,
6605 Py_ssize_t start,
6606 Py_ssize_t end,
6607 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006609 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006610
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 str = PyUnicode_FromObject(str);
6612 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 substr = PyUnicode_FromObject(substr);
6615 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 Py_DECREF(str);
6617 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 }
Tim Petersced69f82003-09-16 20:30:58 +00006619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 (PyUnicodeObject *)substr,
6622 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 Py_DECREF(str);
6624 Py_DECREF(substr);
6625 return result;
6626}
6627
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628/* Apply fixfct filter to the Unicode object self and return a
6629 reference to the modified object */
6630
Tim Petersced69f82003-09-16 20:30:58 +00006631static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634{
6635
6636 PyUnicodeObject *u;
6637
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006638 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006641
6642 Py_UNICODE_COPY(u->str, self->str, self->length);
6643
Tim Peters7a29bd52001-09-12 03:03:31 +00006644 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 /* fixfct should return TRUE if it modified the buffer. If
6646 FALSE, return a reference to the original buffer instead
6647 (to save space, not time) */
6648 Py_INCREF(self);
6649 Py_DECREF(u);
6650 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 }
6652 return (PyObject*) u;
6653}
6654
Tim Petersced69f82003-09-16 20:30:58 +00006655static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656int fixupper(PyUnicodeObject *self)
6657{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006658 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 Py_UNICODE *s = self->str;
6660 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006661
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006664
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 ch = Py_UNICODE_TOUPPER(*s);
6666 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 *s = ch;
6669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 s++;
6671 }
6672
6673 return status;
6674}
6675
Tim Petersced69f82003-09-16 20:30:58 +00006676static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677int fixlower(PyUnicodeObject *self)
6678{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006679 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 Py_UNICODE *s = self->str;
6681 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006682
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006685
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 ch = Py_UNICODE_TOLOWER(*s);
6687 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 *s = ch;
6690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 s++;
6692 }
6693
6694 return status;
6695}
6696
Tim Petersced69f82003-09-16 20:30:58 +00006697static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698int fixswapcase(PyUnicodeObject *self)
6699{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006700 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 Py_UNICODE *s = self->str;
6702 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 while (len-- > 0) {
6705 if (Py_UNICODE_ISUPPER(*s)) {
6706 *s = Py_UNICODE_TOLOWER(*s);
6707 status = 1;
6708 } else if (Py_UNICODE_ISLOWER(*s)) {
6709 *s = Py_UNICODE_TOUPPER(*s);
6710 status = 1;
6711 }
6712 s++;
6713 }
6714
6715 return status;
6716}
6717
Tim Petersced69f82003-09-16 20:30:58 +00006718static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719int fixcapitalize(PyUnicodeObject *self)
6720{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006721 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006722 Py_UNICODE *s = self->str;
6723 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006724
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006725 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 return 0;
Ezio Melottiee8d9982011-08-15 09:09:57 +03006727 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 *s = Py_UNICODE_TOUPPER(*s);
6729 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006731 s++;
6732 while (--len > 0) {
Ezio Melottiee8d9982011-08-15 09:09:57 +03006733 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006734 *s = Py_UNICODE_TOLOWER(*s);
6735 status = 1;
6736 }
6737 s++;
6738 }
6739 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740}
6741
6742static
6743int fixtitle(PyUnicodeObject *self)
6744{
6745 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6746 register Py_UNICODE *e;
6747 int previous_is_cased;
6748
6749 /* Shortcut for single character strings */
6750 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6752 if (*p != ch) {
6753 *p = ch;
6754 return 1;
6755 }
6756 else
6757 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 }
Tim Petersced69f82003-09-16 20:30:58 +00006759
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 e = p + PyUnicode_GET_SIZE(self);
6761 previous_is_cased = 0;
6762 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006764
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 if (previous_is_cased)
6766 *p = Py_UNICODE_TOLOWER(ch);
6767 else
6768 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006769
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 if (Py_UNICODE_ISLOWER(ch) ||
6771 Py_UNICODE_ISUPPER(ch) ||
6772 Py_UNICODE_ISTITLE(ch))
6773 previous_is_cased = 1;
6774 else
6775 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 }
6777 return 1;
6778}
6779
Tim Peters8ce9f162004-08-27 01:49:32 +00006780PyObject *
6781PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782{
Skip Montanaro6543b452004-09-16 03:28:13 +00006783 const Py_UNICODE blank = ' ';
6784 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006785 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006786 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006787 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6788 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006789 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6790 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006791 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006792 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793
Tim Peters05eba1f2004-08-27 21:32:02 +00006794 fseq = PySequence_Fast(seq, "");
6795 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006796 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006797 }
6798
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006799 /* NOTE: the following code can't call back into Python code,
6800 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006801 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006802
Tim Peters05eba1f2004-08-27 21:32:02 +00006803 seqlen = PySequence_Fast_GET_SIZE(fseq);
6804 /* If empty sequence, return u"". */
6805 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006806 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6807 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006808 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006809 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006810 /* If singleton sequence with an exact Unicode, return that. */
6811 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 item = items[0];
6813 if (PyUnicode_CheckExact(item)) {
6814 Py_INCREF(item);
6815 res = (PyUnicodeObject *)item;
6816 goto Done;
6817 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006818 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006819 else {
6820 /* Set up sep and seplen */
6821 if (separator == NULL) {
6822 sep = &blank;
6823 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006824 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006825 else {
6826 if (!PyUnicode_Check(separator)) {
6827 PyErr_Format(PyExc_TypeError,
6828 "separator: expected str instance,"
6829 " %.80s found",
6830 Py_TYPE(separator)->tp_name);
6831 goto onError;
6832 }
6833 sep = PyUnicode_AS_UNICODE(separator);
6834 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006835 }
6836 }
6837
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006838 /* There are at least two things to join, or else we have a subclass
6839 * of str in the sequence.
6840 * Do a pre-pass to figure out the total amount of space we'll
6841 * need (sz), and see whether all argument are strings.
6842 */
6843 sz = 0;
6844 for (i = 0; i < seqlen; i++) {
6845 const Py_ssize_t old_sz = sz;
6846 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 if (!PyUnicode_Check(item)) {
6848 PyErr_Format(PyExc_TypeError,
6849 "sequence item %zd: expected str instance,"
6850 " %.80s found",
6851 i, Py_TYPE(item)->tp_name);
6852 goto onError;
6853 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006854 sz += PyUnicode_GET_SIZE(item);
6855 if (i != 0)
6856 sz += seplen;
6857 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6858 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006860 goto onError;
6861 }
6862 }
Tim Petersced69f82003-09-16 20:30:58 +00006863
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006864 res = _PyUnicode_New(sz);
6865 if (res == NULL)
6866 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006867
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006868 /* Catenate everything. */
6869 res_p = PyUnicode_AS_UNICODE(res);
6870 for (i = 0; i < seqlen; ++i) {
6871 Py_ssize_t itemlen;
6872 item = items[i];
6873 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 /* Copy item, and maybe the separator. */
6875 if (i) {
6876 Py_UNICODE_COPY(res_p, sep, seplen);
6877 res_p += seplen;
6878 }
6879 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6880 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006881 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006882
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006884 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 return (PyObject *)res;
6886
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006888 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006889 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 return NULL;
6891}
6892
Tim Petersced69f82003-09-16 20:30:58 +00006893static
6894PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 Py_ssize_t left,
6896 Py_ssize_t right,
6897 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898{
6899 PyUnicodeObject *u;
6900
6901 if (left < 0)
6902 left = 0;
6903 if (right < 0)
6904 right = 0;
6905
Tim Peters7a29bd52001-09-12 03:03:31 +00006906 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 Py_INCREF(self);
6908 return self;
6909 }
6910
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006911 if (left > PY_SSIZE_T_MAX - self->length ||
6912 right > PY_SSIZE_T_MAX - (left + self->length)) {
6913 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6914 return NULL;
6915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 u = _PyUnicode_New(left + self->length + right);
6917 if (u) {
6918 if (left)
6919 Py_UNICODE_FILL(u->str, fill, left);
6920 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6921 if (right)
6922 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6923 }
6924
6925 return u;
6926}
6927
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006928PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
6932 string = PyUnicode_FromObject(string);
6933 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006936 list = stringlib_splitlines(
6937 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6938 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939
6940 Py_DECREF(string);
6941 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942}
6943
Tim Petersced69f82003-09-16 20:30:58 +00006944static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006946 PyUnicodeObject *substring,
6947 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006950 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006953 return stringlib_split_whitespace(
6954 (PyObject*) self, self->str, self->length, maxcount
6955 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006957 return stringlib_split(
6958 (PyObject*) self, self->str, self->length,
6959 substring->str, substring->length,
6960 maxcount
6961 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962}
6963
Tim Petersced69f82003-09-16 20:30:58 +00006964static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006965PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 PyUnicodeObject *substring,
6967 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006968{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006969 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006970 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006971
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006972 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006973 return stringlib_rsplit_whitespace(
6974 (PyObject*) self, self->str, self->length, maxcount
6975 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006976
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006977 return stringlib_rsplit(
6978 (PyObject*) self, self->str, self->length,
6979 substring->str, substring->length,
6980 maxcount
6981 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006982}
6983
6984static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 PyUnicodeObject *str1,
6987 PyUnicodeObject *str2,
6988 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989{
6990 PyUnicodeObject *u;
6991
6992 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006994 else if (maxcount == 0 || self->length == 0)
6995 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
Thomas Wouters477c8d52006-05-27 19:21:47 +00006997 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006998 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006999 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007000 if (str1->length == 0)
7001 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007002 if (str1->length == 1) {
7003 /* replace characters */
7004 Py_UNICODE u1, u2;
7005 if (!findchar(self->str, self->length, str1->str[0]))
7006 goto nothing;
7007 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7008 if (!u)
7009 return NULL;
7010 Py_UNICODE_COPY(u->str, self->str, self->length);
7011 u1 = str1->str[0];
7012 u2 = str2->str[0];
7013 for (i = 0; i < u->length; i++)
7014 if (u->str[i] == u1) {
7015 if (--maxcount < 0)
7016 break;
7017 u->str[i] = u2;
7018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007020 i = stringlib_find(
7021 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007023 if (i < 0)
7024 goto nothing;
7025 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7026 if (!u)
7027 return NULL;
7028 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007029
7030 /* change everything in-place, starting with this one */
7031 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7032 i += str1->length;
7033
7034 while ( --maxcount > 0) {
7035 i = stringlib_find(self->str+i, self->length-i,
7036 str1->str, str1->length,
7037 i);
7038 if (i == -1)
7039 break;
7040 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7041 i += str1->length;
7042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007045
Victor Stinnerab1d16b2011-11-22 01:45:37 +01007046 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007047 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 Py_UNICODE *p;
7049
7050 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007051 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7052 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007053 if (n == 0)
7054 goto nothing;
7055 /* new_size = self->length + n * (str2->length - str1->length)); */
7056 delta = (str2->length - str1->length);
7057 if (delta == 0) {
7058 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007060 product = n * (str2->length - str1->length);
7061 if ((product / (str2->length - str1->length)) != n) {
7062 PyErr_SetString(PyExc_OverflowError,
7063 "replace string is too long");
7064 return NULL;
7065 }
7066 new_size = self->length + product;
7067 if (new_size < 0) {
7068 PyErr_SetString(PyExc_OverflowError,
7069 "replace string is too long");
7070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 }
7072 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007073 u = _PyUnicode_New(new_size);
7074 if (!u)
7075 return NULL;
7076 i = 0;
7077 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007078 if (str1->length > 0) {
7079 while (n-- > 0) {
7080 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007081 j = stringlib_find(self->str+i, self->length-i,
7082 str1->str, str1->length,
7083 i);
7084 if (j == -1)
7085 break;
7086 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007087 /* copy unchanged part [i:j] */
7088 Py_UNICODE_COPY(p, self->str+i, j-i);
7089 p += j - i;
7090 }
7091 /* copy substitution string */
7092 if (str2->length > 0) {
7093 Py_UNICODE_COPY(p, str2->str, str2->length);
7094 p += str2->length;
7095 }
7096 i = j + str1->length;
7097 }
7098 if (i < self->length)
7099 /* copy tail [i:] */
7100 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7101 } else {
7102 /* interleave */
7103 while (n > 0) {
7104 Py_UNICODE_COPY(p, str2->str, str2->length);
7105 p += str2->length;
7106 if (--n <= 0)
7107 break;
7108 *p++ = self->str[i++];
7109 }
7110 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7111 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007114
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007116 /* nothing to replace; return original string (when possible) */
7117 if (PyUnicode_CheckExact(self)) {
7118 Py_INCREF(self);
7119 return (PyObject *) self;
7120 }
7121 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122}
7123
7124/* --- Unicode Object Methods --------------------------------------------- */
7125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007126PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128\n\
7129Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007130characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131
7132static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007133unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 return fixup(self, fixtitle);
7136}
7137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007138PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140\n\
7141Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007142have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143
7144static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007145unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 return fixup(self, fixcapitalize);
7148}
7149
#if 0
/* Disabled: this whole section is compiled out by the #if 0 above. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *original = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)original,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old entry, so drop it
           by hand first */
        Py_DECREF(original);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string (NULL separator) */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
7187
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007188/* Argument converter. Coerces to a single unicode character */
7189
7190static int
7191convert_uc(PyObject *obj, void *addr)
7192{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007193 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7194 PyObject *uniobj;
7195 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007196
Benjamin Peterson14339b62009-01-31 16:36:08 +00007197 uniobj = PyUnicode_FromObject(obj);
7198 if (uniobj == NULL) {
7199 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007201 return 0;
7202 }
7203 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7204 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007206 Py_DECREF(uniobj);
7207 return 0;
7208 }
7209 unistr = PyUnicode_AS_UNICODE(uniobj);
7210 *fillcharloc = unistr[0];
7211 Py_DECREF(uniobj);
7212 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007213}
7214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007215PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007218Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007219done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220
7221static PyObject *
7222unicode_center(PyUnicodeObject *self, PyObject *args)
7223{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007224 Py_ssize_t marg, left;
7225 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007226 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
Thomas Woutersde017742006-02-16 19:34:37 +00007228 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 return NULL;
7230
Tim Peters7a29bd52001-09-12 03:03:31 +00007231 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 Py_INCREF(self);
7233 return (PyObject*) self;
7234 }
7235
7236 marg = width - self->length;
7237 left = marg / 2 + (marg & width & 1);
7238
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007239 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240}
7241
Marc-André Lemburge5034372000-08-08 08:04:29 +00007242#if 0
7243
7244/* This code should go into some future Unicode collation support
7245 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007246 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007247
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007248/* speedy UTF-16 code point order comparison */
7249/* gleaned from: */
7250/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7251
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007252static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007253{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007254 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007255 0, 0, 0, 0, 0, 0, 0, 0,
7256 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007257 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007258};
7259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260static int
7261unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7262{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007263 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007264
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265 Py_UNICODE *s1 = str1->str;
7266 Py_UNICODE *s2 = str2->str;
7267
7268 len1 = str1->length;
7269 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007270
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007272 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007273
7274 c1 = *s1++;
7275 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007276
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 if (c1 > (1<<11) * 26)
7278 c1 += utf16Fixup[c1>>11];
7279 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007280 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007281 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007282
7283 if (c1 != c2)
7284 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007285
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007286 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287 }
7288
7289 return (len1 < len2) ? -1 : (len1 != len2);
7290}
7291
Marc-André Lemburge5034372000-08-08 08:04:29 +00007292#else
7293
7294static int
7295unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7296{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007297 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007298
7299 Py_UNICODE *s1 = str1->str;
7300 Py_UNICODE *s2 = str2->str;
7301
7302 len1 = str1->length;
7303 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007304
Marc-André Lemburge5034372000-08-08 08:04:29 +00007305 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007306 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007307
Fredrik Lundh45714e92001-06-26 16:39:36 +00007308 c1 = *s1++;
7309 c2 = *s2++;
7310
7311 if (c1 != c2)
7312 return (c1 < c2) ? -1 : 1;
7313
Marc-André Lemburge5034372000-08-08 08:04:29 +00007314 len1--; len2--;
7315 }
7316
7317 return (len1 < len2) ? -1 : (len1 != len2);
7318}
7319
7320#endif
7321
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007325 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7326 return unicode_compare((PyUnicodeObject *)left,
7327 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007328 PyErr_Format(PyExc_TypeError,
7329 "Can't compare %.100s and %.100s",
7330 left->ob_type->tp_name,
7331 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 return -1;
7333}
7334
Martin v. Löwis5b222132007-06-10 09:51:05 +00007335int
7336PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7337{
7338 int i;
7339 Py_UNICODE *id;
7340 assert(PyUnicode_Check(uni));
7341 id = PyUnicode_AS_UNICODE(uni);
7342 /* Compare Unicode string and source character set string */
7343 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 if (id[i] != str[i])
7345 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007346 /* This check keeps Python strings that end in '\0' from comparing equal
7347 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007348 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007350 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007352 return 0;
7353}
7354
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007355
Benjamin Peterson29060642009-01-31 22:14:21 +00007356#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007357 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007358
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007359PyObject *PyUnicode_RichCompare(PyObject *left,
7360 PyObject *right,
7361 int op)
7362{
7363 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007364
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007365 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7366 PyObject *v;
7367 if (((PyUnicodeObject *) left)->length !=
7368 ((PyUnicodeObject *) right)->length) {
7369 if (op == Py_EQ) {
7370 Py_INCREF(Py_False);
7371 return Py_False;
7372 }
7373 if (op == Py_NE) {
7374 Py_INCREF(Py_True);
7375 return Py_True;
7376 }
7377 }
7378 if (left == right)
7379 result = 0;
7380 else
7381 result = unicode_compare((PyUnicodeObject *)left,
7382 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007383
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007384 /* Convert the return value to a Boolean */
7385 switch (op) {
7386 case Py_EQ:
7387 v = TEST_COND(result == 0);
7388 break;
7389 case Py_NE:
7390 v = TEST_COND(result != 0);
7391 break;
7392 case Py_LE:
7393 v = TEST_COND(result <= 0);
7394 break;
7395 case Py_GE:
7396 v = TEST_COND(result >= 0);
7397 break;
7398 case Py_LT:
7399 v = TEST_COND(result == -1);
7400 break;
7401 case Py_GT:
7402 v = TEST_COND(result == 1);
7403 break;
7404 default:
7405 PyErr_BadArgument();
7406 return NULL;
7407 }
7408 Py_INCREF(v);
7409 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007410 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007411
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007412 Py_INCREF(Py_NotImplemented);
7413 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007414}
7415
Guido van Rossum403d68b2000-03-13 15:55:09 +00007416int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007418{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007419 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007420 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007421
7422 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007423 sub = PyUnicode_FromObject(element);
7424 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 PyErr_Format(PyExc_TypeError,
7426 "'in <string>' requires string as left operand, not %s",
7427 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007428 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007429 }
7430
Thomas Wouters477c8d52006-05-27 19:21:47 +00007431 str = PyUnicode_FromObject(container);
7432 if (!str) {
7433 Py_DECREF(sub);
7434 return -1;
7435 }
7436
7437 result = stringlib_contains_obj(str, sub);
7438
7439 Py_DECREF(str);
7440 Py_DECREF(sub);
7441
Guido van Rossum403d68b2000-03-13 15:55:09 +00007442 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007443}
7444
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445/* Concat to string or Unicode object giving a new Unicode object. */
7446
7447PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
7450 PyUnicodeObject *u = NULL, *v = NULL, *w;
7451
7452 /* Coerce the two arguments */
7453 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7454 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7457 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459
7460 /* Shortcuts */
7461 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 Py_DECREF(v);
7463 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 }
7465 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 Py_DECREF(u);
7467 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 }
7469
7470 /* Concat the two Unicode strings */
7471 w = _PyUnicode_New(u->length + v->length);
7472 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 Py_UNICODE_COPY(w->str, u->str, u->length);
7475 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7476
7477 Py_DECREF(u);
7478 Py_DECREF(v);
7479 return (PyObject *)w;
7480
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 Py_XDECREF(u);
7483 Py_XDECREF(v);
7484 return NULL;
7485}
7486
Walter Dörwald1ab83302007-05-18 17:15:44 +00007487void
7488PyUnicode_Append(PyObject **pleft, PyObject *right)
7489{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007490 PyObject *new;
7491 if (*pleft == NULL)
7492 return;
7493 if (right == NULL || !PyUnicode_Check(*pleft)) {
7494 Py_DECREF(*pleft);
7495 *pleft = NULL;
7496 return;
7497 }
7498 new = PyUnicode_Concat(*pleft, right);
7499 Py_DECREF(*pleft);
7500 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007501}
7502
7503void
7504PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7505{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007506 PyUnicode_Append(pleft, right);
7507 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007508}
7509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007510PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007513Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007514string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007515interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516
7517static PyObject *
7518unicode_count(PyUnicodeObject *self, PyObject *args)
7519{
7520 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007521 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007522 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523 PyObject *result;
7524
Jesus Ceaac451502011-04-20 17:09:23 +02007525 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7526 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007528
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007529 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007530 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007531 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007532 substring->str, substring->length,
7533 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007534 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535
7536 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007537
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 return result;
7539}
7540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007541PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007542 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007544Encode S using the codec registered for encoding. Default encoding\n\
7545is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007546handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007547a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7548'xmlcharrefreplace' as well as any other name registered with\n\
7549codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550
7551static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007552unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007554 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 char *encoding = NULL;
7556 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007557
Benjamin Peterson308d6372009-09-18 21:42:35 +00007558 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7559 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007561 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007562}
7563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007564PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566\n\
7567Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007568If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569
7570static PyObject*
7571unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7572{
7573 Py_UNICODE *e;
7574 Py_UNICODE *p;
7575 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007576 Py_UNICODE *qe;
7577 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 PyUnicodeObject *u;
7579 int tabsize = 8;
7580
7581 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583
Thomas Wouters7e474022000-07-16 12:04:32 +00007584 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007585 i = 0; /* chars up to and including most recent \n or \r */
7586 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7587 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588 for (p = self->str; p < e; p++)
7589 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 if (tabsize > 0) {
7591 incr = tabsize - (j % tabsize); /* cannot overflow */
7592 if (j > PY_SSIZE_T_MAX - incr)
7593 goto overflow1;
7594 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007595 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 if (j > PY_SSIZE_T_MAX - 1)
7599 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 j++;
7601 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 if (i > PY_SSIZE_T_MAX - j)
7603 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007605 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 }
7607 }
7608
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007609 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007611
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 /* Second pass: create output string and fill it */
7613 u = _PyUnicode_New(i + j);
7614 if (!u)
7615 return NULL;
7616
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007617 j = 0; /* same as in first pass */
7618 q = u->str; /* next output char */
7619 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620
7621 for (p = self->str; p < e; p++)
7622 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 if (tabsize > 0) {
7624 i = tabsize - (j % tabsize);
7625 j += i;
7626 while (i--) {
7627 if (q >= qe)
7628 goto overflow2;
7629 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007630 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007632 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 else {
7634 if (q >= qe)
7635 goto overflow2;
7636 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007637 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 if (*p == '\n' || *p == '\r')
7639 j = 0;
7640 }
7641
7642 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007643
7644 overflow2:
7645 Py_DECREF(u);
7646 overflow1:
7647 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649}
7650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007651PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653\n\
7654Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08007655such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656arguments start and end are interpreted as in slice notation.\n\
7657\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007658Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659
7660static PyObject *
7661unicode_find(PyUnicodeObject *self, PyObject *args)
7662{
Jesus Ceaac451502011-04-20 17:09:23 +02007663 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007664 Py_ssize_t start;
7665 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007666 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
Jesus Ceaac451502011-04-20 17:09:23 +02007668 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7669 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671
Thomas Wouters477c8d52006-05-27 19:21:47 +00007672 result = stringlib_find_slice(
7673 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7674 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7675 start, end
7676 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677
7678 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007679
Christian Heimes217cfd12007-12-02 14:31:20 +00007680 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681}
7682
7683static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007684unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685{
7686 if (index < 0 || index >= self->length) {
7687 PyErr_SetString(PyExc_IndexError, "string index out of range");
7688 return NULL;
7689 }
7690
7691 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7692}
7693
Guido van Rossumc2504932007-09-18 19:42:40 +00007694/* Believe it or not, this produces the same value for ASCII strings
7695 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007696static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007697unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698{
Guido van Rossumc2504932007-09-18 19:42:40 +00007699 Py_ssize_t len;
7700 Py_UNICODE *p;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -08007701 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +00007702
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007703#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -05007704 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007705#endif
Guido van Rossumc2504932007-09-18 19:42:40 +00007706 if (self->hash != -1)
7707 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007708 len = Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007709 /*
7710 We make the hash of the empty string be 0, rather than using
7711 (prefix ^ suffix), since this slightly obfuscates the hash secret
7712 */
7713 if (len == 0) {
7714 self->hash = 0;
7715 return 0;
7716 }
Guido van Rossumc2504932007-09-18 19:42:40 +00007717 p = self->str;
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007718 x = _Py_HashSecret.prefix;
7719 x ^= *p << 7;
Guido van Rossumc2504932007-09-18 19:42:40 +00007720 while (--len >= 0)
Gregory P. Smith63e6c322012-01-14 15:31:34 -08007721 x = (_PyHASH_MULTIPLIER*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007722 x ^= Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007723 x ^= _Py_HashSecret.suffix;
Guido van Rossumc2504932007-09-18 19:42:40 +00007724 if (x == -1)
7725 x = -2;
7726 self->hash = x;
7727 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728}
7729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007730PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007733Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734
7735static PyObject *
7736unicode_index(PyUnicodeObject *self, PyObject *args)
7737{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007738 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007739 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007740 Py_ssize_t start;
7741 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742
Jesus Ceaac451502011-04-20 17:09:23 +02007743 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7744 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746
Thomas Wouters477c8d52006-05-27 19:21:47 +00007747 result = stringlib_find_slice(
7748 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7749 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7750 start, end
7751 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
7753 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007754
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 if (result < 0) {
7756 PyErr_SetString(PyExc_ValueError, "substring not found");
7757 return NULL;
7758 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007759
Christian Heimes217cfd12007-12-02 14:31:20 +00007760 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761}
7762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007763PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007766Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007767at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768
7769static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007770unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771{
7772 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7773 register const Py_UNICODE *e;
7774 int cased;
7775
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776 /* Shortcut for single character strings */
7777 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007780 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007781 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007783
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 e = p + PyUnicode_GET_SIZE(self);
7785 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007786 while (p < e) {
7787 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007788
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7790 return PyBool_FromLong(0);
7791 else if (!cased && Py_UNICODE_ISLOWER(ch))
7792 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007794 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795}
7796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007797PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007800Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007801at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802
7803static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007804unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805{
7806 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7807 register const Py_UNICODE *e;
7808 int cased;
7809
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 /* Shortcut for single character strings */
7811 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007814 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007815 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007817
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818 e = p + PyUnicode_GET_SIZE(self);
7819 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007820 while (p < e) {
7821 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007822
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7824 return PyBool_FromLong(0);
7825 else if (!cased && Py_UNICODE_ISUPPER(ch))
7826 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007828 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829}
7830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007831PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007834Return True if S is a titlecased string and there is at least one\n\
7835character in S, i.e. upper- and titlecase characters may only\n\
7836follow uncased characters and lowercase characters only cased ones.\n\
7837Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838
7839static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007840unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841{
7842 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7843 register const Py_UNICODE *e;
7844 int cased, previous_is_cased;
7845
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 /* Shortcut for single character strings */
7847 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7849 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007851 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007852 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007854
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 e = p + PyUnicode_GET_SIZE(self);
7856 cased = 0;
7857 previous_is_cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007858 while (p < e) {
7859 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007860
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7862 if (previous_is_cased)
7863 return PyBool_FromLong(0);
7864 previous_is_cased = 1;
7865 cased = 1;
7866 }
7867 else if (Py_UNICODE_ISLOWER(ch)) {
7868 if (!previous_is_cased)
7869 return PyBool_FromLong(0);
7870 previous_is_cased = 1;
7871 cased = 1;
7872 }
7873 else
7874 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007876 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877}
7878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007879PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007882Return True if all characters in S are whitespace\n\
7883and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884
7885static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007886unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887{
7888 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7889 register const Py_UNICODE *e;
7890
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 /* Shortcut for single character strings */
7892 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 Py_UNICODE_ISSPACE(*p))
7894 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007896 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007897 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007899
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007901 while (p < e) {
7902 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7903 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007906 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907}
7908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007909PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007911\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007912Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007913and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007914
7915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007916unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007917{
7918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7919 register const Py_UNICODE *e;
7920
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007921 /* Shortcut for single character strings */
7922 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 Py_UNICODE_ISALPHA(*p))
7924 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007925
7926 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007927 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007929
7930 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007931 while (p < e) {
7932 if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007935 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007936}
7937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007938PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007940\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007941Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007942and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007943
7944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007945unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007946{
7947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7948 register const Py_UNICODE *e;
7949
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007950 /* Shortcut for single character strings */
7951 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 Py_UNICODE_ISALNUM(*p))
7953 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007954
7955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007956 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007958
7959 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007960 while (p < e) {
7961 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7962 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007964 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007965 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007966}
7967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007968PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007971Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007972False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973
7974static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007975unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976{
7977 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7978 register const Py_UNICODE *e;
7979
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980 /* Shortcut for single character strings */
7981 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 Py_UNICODE_ISDECIMAL(*p))
7983 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007985 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007986 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007988
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007990 while (p < e) {
7991 if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007994 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995}
7996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007997PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00008000Return True if all characters in S are digits\n\
8001and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002
8003static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008004unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005{
8006 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8007 register const Py_UNICODE *e;
8008
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009 /* Shortcut for single character strings */
8010 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 Py_UNICODE_ISDIGIT(*p))
8012 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008014 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008015 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008017
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008019 while (p < e) {
8020 if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008023 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024}
8025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008026PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008029Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008030False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031
8032static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008033unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034{
8035 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8036 register const Py_UNICODE *e;
8037
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 /* Shortcut for single character strings */
8039 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 Py_UNICODE_ISNUMERIC(*p))
8041 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008043 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008044 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008046
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008048 while (p < e) {
8049 if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008052 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053}
8054
Martin v. Löwis47383402007-08-15 07:32:56 +00008055int
8056PyUnicode_IsIdentifier(PyObject *self)
8057{
Benjamin Petersonf413b802011-08-12 22:17:18 -05008058 const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008059 const Py_UNICODE *e;
8060 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +00008061
8062 /* Special case for empty strings */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008063 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008065
8066 /* PEP 3131 says that the first character must be in
8067 XID_Start and subsequent characters in XID_Continue,
8068 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008069 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008070 letters, digits, underscore). However, given the current
8071 definition of XID_Start and XID_Continue, it is sufficient
8072 to check just for these, except that _ must be allowed
8073 as starting an identifier. */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008074 e = p + PyUnicode_GET_SIZE(self);
8075 first = _Py_UNICODE_NEXT(p, e);
Benjamin Petersonf413b802011-08-12 22:17:18 -05008076 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +00008077 return 0;
8078
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008079 while (p < e)
8080 if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008082 return 1;
8083}
8084
8085PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008087\n\
8088Return True if S is a valid identifier according\n\
8089to the language definition.");
8090
8091static PyObject*
8092unicode_isidentifier(PyObject *self)
8093{
8094 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8095}
8096
Georg Brandl559e5d72008-06-11 18:37:52 +00008097PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008099\n\
8100Return True if all characters in S are considered\n\
8101printable in repr() or S is empty, False otherwise.");
8102
8103static PyObject*
8104unicode_isprintable(PyObject *self)
8105{
8106 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8107 register const Py_UNICODE *e;
8108
8109 /* Shortcut for single character strings */
8110 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8111 Py_RETURN_TRUE;
8112 }
8113
8114 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008115 while (p < e) {
8116 if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
Georg Brandl559e5d72008-06-11 18:37:52 +00008117 Py_RETURN_FALSE;
8118 }
8119 }
8120 Py_RETURN_TRUE;
8121}
8122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008123PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008124 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125\n\
8126Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008127iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128
8129static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008130unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008132 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133}
8134
Martin v. Löwis18e16552006-02-15 17:27:45 +00008135static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136unicode_length(PyUnicodeObject *self)
8137{
8138 return self->length;
8139}
8140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008141PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008144Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008145done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146
8147static PyObject *
8148unicode_ljust(PyUnicodeObject *self, PyObject *args)
8149{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008150 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008151 Py_UNICODE fillchar = ' ';
8152
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008153 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 return NULL;
8155
Tim Peters7a29bd52001-09-12 03:03:31 +00008156 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 Py_INCREF(self);
8158 return (PyObject*) self;
8159 }
8160
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008161 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162}
8163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008164PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008167Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168
8169static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008170unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 return fixup(self, fixlower);
8173}
8174
/* Strip modes. Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
8183
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008184/* externally visible for str.strip(unicode) */
8185PyObject *
8186_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8187{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8189 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8190 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8191 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8192 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008193
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008195
Benjamin Peterson14339b62009-01-31 16:36:08 +00008196 i = 0;
8197 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8199 i++;
8200 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008202
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 j = len;
8204 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 do {
8206 j--;
8207 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8208 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008209 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008210
Benjamin Peterson14339b62009-01-31 16:36:08 +00008211 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 Py_INCREF(self);
8213 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 }
8215 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008217}
8218
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219
8220static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008221do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008223 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8224 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008225
Benjamin Peterson14339b62009-01-31 16:36:08 +00008226 i = 0;
8227 if (striptype != RIGHTSTRIP) {
8228 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8229 i++;
8230 }
8231 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008232
Benjamin Peterson14339b62009-01-31 16:36:08 +00008233 j = len;
8234 if (striptype != LEFTSTRIP) {
8235 do {
8236 j--;
8237 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8238 j++;
8239 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008240
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8242 Py_INCREF(self);
8243 return (PyObject*)self;
8244 }
8245 else
8246 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247}
8248
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008249
8250static PyObject *
8251do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8252{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008253 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008254
Benjamin Peterson14339b62009-01-31 16:36:08 +00008255 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8256 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008257
Benjamin Peterson14339b62009-01-31 16:36:08 +00008258 if (sep != NULL && sep != Py_None) {
8259 if (PyUnicode_Check(sep))
8260 return _PyUnicode_XStrip(self, striptype, sep);
8261 else {
8262 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 "%s arg must be None or str",
8264 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008265 return NULL;
8266 }
8267 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008268
Benjamin Peterson14339b62009-01-31 16:36:08 +00008269 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008270}
8271
8272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008273PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008275\n\
8276Return a copy of the string S with leading and trailing\n\
8277whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008278If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008279
8280static PyObject *
8281unicode_strip(PyUnicodeObject *self, PyObject *args)
8282{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 if (PyTuple_GET_SIZE(args) == 0)
8284 return do_strip(self, BOTHSTRIP); /* Common case */
8285 else
8286 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008287}
8288
8289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008290PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008292\n\
8293Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008294If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008295
8296static PyObject *
8297unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8298{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008299 if (PyTuple_GET_SIZE(args) == 0)
8300 return do_strip(self, LEFTSTRIP); /* Common case */
8301 else
8302 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008303}
8304
8305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008306PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008308\n\
8309Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008310If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008311
8312static PyObject *
8313unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8314{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008315 if (PyTuple_GET_SIZE(args) == 0)
8316 return do_strip(self, RIGHTSTRIP); /* Common case */
8317 else
8318 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008319}
8320
8321
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008323unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324{
8325 PyUnicodeObject *u;
8326 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008327 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008328 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329
Serhiy Storchaka05997252013-01-26 12:14:02 +02008330 if (len < 1)
8331 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332
Tim Peters7a29bd52001-09-12 03:03:31 +00008333 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 /* no repeat, return original string */
8335 Py_INCREF(str);
8336 return (PyObject*) str;
8337 }
Tim Peters8f422462000-09-09 06:13:41 +00008338
8339 /* ensure # of chars needed doesn't overflow int and # of bytes
8340 * needed doesn't overflow size_t
8341 */
8342 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008343 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008344 PyErr_SetString(PyExc_OverflowError,
8345 "repeated string is too long");
8346 return NULL;
8347 }
8348 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8349 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8350 PyErr_SetString(PyExc_OverflowError,
8351 "repeated string is too long");
8352 return NULL;
8353 }
8354 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 if (!u)
8356 return NULL;
8357
8358 p = u->str;
8359
Georg Brandl222de0f2009-04-12 12:01:50 +00008360 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008361 Py_UNICODE_FILL(p, str->str[0], len);
8362 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008363 Py_ssize_t done = str->length; /* number of characters copied this far */
8364 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008366 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008367 Py_UNICODE_COPY(p+done, p, n);
8368 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 }
8371
8372 return (PyObject*) u;
8373}
8374
8375PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 PyObject *subobj,
8377 PyObject *replobj,
8378 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379{
8380 PyObject *self;
8381 PyObject *str1;
8382 PyObject *str2;
8383 PyObject *result;
8384
8385 self = PyUnicode_FromObject(obj);
8386 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 str1 = PyUnicode_FromObject(subobj);
8389 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 Py_DECREF(self);
8391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392 }
8393 str2 = PyUnicode_FromObject(replobj);
8394 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 Py_DECREF(self);
8396 Py_DECREF(str1);
8397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 }
Tim Petersced69f82003-09-16 20:30:58 +00008399 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 (PyUnicodeObject *)str1,
8401 (PyUnicodeObject *)str2,
8402 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 Py_DECREF(self);
8404 Py_DECREF(str1);
8405 Py_DECREF(str2);
8406 return result;
8407}
8408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008409PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008410 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411\n\
8412Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008413old replaced by new. If the optional argument count is\n\
8414given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415
8416static PyObject*
8417unicode_replace(PyUnicodeObject *self, PyObject *args)
8418{
8419 PyUnicodeObject *str1;
8420 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008421 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 PyObject *result;
8423
Martin v. Löwis18e16552006-02-15 17:27:45 +00008424 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 return NULL;
8426 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8427 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008430 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 Py_DECREF(str1);
8432 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434
8435 result = replace(self, str1, str2, maxcount);
8436
8437 Py_DECREF(str1);
8438 Py_DECREF(str2);
8439 return result;
8440}
8441
8442static
8443PyObject *unicode_repr(PyObject *unicode)
8444{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008445 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008446 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008447 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8448 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8449
8450 /* XXX(nnorwitz): rather than over-allocating, it would be
8451 better to choose a different scheme. Perhaps scan the
8452 first N-chars of the string and allocate based on that size.
8453 */
8454 /* Initial allocation is based on the longest-possible unichr
8455 escape.
8456
8457 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8458 unichr, so in this case it's the longest unichr escape. In
8459 narrow (UTF-16) builds this is five chars per source unichr
8460 since there are two unichrs in the surrogate pair, so in narrow
8461 (UTF-16) builds it's not the longest unichr escape.
8462
8463 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8464 so in the narrow (UTF-16) build case it's the longest unichr
8465 escape.
8466 */
8467
Walter Dörwald1ab83302007-05-18 17:15:44 +00008468 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008470#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008472#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008474#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008476 if (repr == NULL)
8477 return NULL;
8478
Walter Dörwald1ab83302007-05-18 17:15:44 +00008479 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008480
8481 /* Add quote */
8482 *p++ = (findchar(s, size, '\'') &&
8483 !findchar(s, size, '"')) ? '"' : '\'';
8484 while (size-- > 0) {
8485 Py_UNICODE ch = *s++;
8486
8487 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008488 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008489 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008490 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008491 continue;
8492 }
8493
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008495 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008496 *p++ = '\\';
8497 *p++ = 't';
8498 }
8499 else if (ch == '\n') {
8500 *p++ = '\\';
8501 *p++ = 'n';
8502 }
8503 else if (ch == '\r') {
8504 *p++ = '\\';
8505 *p++ = 'r';
8506 }
8507
8508 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008509 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008510 *p++ = '\\';
8511 *p++ = 'x';
8512 *p++ = hexdigits[(ch >> 4) & 0x000F];
8513 *p++ = hexdigits[ch & 0x000F];
8514 }
8515
Georg Brandl559e5d72008-06-11 18:37:52 +00008516 /* Copy ASCII characters as-is */
8517 else if (ch < 0x7F) {
8518 *p++ = ch;
8519 }
8520
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008522 else {
8523 Py_UCS4 ucs = ch;
8524
8525#ifndef Py_UNICODE_WIDE
8526 Py_UNICODE ch2 = 0;
8527 /* Get code point from surrogate pair */
8528 if (size > 0) {
8529 ch2 = *s;
8530 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008532 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008534 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008535 size--;
8536 }
8537 }
8538#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008539 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008540 (categories Z* and C* except ASCII space)
8541 */
8542 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8543 /* Map 8-bit characters to '\xhh' */
8544 if (ucs <= 0xff) {
8545 *p++ = '\\';
8546 *p++ = 'x';
8547 *p++ = hexdigits[(ch >> 4) & 0x000F];
8548 *p++ = hexdigits[ch & 0x000F];
8549 }
8550 /* Map 21-bit characters to '\U00xxxxxx' */
8551 else if (ucs >= 0x10000) {
8552 *p++ = '\\';
8553 *p++ = 'U';
8554 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8555 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8556 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8557 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8558 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8559 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8560 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8561 *p++ = hexdigits[ucs & 0x0000000F];
8562 }
8563 /* Map 16-bit characters to '\uxxxx' */
8564 else {
8565 *p++ = '\\';
8566 *p++ = 'u';
8567 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8568 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8569 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8570 *p++ = hexdigits[ucs & 0x000F];
8571 }
8572 }
8573 /* Copy characters as-is */
8574 else {
8575 *p++ = ch;
8576#ifndef Py_UNICODE_WIDE
8577 if (ucs >= 0x10000)
8578 *p++ = ch2;
8579#endif
8580 }
8581 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008582 }
8583 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008584 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008585
8586 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008587 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008588 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589}
8590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008591PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593\n\
8594Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08008595such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596arguments start and end are interpreted as in slice notation.\n\
8597\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008598Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599
8600static PyObject *
8601unicode_rfind(PyUnicodeObject *self, PyObject *args)
8602{
Jesus Ceaac451502011-04-20 17:09:23 +02008603 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008604 Py_ssize_t start;
8605 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008606 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607
Jesus Ceaac451502011-04-20 17:09:23 +02008608 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8609 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
Thomas Wouters477c8d52006-05-27 19:21:47 +00008612 result = stringlib_rfind_slice(
8613 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8614 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8615 start, end
8616 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
8618 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008619
Christian Heimes217cfd12007-12-02 14:31:20 +00008620 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621}
8622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008623PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008626Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627
8628static PyObject *
8629unicode_rindex(PyUnicodeObject *self, PyObject *args)
8630{
Jesus Ceaac451502011-04-20 17:09:23 +02008631 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008632 Py_ssize_t start;
8633 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008634 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635
Jesus Ceaac451502011-04-20 17:09:23 +02008636 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8637 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639
Thomas Wouters477c8d52006-05-27 19:21:47 +00008640 result = stringlib_rfind_slice(
8641 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8642 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8643 start, end
8644 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645
8646 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008647
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 if (result < 0) {
8649 PyErr_SetString(PyExc_ValueError, "substring not found");
8650 return NULL;
8651 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008652 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653}
8654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008655PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008658Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008659done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660
8661static PyObject *
8662unicode_rjust(PyUnicodeObject *self, PyObject *args)
8663{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008664 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008665 Py_UNICODE fillchar = ' ';
8666
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008667 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 return NULL;
8669
Tim Peters7a29bd52001-09-12 03:03:31 +00008670 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 Py_INCREF(self);
8672 return (PyObject*) self;
8673 }
8674
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008675 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676}
8677
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 PyObject *sep,
8680 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681{
8682 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008683
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 s = PyUnicode_FromObject(s);
8685 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008686 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 if (sep != NULL) {
8688 sep = PyUnicode_FromObject(sep);
8689 if (sep == NULL) {
8690 Py_DECREF(s);
8691 return NULL;
8692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 }
8694
8695 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8696
8697 Py_DECREF(s);
8698 Py_XDECREF(sep);
8699 return result;
8700}
8701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008702PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704\n\
8705Return a list of the words in S, using sep as the\n\
8706delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008707splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008708whitespace string is a separator and empty strings are\n\
8709removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710
8711static PyObject*
8712unicode_split(PyUnicodeObject *self, PyObject *args)
8713{
8714 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008715 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716
Martin v. Löwis18e16552006-02-15 17:27:45 +00008717 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718 return NULL;
8719
8720 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726}
8727
Thomas Wouters477c8d52006-05-27 19:21:47 +00008728PyObject *
8729PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8730{
8731 PyObject* str_obj;
8732 PyObject* sep_obj;
8733 PyObject* out;
8734
8735 str_obj = PyUnicode_FromObject(str_in);
8736 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008738 sep_obj = PyUnicode_FromObject(sep_in);
8739 if (!sep_obj) {
8740 Py_DECREF(str_obj);
8741 return NULL;
8742 }
8743
8744 out = stringlib_partition(
8745 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8746 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8747 );
8748
8749 Py_DECREF(sep_obj);
8750 Py_DECREF(str_obj);
8751
8752 return out;
8753}
8754
8755
8756PyObject *
8757PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8758{
8759 PyObject* str_obj;
8760 PyObject* sep_obj;
8761 PyObject* out;
8762
8763 str_obj = PyUnicode_FromObject(str_in);
8764 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008766 sep_obj = PyUnicode_FromObject(sep_in);
8767 if (!sep_obj) {
8768 Py_DECREF(str_obj);
8769 return NULL;
8770 }
8771
8772 out = stringlib_rpartition(
8773 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8774 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8775 );
8776
8777 Py_DECREF(sep_obj);
8778 Py_DECREF(str_obj);
8779
8780 return out;
8781}
8782
8783PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008785\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008786Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008787the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008788found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008789
8790static PyObject*
8791unicode_partition(PyUnicodeObject *self, PyObject *separator)
8792{
8793 return PyUnicode_Partition((PyObject *)self, separator);
8794}
8795
8796PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008797 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008798\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008799Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008800the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008801separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008802
8803static PyObject*
8804unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8805{
8806 return PyUnicode_RPartition((PyObject *)self, separator);
8807}
8808
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008809PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 PyObject *sep,
8811 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008812{
8813 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008814
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008815 s = PyUnicode_FromObject(s);
8816 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008817 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 if (sep != NULL) {
8819 sep = PyUnicode_FromObject(sep);
8820 if (sep == NULL) {
8821 Py_DECREF(s);
8822 return NULL;
8823 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008824 }
8825
8826 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8827
8828 Py_DECREF(s);
8829 Py_XDECREF(sep);
8830 return result;
8831}
8832
8833PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008835\n\
8836Return a list of the words in S, using sep as the\n\
8837delimiter string, starting at the end of the string and\n\
8838working to the front. If maxsplit is given, at most maxsplit\n\
8839splits are done. If sep is not specified, any whitespace string\n\
8840is a separator.");
8841
8842static PyObject*
8843unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8844{
8845 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008846 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008847
Martin v. Löwis18e16552006-02-15 17:27:45 +00008848 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008849 return NULL;
8850
8851 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008853 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008855 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008857}
8858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008859PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861\n\
8862Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008863Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008864is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865
8866static PyObject*
8867unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8868{
Guido van Rossum86662912000-04-11 15:38:46 +00008869 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870
Guido van Rossum86662912000-04-11 15:38:46 +00008871 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 return NULL;
8873
Guido van Rossum86662912000-04-11 15:38:46 +00008874 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875}
8876
8877static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008878PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879{
Walter Dörwald346737f2007-05-31 10:44:43 +00008880 if (PyUnicode_CheckExact(self)) {
8881 Py_INCREF(self);
8882 return self;
8883 } else
8884 /* Subtype -- return genuine unicode string with the same value. */
8885 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8886 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887}
8888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008889PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891\n\
8892Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008893and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894
8895static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008896unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898 return fixup(self, fixswapcase);
8899}
8900
Georg Brandlceee0772007-11-27 23:48:05 +00008901PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008903\n\
8904Return a translation table usable for str.translate().\n\
8905If there is only one argument, it must be a dictionary mapping Unicode\n\
8906ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008907Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008908If there are two arguments, they must be strings of equal length, and\n\
8909in the resulting dictionary, each character in x will be mapped to the\n\
8910character at the same position in y. If there is a third argument, it\n\
8911must be a string, whose characters will be mapped to None in the result.");
8912
8913static PyObject*
8914unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8915{
8916 PyObject *x, *y = NULL, *z = NULL;
8917 PyObject *new = NULL, *key, *value;
8918 Py_ssize_t i = 0;
8919 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008920
Georg Brandlceee0772007-11-27 23:48:05 +00008921 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8922 return NULL;
8923 new = PyDict_New();
8924 if (!new)
8925 return NULL;
8926 if (y != NULL) {
8927 /* x must be a string too, of equal length */
8928 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8929 if (!PyUnicode_Check(x)) {
8930 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8931 "be a string if there is a second argument");
8932 goto err;
8933 }
8934 if (PyUnicode_GET_SIZE(x) != ylen) {
8935 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8936 "arguments must have equal length");
8937 goto err;
8938 }
8939 /* create entries for translating chars in x to those in y */
8940 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008941 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008942 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +00008943 goto err;
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008944 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8945 if (!value) {
8946 Py_DECREF(key);
8947 goto err;
8948 }
Georg Brandlceee0772007-11-27 23:48:05 +00008949 res = PyDict_SetItem(new, key, value);
8950 Py_DECREF(key);
8951 Py_DECREF(value);
8952 if (res < 0)
8953 goto err;
8954 }
8955 /* create entries for deleting chars in z */
8956 if (z != NULL) {
8957 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008958 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008959 if (!key)
8960 goto err;
8961 res = PyDict_SetItem(new, key, Py_None);
8962 Py_DECREF(key);
8963 if (res < 0)
8964 goto err;
8965 }
8966 }
8967 } else {
8968 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008969 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008970 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8971 "to maketrans it must be a dict");
8972 goto err;
8973 }
8974 /* copy entries into the new dict, converting string keys to int keys */
8975 while (PyDict_Next(x, &i, &key, &value)) {
8976 if (PyUnicode_Check(key)) {
8977 /* convert string keys to integer keys */
8978 PyObject *newkey;
8979 if (PyUnicode_GET_SIZE(key) != 1) {
8980 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8981 "table must be of length 1");
8982 goto err;
8983 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008984 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008985 if (!newkey)
8986 goto err;
8987 res = PyDict_SetItem(new, newkey, value);
8988 Py_DECREF(newkey);
8989 if (res < 0)
8990 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008991 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008992 /* just keep integer keys */
8993 if (PyDict_SetItem(new, key, value) < 0)
8994 goto err;
8995 } else {
8996 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8997 "be strings or integers");
8998 goto err;
8999 }
9000 }
9001 }
9002 return new;
9003 err:
9004 Py_DECREF(new);
9005 return NULL;
9006}
9007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009008PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010\n\
9011Return a copy of the string S, where all characters have been mapped\n\
9012through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009013Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00009014Unmapped characters are left untouched. Characters mapped to None\n\
9015are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016
9017static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009018unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019{
Georg Brandlceee0772007-11-27 23:48:05 +00009020 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021}
9022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009023PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009024 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009026Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027
9028static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009029unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 return fixup(self, fixupper);
9032}
9033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009034PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009037Pad a numeric string S with zeros on the left, to fill a field\n\
9038of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039
9040static PyObject *
9041unicode_zfill(PyUnicodeObject *self, PyObject *args)
9042{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009043 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 PyUnicodeObject *u;
9045
Martin v. Löwis18e16552006-02-15 17:27:45 +00009046 Py_ssize_t width;
9047 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 return NULL;
9049
9050 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009051 if (PyUnicode_CheckExact(self)) {
9052 Py_INCREF(self);
9053 return (PyObject*) self;
9054 }
9055 else
9056 return PyUnicode_FromUnicode(
9057 PyUnicode_AS_UNICODE(self),
9058 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009059 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060 }
9061
9062 fill = width - self->length;
9063
9064 u = pad(self, fill, 0, '0');
9065
Walter Dörwald068325e2002-04-15 13:36:47 +00009066 if (u == NULL)
9067 return NULL;
9068
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069 if (u->str[fill] == '+' || u->str[fill] == '-') {
9070 /* move sign to beginning of string */
9071 u->str[0] = u->str[fill];
9072 u->str[fill] = '0';
9073 }
9074
9075 return (PyObject*) u;
9076}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077
#if 0
/* Compiled-out helpers; the method table lists them as being used only for
   debugging the implementation. */

/* Return the current value of numfree as an int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Pass self's character buffer and length to
   PyUnicode_TransformDecimalToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
9092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009093PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009096Return True if S starts with the specified prefix, False otherwise.\n\
9097With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009098With optional end, stop comparing S at that position.\n\
9099prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100
9101static PyObject *
9102unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009105 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009107 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009108 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009109 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110
Jesus Ceaac451502011-04-20 17:09:23 +02009111 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009112 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009113 if (PyTuple_Check(subobj)) {
9114 Py_ssize_t i;
9115 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9116 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009118 if (substring == NULL)
9119 return NULL;
9120 result = tailmatch(self, substring, start, end, -1);
9121 Py_DECREF(substring);
9122 if (result) {
9123 Py_RETURN_TRUE;
9124 }
9125 }
9126 /* nothing matched */
9127 Py_RETURN_FALSE;
9128 }
9129 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009130 if (substring == NULL) {
9131 if (PyErr_ExceptionMatches(PyExc_TypeError))
9132 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9133 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009135 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009136 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009138 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139}
9140
9141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009142PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009145Return True if S ends with the specified suffix, False otherwise.\n\
9146With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009147With optional end, stop comparing S at that position.\n\
9148suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149
9150static PyObject *
9151unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009154 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009156 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009157 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009158 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159
Jesus Ceaac451502011-04-20 17:09:23 +02009160 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009161 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009162 if (PyTuple_Check(subobj)) {
9163 Py_ssize_t i;
9164 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9165 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009167 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009169 result = tailmatch(self, substring, start, end, +1);
9170 Py_DECREF(substring);
9171 if (result) {
9172 Py_RETURN_TRUE;
9173 }
9174 }
9175 Py_RETURN_FALSE;
9176 }
9177 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009178 if (substring == NULL) {
9179 if (PyErr_ExceptionMatches(PyExc_TypeError))
9180 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9181 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009183 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009184 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009186 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187}
9188
Eric Smith8c663262007-08-25 02:26:07 +00009189#include "stringlib/string_format.h"
9190
9191PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009193\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009194Return a formatted version of S, using substitutions from args and kwargs.\n\
9195The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009196
Eric Smith27bbca62010-11-04 17:06:58 +00009197PyDoc_STRVAR(format_map__doc__,
9198 "S.format_map(mapping) -> str\n\
9199\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009200Return a formatted version of S, using substitutions from mapping.\n\
9201The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009202
Eric Smith4a7d76d2008-05-30 18:10:19 +00009203static PyObject *
9204unicode__format__(PyObject* self, PyObject* args)
9205{
9206 PyObject *format_spec;
9207
9208 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9209 return NULL;
9210
9211 return _PyUnicode_FormatAdvanced(self,
9212 PyUnicode_AS_UNICODE(format_spec),
9213 PyUnicode_GET_SIZE(format_spec));
9214}
9215
Eric Smith8c663262007-08-25 02:26:07 +00009216PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009218\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009219Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009220
9221static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009222unicode__sizeof__(PyUnicodeObject *v)
9223{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009224 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9225 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009226}
9227
9228PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009229 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009230
9231static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009232unicode_getnewargs(PyUnicodeObject *v)
9233{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009234 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009235}
9236
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009238 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009239 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9240 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009241 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009242 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9243 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9244 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9245 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9246 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9247 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9248 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009249 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009250 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9251 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9252 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009253 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009254 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9255 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9256 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009257 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009258 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009259 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009260 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009261 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9262 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9263 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9264 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9265 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9266 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9267 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9268 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9269 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9270 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9271 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9272 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9273 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9274 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009275 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009276 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009277 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009278 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009279 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009280 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009281 {"maketrans", (PyCFunction) unicode_maketrans,
9282 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009283 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009284#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009285 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286#endif
9287
9288#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009289 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009290 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009291 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292#endif
9293
Benjamin Peterson14339b62009-01-31 16:36:08 +00009294 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295 {NULL, NULL}
9296};
9297
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009298static PyObject *
9299unicode_mod(PyObject *v, PyObject *w)
9300{
Benjamin Peterson29060642009-01-31 22:14:21 +00009301 if (!PyUnicode_Check(v)) {
9302 Py_INCREF(Py_NotImplemented);
9303 return Py_NotImplemented;
9304 }
9305 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009306}
9307
/* Number-protocol table for str.  Entries are positional; only the
   remainder slot is populated (with unicode_mod), the preceding slots
   are left empty. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
9314
/* Sequence-protocol table for str.  Entries are positional; the slice
   and item-assignment slots are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
9325
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009326static PyObject*
9327unicode_subscript(PyUnicodeObject* self, PyObject* item)
9328{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009329 if (PyIndex_Check(item)) {
9330 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009331 if (i == -1 && PyErr_Occurred())
9332 return NULL;
9333 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009334 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009335 return unicode_getitem(self, i);
9336 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009337 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009338 Py_UNICODE* source_buf;
9339 Py_UNICODE* result_buf;
9340 PyObject* result;
9341
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009342 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009344 return NULL;
9345 }
9346
9347 if (slicelength <= 0) {
9348 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009349 } else if (start == 0 && step == 1 && slicelength == self->length &&
9350 PyUnicode_CheckExact(self)) {
9351 Py_INCREF(self);
9352 return (PyObject *)self;
9353 } else if (step == 1) {
9354 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009355 } else {
9356 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009357 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9358 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009359
Benjamin Peterson29060642009-01-31 22:14:21 +00009360 if (result_buf == NULL)
9361 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009362
9363 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9364 result_buf[i] = source_buf[cur];
9365 }
Tim Petersced69f82003-09-16 20:30:58 +00009366
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009367 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009368 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009369 return result;
9370 }
9371 } else {
9372 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9373 return NULL;
9374 }
9375}
9376
/* Mapping-protocol table for str: length and subscript are provided,
   the assignment slot is left empty. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
9382
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384/* Helpers for PyUnicode_Format() */
9385
9386static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009387getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009389 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 (*p_argidx)++;
9392 if (arglen < 0)
9393 return args;
9394 else
9395 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396 }
9397 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 return NULL;
9400}
9401
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009402/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009404static PyObject *
9405formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009407 char *p;
9408 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009410
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411 x = PyFloat_AsDouble(v);
9412 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009413 return NULL;
9414
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009417
Eric Smith0923d1d2009-04-16 20:16:10 +00009418 p = PyOS_double_to_string(x, type, prec,
9419 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009420 if (p == NULL)
9421 return NULL;
9422 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009423 PyMem_Free(p);
9424 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425}
9426
Tim Peters38fd5b62000-09-21 05:43:11 +00009427static PyObject*
9428formatlong(PyObject *val, int flags, int prec, int type)
9429{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009430 char *buf;
9431 int len;
9432 PyObject *str; /* temporary string object. */
9433 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009434
Benjamin Peterson14339b62009-01-31 16:36:08 +00009435 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9436 if (!str)
9437 return NULL;
9438 result = PyUnicode_FromStringAndSize(buf, len);
9439 Py_DECREF(str);
9440 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009441}
9442
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443static int
9444formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009445 size_t buflen,
9446 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009448 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009449 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 if (PyUnicode_GET_SIZE(v) == 1) {
9451 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9452 buf[1] = '\0';
9453 return 1;
9454 }
9455#ifndef Py_UNICODE_WIDE
9456 if (PyUnicode_GET_SIZE(v) == 2) {
9457 /* Decode a valid surrogate pair */
9458 int c0 = PyUnicode_AS_UNICODE(v)[0];
9459 int c1 = PyUnicode_AS_UNICODE(v)[1];
9460 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9461 0xDC00 <= c1 && c1 <= 0xDFFF) {
9462 buf[0] = c0;
9463 buf[1] = c1;
9464 buf[2] = '\0';
9465 return 2;
9466 }
9467 }
9468#endif
9469 goto onError;
9470 }
9471 else {
9472 /* Integer input truncated to a character */
9473 long x;
9474 x = PyLong_AsLong(v);
9475 if (x == -1 && PyErr_Occurred())
9476 goto onError;
9477
9478 if (x < 0 || x > 0x10ffff) {
9479 PyErr_SetString(PyExc_OverflowError,
9480 "%c arg not in range(0x110000)");
9481 return -1;
9482 }
9483
9484#ifndef Py_UNICODE_WIDE
9485 if (x > 0xffff) {
9486 x -= 0x10000;
9487 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9488 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9489 return 2;
9490 }
9491#endif
9492 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009493 buf[1] = '\0';
9494 return 1;
9495 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009496
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009498 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009500 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501}
9502
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length, in Py_UNICODE units, of the scratch buffer
   in which single characters are formatted (the literal '%' and the
   output of formatchar()).
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009507
/* Implement printf-style formatting: format % args.

   format is converted with PyUnicode_FromObject().  args is a tuple of
   arguments, a mapping (needed for %(key)... specifiers), or any other
   object, which is then used as the one and only argument.

   Returns a new reference to the resulting unicode object, or NULL with
   an exception set. */
PyObject *PyUnicode_Format(PyObject *format,
                           PyObject *args)
{
    /* fmt walks the format string, res walks the output buffer. */
    Py_UNICODE *fmt, *res;
    /* fmtcnt: format characters not yet consumed.
       reslen: currently allocated length of result.
       rescnt: unused slots left in result (so reslen - rescnt is the
               number of characters written so far).
       arglen/argidx: cursor state for getnextarg(). */
    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
    /* Non-zero while args holds a reference obtained from a mapping
       lookup that this function has to release. */
    int args_owned = 0;
    PyUnicodeObject *result = NULL;
    PyObject *dict = NULL;
    PyObject *uformat;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    uformat = PyUnicode_FromObject(format);
    if (uformat == NULL)
        return NULL;
    fmt = PyUnicode_AS_UNICODE(uformat);
    fmtcnt = PyUnicode_GET_SIZE(uformat);

    /* Start with room for the format text plus 100 spare characters. */
    reslen = rescnt = fmtcnt + 100;
    result = _PyUnicode_New(reslen);
    if (result == NULL)
        goto onError;
    res = PyUnicode_AS_UNICODE(result);

    if (PyTuple_Check(args)) {
        arglen = PyTuple_Size(args);
        argidx = 0;
    }
    else {
        /* Single non-tuple argument: with these values getnextarg()
           hands out args itself exactly once. */
        arglen = -1;
        argidx = -2;
    }
    /* Mappings other than tuples and strings enable %(key) lookups. */
    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
        dict = args;

    while (--fmtcnt >= 0) {
        if (*fmt != '%') {
            /* Literal character: copy it, growing the result first if
               no free slot is left. */
            if (--rescnt < 0) {
                rescnt = fmtcnt + 100;
                reslen += rescnt;
                if (_PyUnicode_Resize(&result, reslen) < 0)
                    goto onError;
                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
                --rescnt;
            }
            *res++ = *fmt++;
        }
        else {
            /* Got a format specifier */
            int flags = 0;
            Py_ssize_t width = -1;
            int prec = -1;
            Py_UNICODE c = '\0';
            Py_UNICODE fill;
            int isnumok;
            PyObject *v = NULL;
            /* temp owns the converted value, when one is created. */
            PyObject *temp = NULL;
            /* pbuf/len describe the characters to emit. */
            Py_UNICODE *pbuf;
            Py_UNICODE sign;
            Py_ssize_t len;
            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */

            fmt++;
            if (*fmt == '(') {
                /* %(key)...: look the argument up in the mapping. */
                Py_UNICODE *keystart;
                Py_ssize_t keylen;
                PyObject *key;
                int pcount = 1;

                if (dict == NULL) {
                    PyErr_SetString(PyExc_TypeError,
                                    "format requires a mapping");
                    goto onError;
                }
                ++fmt;
                --fmtcnt;
                keystart = fmt;
                /* Skip over balanced parentheses */
                while (pcount > 0 && --fmtcnt >= 0) {
                    if (*fmt == ')')
                        --pcount;
                    else if (*fmt == '(')
                        ++pcount;
                    fmt++;
                }
                /* fmt is now one past the closing ')'. */
                keylen = fmt - keystart - 1;
                if (fmtcnt < 0 || pcount > 0) {
                    PyErr_SetString(PyExc_ValueError,
                                    "incomplete format key");
                    goto onError;
                }
#if 0
                /* keys are converted to strings using UTF-8 and
                   then looked up since Python uses strings to hold
                   variables names etc. in its namespaces and we
                   wouldn't want to break common idioms. */
                key = PyUnicode_EncodeUTF8(keystart,
                                           keylen,
                                           NULL);
#else
                key = PyUnicode_FromUnicode(keystart, keylen);
#endif
                if (key == NULL)
                    goto onError;
                /* Drop the value fetched for a previous %(key). */
                if (args_owned) {
                    Py_DECREF(args);
                    args_owned = 0;
                }
                args = PyObject_GetItem(dict, key);
                Py_DECREF(key);
                if (args == NULL) {
                    goto onError;
                }
                /* The looked-up value becomes the single argument. */
                args_owned = 1;
                arglen = -1;
                argidx = -2;
            }
            /* Conversion flags; the loop leaves the first non-flag
               character in c. */
            while (--fmtcnt >= 0) {
                switch (c = *fmt++) {
                case '-': flags |= F_LJUST; continue;
                case '+': flags |= F_SIGN; continue;
                case ' ': flags |= F_BLANK; continue;
                case '#': flags |= F_ALT; continue;
                case '0': flags |= F_ZERO; continue;
                }
                break;
            }
            /* Minimum field width: '*' takes it from the next argument,
               otherwise it is a run of decimal digits. */
            if (c == '*') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
                if (!PyLong_Check(v)) {
                    PyErr_SetString(PyExc_TypeError,
                                    "* wants int");
                    goto onError;
                }
                width = PyLong_AsSsize_t(v);
                if (width == -1 && PyErr_Occurred())
                    goto onError;
                /* A negative width means left-justify. */
                if (width < 0) {
                    flags |= F_LJUST;
                    width = -width;
                }
                if (--fmtcnt >= 0)
                    c = *fmt++;
            }
            else if (c >= '0' && c <= '9') {
                width = c - '0';
                while (--fmtcnt >= 0) {
                    c = *fmt++;
                    if (c < '0' || c > '9')
                        break;
                    /* Reject before width*10 + digit can overflow. */
                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
                        PyErr_SetString(PyExc_ValueError,
                                        "width too big");
                        goto onError;
                    }
                    width = width*10 + (c - '0');
                }
            }
            /* Precision: same two forms as the width, after a '.'. */
            if (c == '.') {
                prec = 0;
                if (--fmtcnt >= 0)
                    c = *fmt++;
                if (c == '*') {
                    v = getnextarg(args, arglen, &argidx);
                    if (v == NULL)
                        goto onError;
                    if (!PyLong_Check(v)) {
                        PyErr_SetString(PyExc_TypeError,
                                        "* wants int");
                        goto onError;
                    }
                    prec = _PyLong_AsInt(v);
                    if (prec == -1 && PyErr_Occurred())
                        goto onError;
                    /* A negative precision is treated as zero. */
                    if (prec < 0)
                        prec = 0;
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
                else if (c >= '0' && c <= '9') {
                    prec = c - '0';
                    while (--fmtcnt >= 0) {
                        c = *fmt++;
                        if (c < '0' || c > '9')
                            break;
                        /* Reject before prec*10 + digit can overflow. */
                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
                            PyErr_SetString(PyExc_ValueError,
                                            "prec too big");
                            goto onError;
                        }
                        prec = prec*10 + (c - '0');
                    }
                }
            } /* prec */
            /* A length modifier (h, l or L) is accepted and skipped. */
            if (fmtcnt >= 0) {
                if (c == 'h' || c == 'l' || c == 'L') {
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
            }
            /* The format string ended before a conversion character. */
            if (fmtcnt < 0) {
                PyErr_SetString(PyExc_ValueError,
                                "incomplete format");
                goto onError;
            }
            /* Every conversion except a literal '%' consumes one
               argument. */
            if (c != '%') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
            }
            sign = 0;
            fill = ' ';
            switch (c) {

            case '%':
                pbuf = formatbuf;
                /* presume that buffer length is at least 1 */
                pbuf[0] = '%';
                len = 1;
                break;

            case 's':
            case 'r':
            case 'a':
                /* An exact str needs no conversion for %s. */
                if (PyUnicode_CheckExact(v) && c == 's') {
                    temp = v;
                    Py_INCREF(temp);
                }
                else {
                    if (c == 's')
                        temp = PyObject_Str(v);
                    else if (c == 'r')
                        temp = PyObject_Repr(v);
                    else
                        temp = PyObject_ASCII(v);
                    if (temp == NULL)
                        goto onError;
                    if (PyUnicode_Check(temp))
                        /* nothing to do */;
                    else {
                        Py_DECREF(temp);
                        PyErr_SetString(PyExc_TypeError,
                                        "%s argument has non-string str()");
                        goto onError;
                    }
                }
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                /* For these conversions the precision truncates. */
                if (prec >= 0 && len > prec)
                    len = prec;
                break;

            case 'i':
            case 'd':
            case 'u':
            case 'o':
            case 'x':
            case 'X':
                isnumok = 0;
                if (PyNumber_Check(v)) {
                    PyObject *iobj=NULL;

                    if (PyLong_Check(v)) {
                        iobj = v;
                        Py_INCREF(iobj);
                    }
                    else {
                        iobj = PyNumber_Long(v);
                    }
                    if (iobj!=NULL) {
                        if (PyLong_Check(iobj)) {
                            isnumok = 1;
                            /* 'i' is formatted exactly like 'd'. */
                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
                            Py_DECREF(iobj);
                            if (!temp)
                                goto onError;
                            pbuf = PyUnicode_AS_UNICODE(temp);
                            len = PyUnicode_GET_SIZE(temp);
                            /* Mark as numeric so the sign is handled
                               below. */
                            sign = 1;
                        }
                        else {
                            Py_DECREF(iobj);
                        }
                    }
                }
                if (!isnumok) {
                    PyErr_Format(PyExc_TypeError,
                                 "%%%c format: a number is required, "
                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
                    goto onError;
                }
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'e':
            case 'E':
            case 'f':
            case 'F':
            case 'g':
            case 'G':
                temp = formatfloat(v, flags, prec, c);
                if (!temp)
                    goto onError;
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                sign = 1;
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'c':
                pbuf = formatbuf;
                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
                if (len < 0)
                    goto onError;
                break;

            default:
                /* fmt - 1 is the offending character; the index is
                   relative to the start of the format string. */
                PyErr_Format(PyExc_ValueError,
                             "unsupported format character '%c' (0x%x) "
                             "at index %zd",
                             (31<=c && c<=126) ? (char)c : '?',
                             (int)c,
                             (Py_ssize_t)(fmt - 1 -
                                          PyUnicode_AS_UNICODE(uformat)));
                goto onError;
            }
            /* For numeric conversions, turn `sign` from a flag into the
               sign character to emit (or 0 for none), stripping a
               leading sign from the formatted digits. */
            if (sign) {
                if (*pbuf == '-' || *pbuf == '+') {
                    sign = *pbuf++;
                    len--;
                }
                else if (flags & F_SIGN)
                    sign = '+';
                else if (flags & F_BLANK)
                    sign = ' ';
                else
                    sign = 0;
            }
            if (width < len)
                width = len;
            /* Grow the result when the padded field (plus sign) does
               not fit into the remaining space. */
            if (rescnt - (sign != 0) < width) {
                reslen -= rescnt;
                rescnt = width + fmtcnt + 100;
                reslen += rescnt;
                if (reslen < 0) {
                    Py_XDECREF(temp);
                    PyErr_NoMemory();
                    goto onError;
                }
                if (_PyUnicode_Resize(&result, reslen) < 0) {
                    Py_XDECREF(temp);
                    goto onError;
                }
                res = PyUnicode_AS_UNICODE(result)
                    + reslen - rescnt;
            }
            if (sign) {
                /* With a non-space fill the sign precedes the padding;
                   with space fill it is written after it (below). */
                if (fill != ' ')
                    *res++ = sign;
                rescnt--;
                if (width > len)
                    width--;
            }
            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
                /* Alternate form: the two-character prefix is handled
                   like the sign -- before the padding for a non-space
                   fill, after it otherwise. */
                assert(pbuf[0] == '0');
                assert(pbuf[1] == c);
                if (fill != ' ') {
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
                rescnt -= 2;
                width -= 2;
                if (width < 0)
                    width = 0;
                len -= 2;
            }
            /* Left padding, unless left-justified. */
            if (width > len && !(flags & F_LJUST)) {
                do {
                    --rescnt;
                    *res++ = fill;
                } while (--width > len);
            }
            if (fill == ' ') {
                if (sign)
                    *res++ = sign;
                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
                    assert(pbuf[0] == '0');
                    assert(pbuf[1] == c);
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
            }
            /* The value itself. */
            Py_UNICODE_COPY(res, pbuf, len);
            res += len;
            rescnt -= len;
            /* Right padding (always spaces) for left-justified fields. */
            while (--width >= len) {
                --rescnt;
                *res++ = ' ';
            }
            /* With a mapping in use, an argument must not remain
               unconsumed after this conversion. */
            if (dict && (argidx < arglen) && c != '%') {
                PyErr_SetString(PyExc_TypeError,
                                "not all arguments converted during string formatting");
                Py_XDECREF(temp);
                goto onError;
            }
            Py_XDECREF(temp);
        } /* '%' */
    } /* until end */
    /* Without a mapping, leftover arguments are an error. */
    if (argidx < arglen && !dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* Trim the result to the number of characters actually written. */
    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
        goto onError;
    if (args_owned) {
        Py_DECREF(args);
    }
    Py_DECREF(uformat);
    return (PyObject *)result;

  onError:
    Py_XDECREF(result);
    Py_DECREF(uformat);
    if (args_owned) {
        Py_DECREF(args);
    }
    return NULL;
}
9944
Jeremy Hylton938ace62002-07-17 16:30:39 +00009945static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009946unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9947
Tim Peters6d6c1a32001-08-02 04:15:00 +00009948static PyObject *
9949unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9950{
Benjamin Peterson29060642009-01-31 22:14:21 +00009951 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009952 static char *kwlist[] = {"object", "encoding", "errors", 0};
9953 char *encoding = NULL;
9954 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009955
Benjamin Peterson14339b62009-01-31 16:36:08 +00009956 if (type != &PyUnicode_Type)
9957 return unicode_subtype_new(type, args, kwds);
9958 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009959 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009960 return NULL;
9961 if (x == NULL)
9962 return (PyObject *)_PyUnicode_New(0);
9963 if (encoding == NULL && errors == NULL)
9964 return PyObject_Str(x);
9965 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009966 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009967}
9968
Guido van Rossume023fe02001-08-30 03:12:59 +00009969static PyObject *
9970unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9971{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009972 PyUnicodeObject *tmp, *pnew;
9973 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009974
Benjamin Peterson14339b62009-01-31 16:36:08 +00009975 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9976 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9977 if (tmp == NULL)
9978 return NULL;
9979 assert(PyUnicode_Check(tmp));
9980 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9981 if (pnew == NULL) {
9982 Py_DECREF(tmp);
9983 return NULL;
9984 }
9985 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9986 if (pnew->str == NULL) {
9987 _Py_ForgetReference((PyObject *)pnew);
9988 PyObject_Del(pnew);
9989 Py_DECREF(tmp);
9990 return PyErr_NoMemory();
9991 }
9992 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9993 pnew->length = n;
9994 pnew->hash = tmp->hash;
9995 Py_DECREF(tmp);
9996 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009997}
9998
/* Docstring of the str type (installed as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000010010
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  The initializer is positional: each entry
   fills the slot named in the comment next to it, and 0 leaves a slot
   empty. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
10056
10057/* Initialize the Unicode implementation */
10058
Thomas Wouters78890102000-07-22 19:25:51 +000010059void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010061 /* XXX - move this array to unicodectype.c ? */
10062 Py_UNICODE linebreak[] = {
10063 0x000A, /* LINE FEED */
10064 0x000D, /* CARRIAGE RETURN */
10065 0x001C, /* FILE SEPARATOR */
10066 0x001D, /* GROUP SEPARATOR */
10067 0x001E, /* RECORD SEPARATOR */
10068 0x0085, /* NEXT LINE */
10069 0x2028, /* LINE SEPARATOR */
10070 0x2029, /* PARAGRAPH SEPARATOR */
10071 };
10072
Fred Drakee4315f52000-05-09 19:53:39 +000010073 /* Init the implementation */
Serhiy Storchaka05997252013-01-26 12:14:02 +020010074 if (!unicode_empty) {
10075 unicode_empty = _PyUnicode_New(0);
10076 if (!unicode_empty)
10077 return;
10078 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010079
Guido van Rossumcacfc072002-05-24 19:01:59 +000010080 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010082
10083 /* initialize the linebreak bloom filter */
10084 bloom_linebreak = make_bloom_mask(
10085 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10086 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010087
10088 PyType_Ready(&EncodingMapType);
Benjamin Petersonc4311282012-10-30 23:21:10 -040010089
10090 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
10091 Py_FatalError("Can't initialize field name iterator type");
10092
10093 if (PyType_Ready(&PyFormatterIter_Type) < 0)
10094 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095}
10096
10097/* Finalize the Unicode implementation */
10098
Christian Heimesa156e092008-02-16 07:38:31 +000010099int
10100PyUnicode_ClearFreeList(void)
10101{
10102 int freelist_size = numfree;
10103 PyUnicodeObject *u;
10104
10105 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010106 PyUnicodeObject *v = u;
10107 u = *(PyUnicodeObject **)u;
10108 if (v->str)
10109 PyObject_DEL(v->str);
10110 Py_XDECREF(v->defenc);
10111 PyObject_Del(v);
10112 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010113 }
10114 free_list = NULL;
10115 assert(numfree == 0);
10116 return freelist_size;
10117}
10118
Guido van Rossumd57fd912000-03-10 22:53:23 +000010119void
Thomas Wouters78890102000-07-22 19:25:51 +000010120_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010122 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123
Serhiy Storchaka05997252013-01-26 12:14:02 +020010124 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010125
Serhiy Storchaka05997252013-01-26 12:14:02 +020010126 for (i = 0; i < 256; i++)
10127 Py_CLEAR(unicode_latin1[i]);
10128
Christian Heimesa156e092008-02-16 07:38:31 +000010129 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010130}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010131
Walter Dörwald16807132007-05-25 13:52:07 +000010132void
10133PyUnicode_InternInPlace(PyObject **p)
10134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010135 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10136 PyObject *t;
10137 if (s == NULL || !PyUnicode_Check(s))
10138 Py_FatalError(
10139 "PyUnicode_InternInPlace: unicode strings only please!");
10140 /* If it's a subclass, we don't really know what putting
10141 it in the interned dict might do. */
10142 if (!PyUnicode_CheckExact(s))
10143 return;
10144 if (PyUnicode_CHECK_INTERNED(s))
10145 return;
10146 if (interned == NULL) {
10147 interned = PyDict_New();
10148 if (interned == NULL) {
10149 PyErr_Clear(); /* Don't leave an exception */
10150 return;
10151 }
10152 }
10153 /* It might be that the GetItem call fails even
10154 though the key is present in the dictionary,
10155 namely when this happens during a stack overflow. */
10156 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010157 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010158 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010159
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 if (t) {
10161 Py_INCREF(t);
10162 Py_DECREF(*p);
10163 *p = t;
10164 return;
10165 }
Walter Dörwald16807132007-05-25 13:52:07 +000010166
Benjamin Peterson14339b62009-01-31 16:36:08 +000010167 PyThreadState_GET()->recursion_critical = 1;
10168 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10169 PyErr_Clear();
10170 PyThreadState_GET()->recursion_critical = 0;
10171 return;
10172 }
10173 PyThreadState_GET()->recursion_critical = 0;
10174 /* The two references in interned are not counted by refcnt.
10175 The deallocator will take care of this */
10176 Py_REFCNT(s) -= 2;
10177 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010178}
10179
10180void
10181PyUnicode_InternImmortal(PyObject **p)
10182{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010183 PyUnicode_InternInPlace(p);
10184 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10185 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10186 Py_INCREF(*p);
10187 }
Walter Dörwald16807132007-05-25 13:52:07 +000010188}
10189
10190PyObject *
10191PyUnicode_InternFromString(const char *cp)
10192{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010193 PyObject *s = PyUnicode_FromString(cp);
10194 if (s == NULL)
10195 return NULL;
10196 PyUnicode_InternInPlace(&s);
10197 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010198}
10199
10200void _Py_ReleaseInternedUnicodeStrings(void)
10201{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010202 PyObject *keys;
10203 PyUnicodeObject *s;
10204 Py_ssize_t i, n;
10205 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010206
Benjamin Peterson14339b62009-01-31 16:36:08 +000010207 if (interned == NULL || !PyDict_Check(interned))
10208 return;
10209 keys = PyDict_Keys(interned);
10210 if (keys == NULL || !PyList_Check(keys)) {
10211 PyErr_Clear();
10212 return;
10213 }
Walter Dörwald16807132007-05-25 13:52:07 +000010214
Benjamin Peterson14339b62009-01-31 16:36:08 +000010215 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10216 detector, interned unicode strings are not forcibly deallocated;
10217 rather, we give them their stolen references back, and then clear
10218 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010219
Benjamin Peterson14339b62009-01-31 16:36:08 +000010220 n = PyList_GET_SIZE(keys);
10221 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010222 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010223 for (i = 0; i < n; i++) {
10224 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10225 switch (s->state) {
10226 case SSTATE_NOT_INTERNED:
10227 /* XXX Shouldn't happen */
10228 break;
10229 case SSTATE_INTERNED_IMMORTAL:
10230 Py_REFCNT(s) += 1;
10231 immortal_size += s->length;
10232 break;
10233 case SSTATE_INTERNED_MORTAL:
10234 Py_REFCNT(s) += 2;
10235 mortal_size += s->length;
10236 break;
10237 default:
10238 Py_FatalError("Inconsistent interned string state.");
10239 }
10240 s->state = SSTATE_NOT_INTERNED;
10241 }
10242 fprintf(stderr, "total size of all interned strings: "
10243 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10244 "mortal/immortal\n", mortal_size, immortal_size);
10245 Py_DECREF(keys);
10246 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020010247 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000010248}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010249
10250
10251/********************* Unicode Iterator **************************/
10252
10253typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010254 PyObject_HEAD
10255 Py_ssize_t it_index;
10256 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010257} unicodeiterobject;
10258
10259static void
10260unicodeiter_dealloc(unicodeiterobject *it)
10261{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010262 _PyObject_GC_UNTRACK(it);
10263 Py_XDECREF(it->it_seq);
10264 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010265}
10266
10267static int
10268unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10269{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010270 Py_VISIT(it->it_seq);
10271 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010272}
10273
10274static PyObject *
10275unicodeiter_next(unicodeiterobject *it)
10276{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010277 PyUnicodeObject *seq;
10278 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010279
Benjamin Peterson14339b62009-01-31 16:36:08 +000010280 assert(it != NULL);
10281 seq = it->it_seq;
10282 if (seq == NULL)
10283 return NULL;
10284 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010285
Benjamin Peterson14339b62009-01-31 16:36:08 +000010286 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10287 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010289 if (item != NULL)
10290 ++it->it_index;
10291 return item;
10292 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010293
Benjamin Peterson14339b62009-01-31 16:36:08 +000010294 Py_DECREF(seq);
10295 it->it_seq = NULL;
10296 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010297}
10298
10299static PyObject *
10300unicodeiter_len(unicodeiterobject *it)
10301{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010302 Py_ssize_t len = 0;
10303 if (it->it_seq)
10304 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10305 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010306}
10307
10308PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10309
10310static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010311 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010312 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010313 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010314};
10315
10316PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010317 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10318 "str_iterator", /* tp_name */
10319 sizeof(unicodeiterobject), /* tp_basicsize */
10320 0, /* tp_itemsize */
10321 /* methods */
10322 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10323 0, /* tp_print */
10324 0, /* tp_getattr */
10325 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010326 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010327 0, /* tp_repr */
10328 0, /* tp_as_number */
10329 0, /* tp_as_sequence */
10330 0, /* tp_as_mapping */
10331 0, /* tp_hash */
10332 0, /* tp_call */
10333 0, /* tp_str */
10334 PyObject_GenericGetAttr, /* tp_getattro */
10335 0, /* tp_setattro */
10336 0, /* tp_as_buffer */
10337 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10338 0, /* tp_doc */
10339 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10340 0, /* tp_clear */
10341 0, /* tp_richcompare */
10342 0, /* tp_weaklistoffset */
10343 PyObject_SelfIter, /* tp_iter */
10344 (iternextfunc)unicodeiter_next, /* tp_iternext */
10345 unicodeiter_methods, /* tp_methods */
10346 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010347};
10348
10349static PyObject *
10350unicode_iter(PyObject *seq)
10351{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010352 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010353
Benjamin Peterson14339b62009-01-31 16:36:08 +000010354 if (!PyUnicode_Check(seq)) {
10355 PyErr_BadInternalCall();
10356 return NULL;
10357 }
10358 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10359 if (it == NULL)
10360 return NULL;
10361 it->it_index = 0;
10362 Py_INCREF(seq);
10363 it->it_seq = (PyUnicodeObject *)seq;
10364 _PyObject_GC_TRACK(it);
10365 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010366}
10367
/* Return the number of Py_UNICODE elements in u that precede the
   terminating zero element.

   The count is kept in a size_t to match the return type; the previous
   int counter could overflow on very long strings. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;

    while (*u++)
        res++;
    return res;
}
10376
/* Copy the zero-terminated string s2, terminator included, into s1 and
   return s1.  No bounds are checked. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    size_t pos = 0;
    Py_UNICODE ch;

    do {
        ch = s2[pos];
        s1[pos] = ch;
        pos++;
    } while (ch != 0);
    return s1;
}
/* Copy at most n elements from the zero-terminated string s2 into s1 and
   return s1.

   Copying stops after the terminating zero has been copied or after n
   elements, whichever comes first.  If s2 holds n or more non-zero
   elements, s1 is NOT zero-terminated.  Nothing is written when n is 0.

   The earlier loop tested the counter only after storing an element, so
   it could write n + 1 elements (and one element for n == 0), running
   past a destination buffer of size n. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        s1[i] = s2[i];
        if (s1[i] == 0)
            break;
    }
    return s1;
}
10394
/* Append the zero-terminated string s2 to the end of the zero-terminated
   string s1 and return s1.  No bounds are checked. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* Find the terminator of s1 ... */
    while (*tail != 0)
        tail++;
    /* ... and copy s2, terminator included, over it. */
    while ((*tail++ = *s2++) != 0)
        ;
    return s1;
}
10403
/* Compare two zero-terminated Py_UNICODE strings element by element.
   Returns -1, 0 or 1 when s1 sorts before, equal to or after s2; a string
   that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    for (; *s1 && *s2 && *s1 == *s2; s1++, s2++)
        ;

    if (*s1 == 0)
        return (*s2 == 0) ? 0 : -1;
    if (*s2 == 0)
        return 1;
    return (*s1 < *s2) ? -1 : +1;
}
10417
/* Compare at most n elements of two zero-terminated Py_UNICODE strings.
   Returns -1 or 1 at the first differing element, and 0 if the strings
   agree up to n elements or up to a shared terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE a = s1[i];
        Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings ended together */
    }
    return 0;
}
10434
/* Return a pointer to the first element of the zero-terminated string s
   that equals c, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
10444
/* Return a pointer to the last element of the zero-terminated string s
   that equals c, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass, remembering the most recent match. */
    for (; *s; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
10457
Victor Stinner71133ff2010-09-01 23:43:53 +000010458Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010459PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010460{
10461 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10462 Py_UNICODE *copy;
10463 Py_ssize_t size;
10464
10465 /* Ensure we won't overflow the size. */
10466 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10467 PyErr_NoMemory();
10468 return NULL;
10469 }
10470 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10471 size *= sizeof(Py_UNICODE);
10472 copy = PyMem_Malloc(size);
10473 if (copy == NULL) {
10474 PyErr_NoMemory();
10475 return NULL;
10476 }
10477 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10478 return copy;
10479}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010480
Georg Brandl66c221e2010-10-14 07:04:07 +000010481/* A _string module, to export formatter_parser and formatter_field_name_split
10482 to the string.Formatter class implemented in Python. */
10483
10484static PyMethodDef _string_methods[] = {
10485 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10486 METH_O, PyDoc_STR("split the argument as a field name")},
10487 {"formatter_parser", (PyCFunction) formatter_parser,
10488 METH_O, PyDoc_STR("parse the argument as a format string")},
10489 {NULL, NULL}
10490};
10491
10492static struct PyModuleDef _string_module = {
10493 PyModuleDef_HEAD_INIT,
10494 "_string",
10495 PyDoc_STR("string helper module"),
10496 0,
10497 _string_methods,
10498 NULL,
10499 NULL,
10500 NULL,
10501 NULL
10502};
10503
10504PyMODINIT_FUNC
10505PyInit__string(void)
10506{
10507 return PyModule_Create(&_string_module);
10508}
10509
10510
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010511#ifdef __cplusplus
10512}
10513#endif