blob: 19673859cf42fc2036811b08c9f95ece967a9b51 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free-list keep-alive optimization.

   Objects placed on the free list keep their character buffer allocated
   as long as their length is below this limit.  This reduces malloc()
   overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.
   The table is indexed by code point (0x00 - 0x7F); a nonzero entry marks
   a whitespace character. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0009 CHARACTER TABULATION
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR
       0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Fast detection of line break characters in the ASCII range.
   The table is indexed by code point (0x00 - 0x7F); a nonzero entry marks
   a line break character.  It is a read-only lookup table (consulted by
   BLOOM_LINEBREAK), hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init(). */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Nonzero if `chr` is a member of `set` (of length `setlen`).  The bloom
   `mask` is tested first so that most non-members skip the linear search.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators safely.  Note: `chr` is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
Walter Dörwald346737f2007-05-31 10:44:43 +0000681static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000684{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 *fmt++ = '%';
686 if (width) {
687 if (zeropad)
688 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width);
690 }
691 if (precision)
692 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag)
694 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702#else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706#endif
707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000708 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T;
710 while (*f)
711 *fmt++ = *f++;
712 }
713 *fmt++ = c;
714 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000715}
716
/* Append the NUL-terminated char string `string` to the output position `s`,
   one element per char.  Relies on two variables of the enclosing scope:
   `copy` (a const char * scratch cursor) and `s` (the output cursor, which is
   advanced past the appended characters).  Wrapped in do/while(0) so that the
   macro behaves as a single statement, including in front of an `else`. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
728
Walter Dörwaldd2034312007-05-18 16:29:38 +0000729PyObject *
730PyUnicode_FromFormatV(const char *format, va_list vargs)
731{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000732 va_list count;
733 Py_ssize_t callcount = 0;
734 PyObject **callresults = NULL;
735 PyObject **callresult = NULL;
736 Py_ssize_t n = 0;
737 int width = 0;
738 int precision = 0;
739 int zeropad;
740 const char* f;
741 Py_UNICODE *s;
742 PyObject *string;
743 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000744 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000745 /* use abuffer instead of buffer, if we need more space
746 * (which can happen if there's a format specifier with width). */
747 char *abuffer = NULL;
748 char *realbuffer;
749 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000750 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000752
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000753 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000754 /* step 1: count the number of %S/%R/%A/%s format specifications
755 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
756 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
757 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000758 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000759 if (*f == '%') {
760 if (*(f+1)=='%')
761 continue;
762 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
763 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000764 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000765 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000766 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000767 ;
768 if (*f == 's')
769 ++callcount;
770 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000771 else if (128 <= (unsigned char)*f) {
772 PyErr_Format(PyExc_ValueError,
773 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000774 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000775 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000776 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000777 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000778 }
779 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000780 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000781 if (callcount) {
782 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
783 if (!callresults) {
784 PyErr_NoMemory();
785 return NULL;
786 }
787 callresult = callresults;
788 }
789 /* step 3: figure out how large a buffer we need */
790 for (f = format; *f; f++) {
791 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000792#ifdef HAVE_LONG_LONG
793 int longlongflag = 0;
794#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 const char* p = f;
796 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000797 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000799 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000800 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000801
Benjamin Peterson14339b62009-01-31 16:36:08 +0000802 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
803 * they don't affect the amount of space we reserve.
804 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000805 if (*f == 'l') {
806 if (f[1] == 'd' || f[1] == 'u') {
807 ++f;
808 }
809#ifdef HAVE_LONG_LONG
810 else if (f[1] == 'l' &&
811 (f[2] == 'd' || f[2] == 'u')) {
812 longlongflag = 1;
813 f += 2;
814 }
815#endif
816 }
817 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000818 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000819 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000820
Benjamin Peterson14339b62009-01-31 16:36:08 +0000821 switch (*f) {
822 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000823 {
824#ifndef Py_UNICODE_WIDE
825 int ordinal = va_arg(count, int);
826 if (ordinal > 0xffff)
827 n += 2;
828 else
829 n++;
830#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000831 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000832 n++;
833#endif
834 break;
835 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000836 case '%':
837 n++;
838 break;
839 case 'd': case 'u': case 'i': case 'x':
840 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000841#ifdef HAVE_LONG_LONG
842 if (longlongflag) {
843 if (width < MAX_LONG_LONG_CHARS)
844 width = MAX_LONG_LONG_CHARS;
845 }
846 else
847#endif
848 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
849 including sign. Decimal takes the most space. This
850 isn't enough for octal. If a width is specified we
851 need more (which we allocate later). */
852 if (width < MAX_LONG_CHARS)
853 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000854 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000855 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000856 if (abuffersize < width)
857 abuffersize = width;
858 break;
859 case 's':
860 {
861 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000862 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000863 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
864 if (!str)
865 goto fail;
866 n += PyUnicode_GET_SIZE(str);
867 /* Remember the str and switch to the next slot */
868 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000869 break;
870 }
871 case 'U':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 assert(obj && PyUnicode_Check(obj));
875 n += PyUnicode_GET_SIZE(obj);
876 break;
877 }
878 case 'V':
879 {
880 PyObject *obj = va_arg(count, PyObject *);
881 const char *str = va_arg(count, const char *);
882 assert(obj || str);
883 assert(!obj || PyUnicode_Check(obj));
884 if (obj)
885 n += PyUnicode_GET_SIZE(obj);
886 else
887 n += strlen(str);
888 break;
889 }
890 case 'S':
891 {
892 PyObject *obj = va_arg(count, PyObject *);
893 PyObject *str;
894 assert(obj);
895 str = PyObject_Str(obj);
896 if (!str)
897 goto fail;
898 n += PyUnicode_GET_SIZE(str);
899 /* Remember the str and switch to the next slot */
900 *callresult++ = str;
901 break;
902 }
903 case 'R':
904 {
905 PyObject *obj = va_arg(count, PyObject *);
906 PyObject *repr;
907 assert(obj);
908 repr = PyObject_Repr(obj);
909 if (!repr)
910 goto fail;
911 n += PyUnicode_GET_SIZE(repr);
912 /* Remember the repr and switch to the next slot */
913 *callresult++ = repr;
914 break;
915 }
916 case 'A':
917 {
918 PyObject *obj = va_arg(count, PyObject *);
919 PyObject *ascii;
920 assert(obj);
921 ascii = PyObject_ASCII(obj);
922 if (!ascii)
923 goto fail;
924 n += PyUnicode_GET_SIZE(ascii);
925 /* Remember the repr and switch to the next slot */
926 *callresult++ = ascii;
927 break;
928 }
929 case 'p':
930 (void) va_arg(count, int);
931 /* maximum 64-bit pointer representation:
932 * 0xffffffffffffffff
933 * so 19 characters is enough.
934 * XXX I count 18 -- what's the extra for?
935 */
936 n += 19;
937 break;
938 default:
939 /* if we stumble upon an unknown
940 formatting code, copy the rest of
941 the format string to the output
942 string. (we cannot just skip the
943 code, since there's no way to know
944 what's in the argument list) */
945 n += strlen(p);
946 goto expand;
947 }
948 } else
949 n++;
950 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000951 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000952 if (abuffersize > ITEM_BUFFER_LEN) {
953 /* add 1 for sprintf's trailing null byte */
954 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000955 if (!abuffer) {
956 PyErr_NoMemory();
957 goto fail;
958 }
959 realbuffer = abuffer;
960 }
961 else
962 realbuffer = buffer;
963 /* step 4: fill the buffer */
964 /* Since we've analyzed how much space we need for the worst case,
965 we don't have to resize the string.
966 There can be no errors beyond this point. */
967 string = PyUnicode_FromUnicode(NULL, n);
968 if (!string)
969 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000970
Benjamin Peterson14339b62009-01-31 16:36:08 +0000971 s = PyUnicode_AS_UNICODE(string);
972 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000973
Benjamin Peterson14339b62009-01-31 16:36:08 +0000974 for (f = format; *f; f++) {
975 if (*f == '%') {
976 const char* p = f++;
977 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000978 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000979 int size_tflag = 0;
980 zeropad = (*f == '0');
981 /* parse the width.precision part */
982 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000983 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000984 width = (width*10) + *f++ - '0';
985 precision = 0;
986 if (*f == '.') {
987 f++;
David Malcolm96960882010-11-05 17:23:41 +0000988 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 precision = (precision*10) + *f++ - '0';
990 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000991 /* Handle %ld, %lu, %lld and %llu. */
992 if (*f == 'l') {
993 if (f[1] == 'd' || f[1] == 'u') {
994 longflag = 1;
995 ++f;
996 }
997#ifdef HAVE_LONG_LONG
998 else if (f[1] == 'l' &&
999 (f[2] == 'd' || f[2] == 'u')) {
1000 longlongflag = 1;
1001 f += 2;
1002 }
1003#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001004 }
1005 /* handle the size_t flag. */
1006 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1007 size_tflag = 1;
1008 ++f;
1009 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001010
Benjamin Peterson14339b62009-01-31 16:36:08 +00001011 switch (*f) {
1012 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001013 {
1014 int ordinal = va_arg(vargs, int);
1015#ifndef Py_UNICODE_WIDE
1016 if (ordinal > 0xffff) {
1017 ordinal -= 0x10000;
1018 *s++ = 0xD800 | (ordinal >> 10);
1019 *s++ = 0xDC00 | (ordinal & 0x3FF);
1020 } else
1021#endif
1022 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001023 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001024 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001025 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001026 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1027 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001028 if (longflag)
1029 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001030#ifdef HAVE_LONG_LONG
1031 else if (longlongflag)
1032 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1033#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001034 else if (size_tflag)
1035 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1036 else
1037 sprintf(realbuffer, fmt, va_arg(vargs, int));
1038 appendstring(realbuffer);
1039 break;
1040 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001041 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1042 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 if (longflag)
1044 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001045#ifdef HAVE_LONG_LONG
1046 else if (longlongflag)
1047 sprintf(realbuffer, fmt, va_arg(vargs,
1048 unsigned PY_LONG_LONG));
1049#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 else if (size_tflag)
1051 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1052 else
1053 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1054 appendstring(realbuffer);
1055 break;
1056 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001057 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001058 sprintf(realbuffer, fmt, va_arg(vargs, int));
1059 appendstring(realbuffer);
1060 break;
1061 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001062 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001063 sprintf(realbuffer, fmt, va_arg(vargs, int));
1064 appendstring(realbuffer);
1065 break;
1066 case 's':
1067 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001068 /* unused, since we already have the result */
1069 (void) va_arg(vargs, char *);
1070 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1071 PyUnicode_GET_SIZE(*callresult));
1072 s += PyUnicode_GET_SIZE(*callresult);
1073 /* We're done with the unicode()/repr() => forget it */
1074 Py_DECREF(*callresult);
1075 /* switch to next unicode()/repr() result */
1076 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001077 break;
1078 }
1079 case 'U':
1080 {
1081 PyObject *obj = va_arg(vargs, PyObject *);
1082 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1083 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1084 s += size;
1085 break;
1086 }
1087 case 'V':
1088 {
1089 PyObject *obj = va_arg(vargs, PyObject *);
1090 const char *str = va_arg(vargs, const char *);
1091 if (obj) {
1092 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1093 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1094 s += size;
1095 } else {
1096 appendstring(str);
1097 }
1098 break;
1099 }
1100 case 'S':
1101 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001102 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001103 {
1104 Py_UNICODE *ucopy;
1105 Py_ssize_t usize;
1106 Py_ssize_t upos;
1107 /* unused, since we already have the result */
1108 (void) va_arg(vargs, PyObject *);
1109 ucopy = PyUnicode_AS_UNICODE(*callresult);
1110 usize = PyUnicode_GET_SIZE(*callresult);
1111 for (upos = 0; upos<usize;)
1112 *s++ = ucopy[upos++];
1113 /* We're done with the unicode()/repr() => forget it */
1114 Py_DECREF(*callresult);
1115 /* switch to next unicode()/repr() result */
1116 ++callresult;
1117 break;
1118 }
1119 case 'p':
1120 sprintf(buffer, "%p", va_arg(vargs, void*));
1121 /* %p is ill-defined: ensure leading 0x. */
1122 if (buffer[1] == 'X')
1123 buffer[1] = 'x';
1124 else if (buffer[1] != 'x') {
1125 memmove(buffer+2, buffer, strlen(buffer)+1);
1126 buffer[0] = '0';
1127 buffer[1] = 'x';
1128 }
1129 appendstring(buffer);
1130 break;
1131 case '%':
1132 *s++ = '%';
1133 break;
1134 default:
1135 appendstring(p);
1136 goto end;
1137 }
Victor Stinner1205f272010-09-11 00:54:47 +00001138 }
Victor Stinner1205f272010-09-11 00:54:47 +00001139 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001140 *s++ = *f;
1141 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001142
Benjamin Peterson29060642009-01-31 22:14:21 +00001143 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 if (callresults)
1145 PyObject_Free(callresults);
1146 if (abuffer)
1147 PyObject_Free(abuffer);
1148 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1149 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001150 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 if (callresults) {
1152 PyObject **callresult2 = callresults;
1153 while (callresult2 < callresult) {
1154 Py_DECREF(*callresult2);
1155 ++callresult2;
1156 }
1157 PyObject_Free(callresults);
1158 }
1159 if (abuffer)
1160 PyObject_Free(abuffer);
1161 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001162}
1163
1164#undef appendstring
1165
1166PyObject *
1167PyUnicode_FromFormat(const char *format, ...)
1168{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001169 PyObject* ret;
1170 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001171
1172#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001173 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001174#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001175 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001176#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001177 ret = PyUnicode_FromFormatV(format, vargs);
1178 va_end(vargs);
1179 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001180}
1181
Victor Stinner5593d8a2010-10-02 11:11:27 +00001182/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1183 convert a Unicode object to a wide character string.
1184
1185 - If w is NULL: return the number of wide characters (including the nul
1186 character) required to convert the unicode object. Ignore size argument.
1187
1188 - Otherwise: return the number of wide characters (excluding the nul
1189 character) written into w. Write at most size wide characters (including
1190 the nul character). */
1191static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001192unicode_aswidechar(PyUnicodeObject *unicode,
1193 wchar_t *w,
1194 Py_ssize_t size)
1195{
1196#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001197 Py_ssize_t res;
1198 if (w != NULL) {
1199 res = PyUnicode_GET_SIZE(unicode);
1200 if (size > res)
1201 size = res + 1;
1202 else
1203 res = size;
1204 memcpy(w, unicode->str, size * sizeof(wchar_t));
1205 return res;
1206 }
1207 else
1208 return PyUnicode_GET_SIZE(unicode) + 1;
1209#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1210 register const Py_UNICODE *u;
1211 const Py_UNICODE *uend;
1212 const wchar_t *worig, *wend;
1213 Py_ssize_t nchar;
1214
Victor Stinner137c34c2010-09-29 10:25:54 +00001215 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001216 uend = u + PyUnicode_GET_SIZE(unicode);
1217 if (w != NULL) {
1218 worig = w;
1219 wend = w + size;
1220 while (u != uend && w != wend) {
1221 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1222 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1223 {
1224 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1225 u += 2;
1226 }
1227 else {
1228 *w = *u;
1229 u++;
1230 }
1231 w++;
1232 }
1233 if (w != wend)
1234 *w = L'\0';
1235 return w - worig;
1236 }
1237 else {
1238 nchar = 1; /* nul character at the end */
1239 while (u != uend) {
1240 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1241 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1242 u += 2;
1243 else
1244 u++;
1245 nchar++;
1246 }
1247 }
1248 return nchar;
1249#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1250 register Py_UNICODE *u, *uend, ordinal;
1251 register Py_ssize_t i;
1252 wchar_t *worig, *wend;
1253 Py_ssize_t nchar;
1254
1255 u = PyUnicode_AS_UNICODE(unicode);
1256 uend = u + PyUnicode_GET_SIZE(u);
1257 if (w != NULL) {
1258 worig = w;
1259 wend = w + size;
1260 while (u != uend && w != wend) {
1261 ordinal = *u;
1262 if (ordinal > 0xffff) {
1263 ordinal -= 0x10000;
1264 *w++ = 0xD800 | (ordinal >> 10);
1265 *w++ = 0xDC00 | (ordinal & 0x3FF);
1266 }
1267 else
1268 *w++ = ordinal;
1269 u++;
1270 }
1271 if (w != wend)
1272 *w = 0;
1273 return w - worig;
1274 }
1275 else {
1276 nchar = 1; /* nul character */
1277 while (u != uend) {
1278 if (*u > 0xffff)
1279 nchar += 2;
1280 else
1281 nchar++;
1282 u++;
1283 }
1284 return nchar;
1285 }
1286#else
1287# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001288#endif
1289}
1290
1291Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001292PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001293 wchar_t *w,
1294 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295{
1296 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 PyErr_BadInternalCall();
1298 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001300 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301}
1302
Victor Stinner137c34c2010-09-29 10:25:54 +00001303wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001304PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001305 Py_ssize_t *size)
1306{
1307 wchar_t* buffer;
1308 Py_ssize_t buflen;
1309
1310 if (unicode == NULL) {
1311 PyErr_BadInternalCall();
1312 return NULL;
1313 }
1314
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001315 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001316 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001317 PyErr_NoMemory();
1318 return NULL;
1319 }
1320
Victor Stinner137c34c2010-09-29 10:25:54 +00001321 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1322 if (buffer == NULL) {
1323 PyErr_NoMemory();
1324 return NULL;
1325 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001326 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001327 if (size != NULL)
1328 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001329 return buffer;
1330}
1331
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332#endif
1333
Alexander Belopolsky40018472011-02-26 01:02:56 +00001334PyObject *
1335PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001336{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001337 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001338
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001339 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 PyErr_SetString(PyExc_ValueError,
1341 "chr() arg not in range(0x110000)");
1342 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001344
1345#ifndef Py_UNICODE_WIDE
1346 if (ordinal > 0xffff) {
1347 ordinal -= 0x10000;
1348 s[0] = 0xD800 | (ordinal >> 10);
1349 s[1] = 0xDC00 | (ordinal & 0x3FF);
1350 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001351 }
1352#endif
1353
Hye-Shik Chang40574832004-04-06 07:24:51 +00001354 s[0] = (Py_UNICODE)ordinal;
1355 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001356}
1357
Alexander Belopolsky40018472011-02-26 01:02:56 +00001358PyObject *
1359PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001361 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001363 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 Py_INCREF(obj);
1365 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001366 }
1367 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001368 /* For a Unicode subtype that's not a Unicode object,
1369 return a true Unicode object with the same data. */
1370 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1371 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001372 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001373 PyErr_Format(PyExc_TypeError,
1374 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001375 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001376 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001377}
1378
Alexander Belopolsky40018472011-02-26 01:02:56 +00001379PyObject *
1380PyUnicode_FromEncodedObject(register PyObject *obj,
1381 const char *encoding,
1382 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001383{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001384 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001385 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001386
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001388 PyErr_BadInternalCall();
1389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001391
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001392 /* Decoding bytes objects is the most common case and should be fast */
1393 if (PyBytes_Check(obj)) {
1394 if (PyBytes_GET_SIZE(obj) == 0) {
1395 Py_INCREF(unicode_empty);
1396 v = (PyObject *) unicode_empty;
1397 }
1398 else {
1399 v = PyUnicode_Decode(
1400 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1401 encoding, errors);
1402 }
1403 return v;
1404 }
1405
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001406 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001407 PyErr_SetString(PyExc_TypeError,
1408 "decoding str is not supported");
1409 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001410 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001411
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001412 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1413 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1414 PyErr_Format(PyExc_TypeError,
1415 "coercing to str: need bytes, bytearray "
1416 "or buffer-like object, %.80s found",
1417 Py_TYPE(obj)->tp_name);
1418 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001419 }
Tim Petersced69f82003-09-16 20:30:58 +00001420
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001421 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001422 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001423 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 }
Tim Petersced69f82003-09-16 20:30:58 +00001425 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001426 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001427
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001428 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001429 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430}
1431
/* Copy encoding into lower, converting upper case letters to lower case and
   replacing '_' with '-', so that e.g. "UTF_8" becomes "utf-8".  lower_len is
   the size of the lower buffer, including room for the terminating NUL.
   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1); in the latter case lower is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last usable slot, reserved for the NUL terminator */
    char *dst_end = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        if (dst == dst_end)
            return 0;
        if (Py_ISUPPER(*src))
            *dst++ = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst++ = '-';
        else
            *dst++ = *src;
    }
    *dst = '\0';
    return 1;
}
1464
Alexander Belopolsky40018472011-02-26 01:02:56 +00001465PyObject *
1466PyUnicode_Decode(const char *s,
1467 Py_ssize_t size,
1468 const char *encoding,
1469 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001470{
1471 PyObject *buffer = NULL, *unicode;
1472 Py_buffer info;
1473 char lower[11]; /* Enough for any encoding shortcut */
1474
1475 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001476 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001477
1478 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001479 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001480 if ((strcmp(lower, "utf-8") == 0) ||
1481 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001482 return PyUnicode_DecodeUTF8(s, size, errors);
1483 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001484 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001485 (strcmp(lower, "iso-8859-1") == 0))
1486 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001487#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001488 else if (strcmp(lower, "mbcs") == 0)
1489 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001490#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001491 else if (strcmp(lower, "ascii") == 0)
1492 return PyUnicode_DecodeASCII(s, size, errors);
1493 else if (strcmp(lower, "utf-16") == 0)
1494 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1495 else if (strcmp(lower, "utf-32") == 0)
1496 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498
1499 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001500 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001501 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001502 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001503 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 if (buffer == NULL)
1505 goto onError;
1506 unicode = PyCodec_Decode(buffer, encoding, errors);
1507 if (unicode == NULL)
1508 goto onError;
1509 if (!PyUnicode_Check(unicode)) {
1510 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001511 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001512 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513 Py_DECREF(unicode);
1514 goto onError;
1515 }
1516 Py_DECREF(buffer);
1517 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001518
Benjamin Peterson29060642009-01-31 22:14:21 +00001519 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520 Py_XDECREF(buffer);
1521 return NULL;
1522}
1523
Alexander Belopolsky40018472011-02-26 01:02:56 +00001524PyObject *
1525PyUnicode_AsDecodedObject(PyObject *unicode,
1526 const char *encoding,
1527 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001528{
1529 PyObject *v;
1530
1531 if (!PyUnicode_Check(unicode)) {
1532 PyErr_BadArgument();
1533 goto onError;
1534 }
1535
1536 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001537 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001538
1539 /* Decode via the codec registry */
1540 v = PyCodec_Decode(unicode, encoding, errors);
1541 if (v == NULL)
1542 goto onError;
1543 return v;
1544
Benjamin Peterson29060642009-01-31 22:14:21 +00001545 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001546 return NULL;
1547}
1548
Alexander Belopolsky40018472011-02-26 01:02:56 +00001549PyObject *
1550PyUnicode_AsDecodedUnicode(PyObject *unicode,
1551 const char *encoding,
1552 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001553{
1554 PyObject *v;
1555
1556 if (!PyUnicode_Check(unicode)) {
1557 PyErr_BadArgument();
1558 goto onError;
1559 }
1560
1561 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001562 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001563
1564 /* Decode via the codec registry */
1565 v = PyCodec_Decode(unicode, encoding, errors);
1566 if (v == NULL)
1567 goto onError;
1568 if (!PyUnicode_Check(v)) {
1569 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001570 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001571 Py_TYPE(v)->tp_name);
1572 Py_DECREF(v);
1573 goto onError;
1574 }
1575 return v;
1576
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001578 return NULL;
1579}
1580
Alexander Belopolsky40018472011-02-26 01:02:56 +00001581PyObject *
1582PyUnicode_Encode(const Py_UNICODE *s,
1583 Py_ssize_t size,
1584 const char *encoding,
1585 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586{
1587 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001588
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 unicode = PyUnicode_FromUnicode(s, size);
1590 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1593 Py_DECREF(unicode);
1594 return v;
1595}
1596
Alexander Belopolsky40018472011-02-26 01:02:56 +00001597PyObject *
1598PyUnicode_AsEncodedObject(PyObject *unicode,
1599 const char *encoding,
1600 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001601{
1602 PyObject *v;
1603
1604 if (!PyUnicode_Check(unicode)) {
1605 PyErr_BadArgument();
1606 goto onError;
1607 }
1608
1609 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001610 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001611
1612 /* Encode via the codec registry */
1613 v = PyCodec_Encode(unicode, encoding, errors);
1614 if (v == NULL)
1615 goto onError;
1616 return v;
1617
Benjamin Peterson29060642009-01-31 22:14:21 +00001618 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001619 return NULL;
1620}
1621
Victor Stinnerad158722010-10-27 00:25:46 +00001622PyObject *
1623PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001624{
Victor Stinner313a1202010-06-11 23:56:51 +00001625#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001626 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1627 PyUnicode_GET_SIZE(unicode),
1628 NULL);
1629#elif defined(__APPLE__)
1630 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1631 PyUnicode_GET_SIZE(unicode),
1632 "surrogateescape");
1633#else
1634 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001635 return PyUnicode_AsEncodedString(unicode,
1636 Py_FileSystemDefaultEncoding,
1637 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001638 }
1639 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001640 /* locale encoding with surrogateescape */
1641 wchar_t *wchar;
1642 char *bytes;
1643 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001644 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001645
1646 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1647 if (wchar == NULL)
1648 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001649 bytes = _Py_wchar2char(wchar, &error_pos);
1650 if (bytes == NULL) {
1651 if (error_pos != (size_t)-1) {
1652 char *errmsg = strerror(errno);
1653 PyObject *exc = NULL;
1654 if (errmsg == NULL)
1655 errmsg = "Py_wchar2char() failed";
1656 raise_encode_exception(&exc,
1657 "filesystemencoding",
1658 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1659 error_pos, error_pos+1,
1660 errmsg);
1661 Py_XDECREF(exc);
1662 }
1663 else
1664 PyErr_NoMemory();
1665 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001666 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001667 }
1668 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001669
1670 bytes_obj = PyBytes_FromString(bytes);
1671 PyMem_Free(bytes);
1672 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001673 }
Victor Stinnerad158722010-10-27 00:25:46 +00001674#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001675}
1676
Alexander Belopolsky40018472011-02-26 01:02:56 +00001677PyObject *
1678PyUnicode_AsEncodedString(PyObject *unicode,
1679 const char *encoding,
1680 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001681{
1682 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001683 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001684
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!PyUnicode_Check(unicode)) {
1686 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001687 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 }
Fred Drakee4315f52000-05-09 19:53:39 +00001689
Tim Petersced69f82003-09-16 20:30:58 +00001690 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001691 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1692 PyUnicode_GET_SIZE(unicode),
1693 errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001694
1695 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001696 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001697 if ((strcmp(lower, "utf-8") == 0) ||
1698 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001699 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1700 PyUnicode_GET_SIZE(unicode),
1701 errors);
1702 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001703 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001704 (strcmp(lower, "iso-8859-1") == 0))
1705 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1706 PyUnicode_GET_SIZE(unicode),
1707 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001708#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001709 else if (strcmp(lower, "mbcs") == 0)
1710 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1711 PyUnicode_GET_SIZE(unicode),
1712 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001713#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001714 else if (strcmp(lower, "ascii") == 0)
1715 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1716 PyUnicode_GET_SIZE(unicode),
1717 errors);
1718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719
1720 /* Encode via the codec registry */
1721 v = PyCodec_Encode(unicode, encoding, errors);
1722 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001723 return NULL;
1724
1725 /* The normal path */
1726 if (PyBytes_Check(v))
1727 return v;
1728
1729 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001730 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001731 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001732 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001733
1734 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1735 "encoder %s returned bytearray instead of bytes",
1736 encoding);
1737 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001738 Py_DECREF(v);
1739 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001740 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001741
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001742 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1743 Py_DECREF(v);
1744 return b;
1745 }
1746
1747 PyErr_Format(PyExc_TypeError,
1748 "encoder did not return a bytes object (type=%.400s)",
1749 Py_TYPE(v)->tp_name);
1750 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001751 return NULL;
1752}
1753
Alexander Belopolsky40018472011-02-26 01:02:56 +00001754PyObject *
1755PyUnicode_AsEncodedUnicode(PyObject *unicode,
1756 const char *encoding,
1757 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001758{
1759 PyObject *v;
1760
1761 if (!PyUnicode_Check(unicode)) {
1762 PyErr_BadArgument();
1763 goto onError;
1764 }
1765
1766 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001768
1769 /* Encode via the codec registry */
1770 v = PyCodec_Encode(unicode, encoding, errors);
1771 if (v == NULL)
1772 goto onError;
1773 if (!PyUnicode_Check(v)) {
1774 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001775 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001776 Py_TYPE(v)->tp_name);
1777 Py_DECREF(v);
1778 goto onError;
1779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001781
Benjamin Peterson29060642009-01-31 22:14:21 +00001782 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783 return NULL;
1784}
1785
Alexander Belopolsky40018472011-02-26 01:02:56 +00001786PyObject *
1787_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1788 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001789{
1790 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001791 if (v)
1792 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001793 if (errors != NULL)
1794 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001795 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001796 PyUnicode_GET_SIZE(unicode),
1797 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001798 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001799 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001800 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001801 return v;
1802}
1803
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001804PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001805PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001806 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001807 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1808}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001809
Christian Heimes5894ba72007-11-04 11:43:14 +00001810PyObject*
1811PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1812{
Victor Stinnerad158722010-10-27 00:25:46 +00001813#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1814 return PyUnicode_DecodeMBCS(s, size, NULL);
1815#elif defined(__APPLE__)
1816 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1817#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001818 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1819 can be undefined. If it is case, decode using UTF-8. The following assumes
1820 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1821 bootstrapping process where the codecs aren't ready yet.
1822 */
1823 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001824 return PyUnicode_Decode(s, size,
1825 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001826 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001827 }
1828 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001829 /* locale encoding with surrogateescape */
1830 wchar_t *wchar;
1831 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001832 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001833
1834 if (s[size] != '\0' || size != strlen(s)) {
1835 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1836 return NULL;
1837 }
1838
Victor Stinner168e1172010-10-16 23:16:16 +00001839 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001840 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001841 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001842
Victor Stinner168e1172010-10-16 23:16:16 +00001843 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001844 PyMem_Free(wchar);
1845 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001846 }
Victor Stinnerad158722010-10-27 00:25:46 +00001847#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001848}
1849
Martin v. Löwis011e8422009-05-05 04:43:17 +00001850
1851int
1852PyUnicode_FSConverter(PyObject* arg, void* addr)
1853{
1854 PyObject *output = NULL;
1855 Py_ssize_t size;
1856 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001857 if (arg == NULL) {
1858 Py_DECREF(*(PyObject**)addr);
1859 return 1;
1860 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001861 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001862 output = arg;
1863 Py_INCREF(output);
1864 }
1865 else {
1866 arg = PyUnicode_FromObject(arg);
1867 if (!arg)
1868 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001869 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001870 Py_DECREF(arg);
1871 if (!output)
1872 return 0;
1873 if (!PyBytes_Check(output)) {
1874 Py_DECREF(output);
1875 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1876 return 0;
1877 }
1878 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001879 size = PyBytes_GET_SIZE(output);
1880 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001881 if (size != strlen(data)) {
1882 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1883 Py_DECREF(output);
1884 return 0;
1885 }
1886 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001887 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001888}
1889
1890
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001891int
1892PyUnicode_FSDecoder(PyObject* arg, void* addr)
1893{
1894 PyObject *output = NULL;
1895 Py_ssize_t size;
1896 void *data;
1897 if (arg == NULL) {
1898 Py_DECREF(*(PyObject**)addr);
1899 return 1;
1900 }
1901 if (PyUnicode_Check(arg)) {
1902 output = arg;
1903 Py_INCREF(output);
1904 }
1905 else {
1906 arg = PyBytes_FromObject(arg);
1907 if (!arg)
1908 return 0;
1909 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1910 PyBytes_GET_SIZE(arg));
1911 Py_DECREF(arg);
1912 if (!output)
1913 return 0;
1914 if (!PyUnicode_Check(output)) {
1915 Py_DECREF(output);
1916 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1917 return 0;
1918 }
1919 }
1920 size = PyUnicode_GET_SIZE(output);
1921 data = PyUnicode_AS_UNICODE(output);
1922 if (size != Py_UNICODE_strlen(data)) {
1923 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1924 Py_DECREF(output);
1925 return 0;
1926 }
1927 *(PyObject**)addr = output;
1928 return Py_CLEANUP_SUPPORTED;
1929}
1930
1931
Martin v. Löwis5b222132007-06-10 09:51:05 +00001932char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001933_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001934{
Christian Heimesf3863112007-11-22 07:46:41 +00001935 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001936 if (!PyUnicode_Check(unicode)) {
1937 PyErr_BadArgument();
1938 return NULL;
1939 }
Christian Heimesf3863112007-11-22 07:46:41 +00001940 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1941 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001942 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001943 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001944 *psize = PyBytes_GET_SIZE(bytes);
1945 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001946}
1947
1948char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001949_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001950{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001951 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001952}
1953
Alexander Belopolsky40018472011-02-26 01:02:56 +00001954Py_UNICODE *
1955PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956{
1957 if (!PyUnicode_Check(unicode)) {
1958 PyErr_BadArgument();
1959 goto onError;
1960 }
1961 return PyUnicode_AS_UNICODE(unicode);
1962
Benjamin Peterson29060642009-01-31 22:14:21 +00001963 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 return NULL;
1965}
1966
Alexander Belopolsky40018472011-02-26 01:02:56 +00001967Py_ssize_t
1968PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969{
1970 if (!PyUnicode_Check(unicode)) {
1971 PyErr_BadArgument();
1972 goto onError;
1973 }
1974 return PyUnicode_GET_SIZE(unicode);
1975
Benjamin Peterson29060642009-01-31 22:14:21 +00001976 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 return -1;
1978}
1979
/* Return the name of the default encoding.  The value is the constant
   string "utf-8"; the returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1985
Victor Stinner554f3f02010-06-16 23:33:54 +00001986/* create or adjust a UnicodeDecodeError */
1987static void
1988make_decode_exception(PyObject **exceptionObject,
1989 const char *encoding,
1990 const char *input, Py_ssize_t length,
1991 Py_ssize_t startpos, Py_ssize_t endpos,
1992 const char *reason)
1993{
1994 if (*exceptionObject == NULL) {
1995 *exceptionObject = PyUnicodeDecodeError_Create(
1996 encoding, input, length, startpos, endpos, reason);
1997 }
1998 else {
1999 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2000 goto onError;
2001 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2002 goto onError;
2003 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2004 goto onError;
2005 }
2006 return;
2007
2008onError:
2009 Py_DECREF(*exceptionObject);
2010 *exceptionObject = NULL;
2011}
2012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013/* error handling callback helper:
2014 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002015 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016 and adjust various state variables.
2017 return 0 on success, -1 on error
2018*/
2019
Alexander Belopolsky40018472011-02-26 01:02:56 +00002020static int
2021unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2022 const char *encoding, const char *reason,
2023 const char **input, const char **inend, Py_ssize_t *startinpos,
2024 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2025 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002027 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028
2029 PyObject *restuple = NULL;
2030 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002031 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002032 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002033 Py_ssize_t requiredsize;
2034 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002036 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002037 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038 int res = -1;
2039
2040 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002041 *errorHandler = PyCodec_LookupError(errors);
2042 if (*errorHandler == NULL)
2043 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 }
2045
Victor Stinner554f3f02010-06-16 23:33:54 +00002046 make_decode_exception(exceptionObject,
2047 encoding,
2048 *input, *inend - *input,
2049 *startinpos, *endinpos,
2050 reason);
2051 if (*exceptionObject == NULL)
2052 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053
2054 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2055 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002058 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 }
2061 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002062 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002063
2064 /* Copy back the bytes variables, which might have been modified by the
2065 callback */
2066 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2067 if (!inputobj)
2068 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002069 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002070 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002071 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002072 *input = PyBytes_AS_STRING(inputobj);
2073 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002074 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002075 /* we can DECREF safely, as the exception has another reference,
2076 so the object won't go away. */
2077 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002080 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002081 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2083 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002084 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085
2086 /* need more space? (at least enough for what we
2087 have+the replacement+the rest of the string (starting
2088 at the new input position), so we won't have to check space
2089 when there are no errors in the rest of the string) */
2090 repptr = PyUnicode_AS_UNICODE(repunicode);
2091 repsize = PyUnicode_GET_SIZE(repunicode);
2092 requiredsize = *outpos + repsize + insize-newpos;
2093 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002094 if (requiredsize<2*outsize)
2095 requiredsize = 2*outsize;
2096 if (_PyUnicode_Resize(output, requiredsize) < 0)
2097 goto onError;
2098 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002099 }
2100 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002101 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 Py_UNICODE_COPY(*outptr, repptr, repsize);
2103 *outptr += repsize;
2104 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002106 /* we made it! */
2107 res = 0;
2108
Benjamin Peterson29060642009-01-31 22:14:21 +00002109 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002110 Py_XDECREF(restuple);
2111 return res;
2112}
2113
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002114/* --- UTF-7 Codec -------------------------------------------------------- */
2115
Antoine Pitrou244651a2009-05-04 18:56:13 +00002116/* See RFC2152 for details. We encode conservatively and decode liberally. */
2117
/* Three simple macros defining base-64, plus the direct-decoding test.
   Each macro may evaluate its argument more than once, so the argument
   should be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   (any character outside the tested ranges maps to 63) */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
2148
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever
 * read, hence const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; values outside 1..127 are never direct
 *             (the range test runs before the table lookup).
 * directO  -- non-zero: Set O characters are encoded as themselves.
 * directWS -- non-zero: whitespace characters are encoded as themselves.
 *
 * All three arguments are parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) are treated as a unit.  c is
 * evaluated more than once and should be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002194
Alexander Belopolsky40018472011-02-26 01:02:56 +00002195PyObject *
2196PyUnicode_DecodeUTF7(const char *s,
2197 Py_ssize_t size,
2198 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002199{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002200 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2201}
2202
Antoine Pitrou244651a2009-05-04 18:56:13 +00002203/* The decoder. The only state we preserve is our read position,
2204 * i.e. how many characters we have consumed. So if we end in the
2205 * middle of a shift sequence we have to back off the read position
2206 * and the output to the beginning of the sequence, otherwise we lose
2207 * all the shift state (seen bits, number of bits seen, high
2208 * surrogate). */
2209
Alexander Belopolsky40018472011-02-26 01:02:56 +00002210PyObject *
2211PyUnicode_DecodeUTF7Stateful(const char *s,
2212 Py_ssize_t size,
2213 const char *errors,
2214 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002215{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002216 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002217 Py_ssize_t startinpos;
2218 Py_ssize_t endinpos;
2219 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002220 const char *e;
2221 PyUnicodeObject *unicode;
2222 Py_UNICODE *p;
2223 const char *errmsg = "";
2224 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002225 Py_UNICODE *shiftOutStart;
2226 unsigned int base64bits = 0;
2227 unsigned long base64buffer = 0;
2228 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002229 PyObject *errorHandler = NULL;
2230 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002231
2232 unicode = _PyUnicode_New(size);
2233 if (!unicode)
2234 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002235 if (size == 0) {
2236 if (consumed)
2237 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002238 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002239 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002240
2241 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002242 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002243 e = s + size;
2244
2245 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002246 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002247 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002248 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002249
Antoine Pitrou244651a2009-05-04 18:56:13 +00002250 if (inShift) { /* in a base-64 section */
2251 if (IS_BASE64(ch)) { /* consume a base-64 character */
2252 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2253 base64bits += 6;
2254 s++;
2255 if (base64bits >= 16) {
2256 /* we have enough bits for a UTF-16 value */
2257 Py_UNICODE outCh = (Py_UNICODE)
2258 (base64buffer >> (base64bits-16));
2259 base64bits -= 16;
2260 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2261 if (surrogate) {
2262 /* expecting a second surrogate */
2263 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2264#ifdef Py_UNICODE_WIDE
2265 *p++ = (((surrogate & 0x3FF)<<10)
2266 | (outCh & 0x3FF)) + 0x10000;
2267#else
2268 *p++ = surrogate;
2269 *p++ = outCh;
2270#endif
2271 surrogate = 0;
2272 }
2273 else {
2274 surrogate = 0;
2275 errmsg = "second surrogate missing";
2276 goto utf7Error;
2277 }
2278 }
2279 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2280 /* first surrogate */
2281 surrogate = outCh;
2282 }
2283 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2284 errmsg = "unexpected second surrogate";
2285 goto utf7Error;
2286 }
2287 else {
2288 *p++ = outCh;
2289 }
2290 }
2291 }
2292 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002293 inShift = 0;
2294 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002295 if (surrogate) {
2296 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002297 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002298 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002299 if (base64bits > 0) { /* left-over bits */
2300 if (base64bits >= 6) {
2301 /* We've seen at least one base-64 character */
2302 errmsg = "partial character in shift sequence";
2303 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002304 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002305 else {
2306 /* Some bits remain; they should be zero */
2307 if (base64buffer != 0) {
2308 errmsg = "non-zero padding bits in shift sequence";
2309 goto utf7Error;
2310 }
2311 }
2312 }
2313 if (ch != '-') {
2314 /* '-' is absorbed; other terminating
2315 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002316 *p++ = ch;
2317 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002318 }
2319 }
2320 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002321 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002322 s++; /* consume '+' */
2323 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002324 s++;
2325 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002326 }
2327 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002328 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002329 shiftOutStart = p;
2330 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002331 }
2332 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002333 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002334 *p++ = ch;
2335 s++;
2336 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002337 else {
2338 startinpos = s-starts;
2339 s++;
2340 errmsg = "unexpected special character";
2341 goto utf7Error;
2342 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002343 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002344utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002345 outpos = p-PyUnicode_AS_UNICODE(unicode);
2346 endinpos = s-starts;
2347 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002348 errors, &errorHandler,
2349 "utf7", errmsg,
2350 &starts, &e, &startinpos, &endinpos, &exc, &s,
2351 &unicode, &outpos, &p))
2352 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002353 }
2354
Antoine Pitrou244651a2009-05-04 18:56:13 +00002355 /* end of string */
2356
2357 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2358 /* if we're in an inconsistent state, that's an error */
2359 if (surrogate ||
2360 (base64bits >= 6) ||
2361 (base64bits > 0 && base64buffer != 0)) {
2362 outpos = p-PyUnicode_AS_UNICODE(unicode);
2363 endinpos = size;
2364 if (unicode_decode_call_errorhandler(
2365 errors, &errorHandler,
2366 "utf7", "unterminated shift sequence",
2367 &starts, &e, &startinpos, &endinpos, &exc, &s,
2368 &unicode, &outpos, &p))
2369 goto onError;
2370 if (s < e)
2371 goto restart;
2372 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002373 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002374
2375 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002376 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002377 if (inShift) {
2378 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002379 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002380 }
2381 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002382 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002383 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002384 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002385
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002386 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002387 goto onError;
2388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002389 Py_XDECREF(errorHandler);
2390 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002391 return (PyObject *)unicode;
2392
Benjamin Peterson29060642009-01-31 22:14:21 +00002393 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002394 Py_XDECREF(errorHandler);
2395 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002396 Py_DECREF(unicode);
2397 return NULL;
2398}
2399
2400
Alexander Belopolsky40018472011-02-26 01:02:56 +00002401PyObject *
2402PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2403 Py_ssize_t size,
2404 int base64SetO,
2405 int base64WhiteSpace,
2406 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002407{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002408 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002409 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002410 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002411 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002412 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002413 unsigned int base64bits = 0;
2414 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415 char * out;
2416 char * start;
2417
2418 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002420
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002421 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002422 return PyErr_NoMemory();
2423
Antoine Pitrou244651a2009-05-04 18:56:13 +00002424 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002425 if (v == NULL)
2426 return NULL;
2427
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002428 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002429 for (;i < size; ++i) {
2430 Py_UNICODE ch = s[i];
2431
Antoine Pitrou244651a2009-05-04 18:56:13 +00002432 if (inShift) {
2433 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2434 /* shifting out */
2435 if (base64bits) { /* output remaining bits */
2436 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2437 base64buffer = 0;
2438 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002439 }
2440 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002441 /* Characters not in the BASE64 set implicitly unshift the sequence
2442 so no '-' is required, except if the character is itself a '-' */
2443 if (IS_BASE64(ch) || ch == '-') {
2444 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002445 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002446 *out++ = (char) ch;
2447 }
2448 else {
2449 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002450 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002451 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002452 else { /* not in a shift sequence */
2453 if (ch == '+') {
2454 *out++ = '+';
2455 *out++ = '-';
2456 }
2457 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2458 *out++ = (char) ch;
2459 }
2460 else {
2461 *out++ = '+';
2462 inShift = 1;
2463 goto encode_char;
2464 }
2465 }
2466 continue;
2467encode_char:
2468#ifdef Py_UNICODE_WIDE
2469 if (ch >= 0x10000) {
2470 /* code first surrogate */
2471 base64bits += 16;
2472 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2473 while (base64bits >= 6) {
2474 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2475 base64bits -= 6;
2476 }
2477 /* prepare second surrogate */
2478 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2479 }
2480#endif
2481 base64bits += 16;
2482 base64buffer = (base64buffer << 16) | ch;
2483 while (base64bits >= 6) {
2484 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2485 base64bits -= 6;
2486 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002487 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002488 if (base64bits)
2489 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2490 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002491 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002492 if (_PyBytes_Resize(&v, out - start) < 0)
2493 return NULL;
2494 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002495}
2496
Antoine Pitrou244651a2009-05-04 18:56:13 +00002497#undef IS_BASE64
2498#undef FROM_BASE64
2499#undef TO_BASE64
2500#undef DECODE_DIRECT
2501#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002502
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503/* --- UTF-8 Codec -------------------------------------------------------- */
2504
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero marks a byte that cannot start a sequence (continuation bytes,
   the overlong prefixes C0/C1, and F5..FF).  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 illegal, C2-CF */
                0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4, F5-FF illegal */
                4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
2526
Alexander Belopolsky40018472011-02-26 01:02:56 +00002527PyObject *
2528PyUnicode_DecodeUTF8(const char *s,
2529 Py_ssize_t size,
2530 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531{
Walter Dörwald69652032004-09-07 20:24:22 +00002532 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2533}
2534
Antoine Pitrouab868312009-01-10 15:40:25 +00002535/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2536#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2537
2538/* Mask to quickly check whether a C 'long' contains a
2539 non-ASCII, UTF8-encoded char. */
2540#if (SIZEOF_LONG == 8)
2541# define ASCII_CHAR_MASK 0x8080808080808080L
2542#elif (SIZEOF_LONG == 4)
2543# define ASCII_CHAR_MASK 0x80808080L
2544#else
2545# error C 'long' size should be either 4 or 8!
2546#endif
2547
Alexander Belopolsky40018472011-02-26 01:02:56 +00002548PyObject *
2549PyUnicode_DecodeUTF8Stateful(const char *s,
2550 Py_ssize_t size,
2551 const char *errors,
2552 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002553{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002554 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002556 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002557 Py_ssize_t startinpos;
2558 Py_ssize_t endinpos;
2559 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002560 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 PyUnicodeObject *unicode;
2562 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002563 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564 PyObject *errorHandler = NULL;
2565 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566
2567 /* Note: size will always be longer than the resulting Unicode
2568 character count */
2569 unicode = _PyUnicode_New(size);
2570 if (!unicode)
2571 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002572 if (size == 0) {
2573 if (consumed)
2574 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577
2578 /* Unpack UTF-8 encoded data */
2579 p = unicode->str;
2580 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002581 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582
2583 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002584 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585
2586 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002587 /* Fast path for runs of ASCII characters. Given that common UTF-8
2588 input will consist of an overwhelming majority of ASCII
2589 characters, we try to optimize for this case by checking
2590 as many characters as a C 'long' can contain.
2591 First, check if we can do an aligned read, as most CPUs have
2592 a penalty for unaligned reads.
2593 */
2594 if (!((size_t) s & LONG_PTR_MASK)) {
2595 /* Help register allocation */
2596 register const char *_s = s;
2597 register Py_UNICODE *_p = p;
2598 while (_s < aligned_end) {
2599 /* Read a whole long at a time (either 4 or 8 bytes),
2600 and do a fast unrolled copy if it only contains ASCII
2601 characters. */
2602 unsigned long data = *(unsigned long *) _s;
2603 if (data & ASCII_CHAR_MASK)
2604 break;
2605 _p[0] = (unsigned char) _s[0];
2606 _p[1] = (unsigned char) _s[1];
2607 _p[2] = (unsigned char) _s[2];
2608 _p[3] = (unsigned char) _s[3];
2609#if (SIZEOF_LONG == 8)
2610 _p[4] = (unsigned char) _s[4];
2611 _p[5] = (unsigned char) _s[5];
2612 _p[6] = (unsigned char) _s[6];
2613 _p[7] = (unsigned char) _s[7];
2614#endif
2615 _s += SIZEOF_LONG;
2616 _p += SIZEOF_LONG;
2617 }
2618 s = _s;
2619 p = _p;
2620 if (s == e)
2621 break;
2622 ch = (unsigned char)*s;
2623 }
2624 }
2625
2626 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002627 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628 s++;
2629 continue;
2630 }
2631
2632 n = utf8_code_length[ch];
2633
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002634 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002635 if (consumed)
2636 break;
2637 else {
2638 errmsg = "unexpected end of data";
2639 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002640 endinpos = startinpos+1;
2641 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2642 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002643 goto utf8Error;
2644 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002645 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002646
2647 switch (n) {
2648
2649 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002650 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002651 startinpos = s-starts;
2652 endinpos = startinpos+1;
2653 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654
2655 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002656 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002657 startinpos = s-starts;
2658 endinpos = startinpos+1;
2659 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660
2661 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002662 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002663 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002664 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002665 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002666 goto utf8Error;
2667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002669 assert ((ch > 0x007F) && (ch <= 0x07FF));
2670 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 break;
2672
2673 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002674 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2675 will result in surrogates in range d800-dfff. Surrogates are
2676 not valid UTF-8 so they are rejected.
2677 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2678 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002679 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002680 (s[2] & 0xc0) != 0x80 ||
2681 ((unsigned char)s[0] == 0xE0 &&
2682 (unsigned char)s[1] < 0xA0) ||
2683 ((unsigned char)s[0] == 0xED &&
2684 (unsigned char)s[1] > 0x9F)) {
2685 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002687 endinpos = startinpos + 1;
2688
2689 /* if s[1] first two bits are 1 and 0, then the invalid
2690 continuation byte is s[2], so increment endinpos by 1,
2691 if not, s[1] is invalid and endinpos doesn't need to
2692 be incremented. */
2693 if ((s[1] & 0xC0) == 0x80)
2694 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002695 goto utf8Error;
2696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002698 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2699 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002700 break;
2701
2702 case 4:
2703 if ((s[1] & 0xc0) != 0x80 ||
2704 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002705 (s[3] & 0xc0) != 0x80 ||
2706 ((unsigned char)s[0] == 0xF0 &&
2707 (unsigned char)s[1] < 0x90) ||
2708 ((unsigned char)s[0] == 0xF4 &&
2709 (unsigned char)s[1] > 0x8F)) {
2710 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002712 endinpos = startinpos + 1;
2713 if ((s[1] & 0xC0) == 0x80) {
2714 endinpos++;
2715 if ((s[2] & 0xC0) == 0x80)
2716 endinpos++;
2717 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 goto utf8Error;
2719 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002720 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002721 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2722 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2723
Fredrik Lundh8f455852001-06-27 18:59:43 +00002724#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002725 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002726#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002727 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002728
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002729 /* translate from 10000..10FFFF to 0..FFFF */
2730 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002731
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002732 /* high surrogate = top 10 bits added to D800 */
2733 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002734
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002735 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002736 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002737#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 }
2740 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002741 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002742
Benjamin Peterson29060642009-01-31 22:14:21 +00002743 utf8Error:
2744 outpos = p-PyUnicode_AS_UNICODE(unicode);
2745 if (unicode_decode_call_errorhandler(
2746 errors, &errorHandler,
2747 "utf8", errmsg,
2748 &starts, &e, &startinpos, &endinpos, &exc, &s,
2749 &unicode, &outpos, &p))
2750 goto onError;
2751 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 }
Walter Dörwald69652032004-09-07 20:24:22 +00002753 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002754 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755
2756 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002757 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 goto onError;
2759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 Py_XDECREF(errorHandler);
2761 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 return (PyObject *)unicode;
2763
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002765 Py_XDECREF(errorHandler);
2766 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 Py_DECREF(unicode);
2768 return NULL;
2769}
2770
Antoine Pitrouab868312009-01-10 15:40:25 +00002771#undef ASCII_CHAR_MASK
2772
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002773#ifdef __APPLE__
2774
2775/* Simplified UTF-8 decoder using surrogateescape error handler,
2776 used to decode the command line arguments on Mac OS X. */
2777
2778wchar_t*
2779_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2780{
2781 int n;
2782 const char *e;
2783 wchar_t *unicode, *p;
2784
2785 /* Note: size will always be longer than the resulting Unicode
2786 character count */
2787 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2788 PyErr_NoMemory();
2789 return NULL;
2790 }
2791 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2792 if (!unicode)
2793 return NULL;
2794
2795 /* Unpack UTF-8 encoded data */
2796 p = unicode;
2797 e = s + size;
2798 while (s < e) {
2799 Py_UCS4 ch = (unsigned char)*s;
2800
2801 if (ch < 0x80) {
2802 *p++ = (wchar_t)ch;
2803 s++;
2804 continue;
2805 }
2806
2807 n = utf8_code_length[ch];
2808 if (s + n > e) {
2809 goto surrogateescape;
2810 }
2811
2812 switch (n) {
2813 case 0:
2814 case 1:
2815 goto surrogateescape;
2816
2817 case 2:
2818 if ((s[1] & 0xc0) != 0x80)
2819 goto surrogateescape;
2820 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2821 assert ((ch > 0x007F) && (ch <= 0x07FF));
2822 *p++ = (wchar_t)ch;
2823 break;
2824
2825 case 3:
2826 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2827 will result in surrogates in range d800-dfff. Surrogates are
2828 not valid UTF-8 so they are rejected.
2829 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2830 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2831 if ((s[1] & 0xc0) != 0x80 ||
2832 (s[2] & 0xc0) != 0x80 ||
2833 ((unsigned char)s[0] == 0xE0 &&
2834 (unsigned char)s[1] < 0xA0) ||
2835 ((unsigned char)s[0] == 0xED &&
2836 (unsigned char)s[1] > 0x9F)) {
2837
2838 goto surrogateescape;
2839 }
2840 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2841 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2842 *p++ = (Py_UNICODE)ch;
2843 break;
2844
2845 case 4:
2846 if ((s[1] & 0xc0) != 0x80 ||
2847 (s[2] & 0xc0) != 0x80 ||
2848 (s[3] & 0xc0) != 0x80 ||
2849 ((unsigned char)s[0] == 0xF0 &&
2850 (unsigned char)s[1] < 0x90) ||
2851 ((unsigned char)s[0] == 0xF4 &&
2852 (unsigned char)s[1] > 0x8F)) {
2853 goto surrogateescape;
2854 }
2855 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2856 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2857 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2858
2859#if SIZEOF_WCHAR_T == 4
2860 *p++ = (wchar_t)ch;
2861#else
2862 /* compute and append the two surrogates: */
2863
2864 /* translate from 10000..10FFFF to 0..FFFF */
2865 ch -= 0x10000;
2866
2867 /* high surrogate = top 10 bits added to D800 */
2868 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2869
2870 /* low surrogate = bottom 10 bits added to DC00 */
2871 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2872#endif
2873 break;
2874 }
2875 s += n;
2876 continue;
2877
2878 surrogateescape:
2879 *p++ = 0xDC00 + ch;
2880 s++;
2881 }
2882 *p = L'\0';
2883 return unicode;
2884}
2885
2886#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002887
Tim Peters602f7402002-04-27 18:03:26 +00002888/* Allocation strategy: if the string is short, convert into a stack buffer
2889 and allocate exactly as much space needed at the end. Else allocate the
2890 maximum possible needed (4 result bytes per Unicode character), and return
2891 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002892*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002893PyObject *
2894PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002895 Py_ssize_t size,
2896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897{
Tim Peters602f7402002-04-27 18:03:26 +00002898#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002899
Guido van Rossum98297ee2007-11-06 21:34:58 +00002900 Py_ssize_t i; /* index into s of next input byte */
2901 PyObject *result; /* result string object */
2902 char *p; /* next free byte in output buffer */
2903 Py_ssize_t nallocated; /* number of result bytes allocated */
2904 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002905 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002906 PyObject *errorHandler = NULL;
2907 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002908
Tim Peters602f7402002-04-27 18:03:26 +00002909 assert(s != NULL);
2910 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911
Tim Peters602f7402002-04-27 18:03:26 +00002912 if (size <= MAX_SHORT_UNICHARS) {
2913 /* Write into the stack buffer; nallocated can't overflow.
2914 * At the end, we'll allocate exactly as much heap space as it
2915 * turns out we need.
2916 */
2917 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002918 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002919 p = stackbuf;
2920 }
2921 else {
2922 /* Overallocate on the heap, and give the excess back at the end. */
2923 nallocated = size * 4;
2924 if (nallocated / 4 != size) /* overflow! */
2925 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002926 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002927 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002928 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002929 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002930 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002931
Tim Peters602f7402002-04-27 18:03:26 +00002932 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002933 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002934
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002935 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002936 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002938
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002940 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002941 *p++ = (char)(0xc0 | (ch >> 6));
2942 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002943 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002944#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002945 /* Special case: check for high and low surrogate */
2946 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2947 Py_UCS4 ch2 = s[i];
2948 /* Combine the two surrogates to form a UCS4 value */
2949 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2950 i++;
2951
2952 /* Encode UCS4 Unicode ordinals */
2953 *p++ = (char)(0xf0 | (ch >> 18));
2954 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002955 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2956 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002957 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002958#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002959 Py_ssize_t newpos;
2960 PyObject *rep;
2961 Py_ssize_t repsize, k;
2962 rep = unicode_encode_call_errorhandler
2963 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2964 s, size, &exc, i-1, i, &newpos);
2965 if (!rep)
2966 goto error;
2967
2968 if (PyBytes_Check(rep))
2969 repsize = PyBytes_GET_SIZE(rep);
2970 else
2971 repsize = PyUnicode_GET_SIZE(rep);
2972
2973 if (repsize > 4) {
2974 Py_ssize_t offset;
2975
2976 if (result == NULL)
2977 offset = p - stackbuf;
2978 else
2979 offset = p - PyBytes_AS_STRING(result);
2980
2981 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2982 /* integer overflow */
2983 PyErr_NoMemory();
2984 goto error;
2985 }
2986 nallocated += repsize - 4;
2987 if (result != NULL) {
2988 if (_PyBytes_Resize(&result, nallocated) < 0)
2989 goto error;
2990 } else {
2991 result = PyBytes_FromStringAndSize(NULL, nallocated);
2992 if (result == NULL)
2993 goto error;
2994 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2995 }
2996 p = PyBytes_AS_STRING(result) + offset;
2997 }
2998
2999 if (PyBytes_Check(rep)) {
3000 char *prep = PyBytes_AS_STRING(rep);
3001 for(k = repsize; k > 0; k--)
3002 *p++ = *prep++;
3003 } else /* rep is unicode */ {
3004 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3005 Py_UNICODE c;
3006
3007 for(k=0; k<repsize; k++) {
3008 c = prep[k];
3009 if (0x80 <= c) {
3010 raise_encode_exception(&exc, "utf-8", s, size,
3011 i-1, i, "surrogates not allowed");
3012 goto error;
3013 }
3014 *p++ = (char)prep[k];
3015 }
3016 }
3017 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003018#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003019 }
Victor Stinner445a6232010-04-22 20:01:57 +00003020#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003021 } else if (ch < 0x10000) {
3022 *p++ = (char)(0xe0 | (ch >> 12));
3023 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3024 *p++ = (char)(0x80 | (ch & 0x3f));
3025 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003026 /* Encode UCS4 Unicode ordinals */
3027 *p++ = (char)(0xf0 | (ch >> 18));
3028 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3029 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3030 *p++ = (char)(0x80 | (ch & 0x3f));
3031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003033
Guido van Rossum98297ee2007-11-06 21:34:58 +00003034 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003035 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003036 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003037 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003038 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003039 }
3040 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003041 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003042 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003043 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003044 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003045 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003046 Py_XDECREF(errorHandler);
3047 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003048 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003049 error:
3050 Py_XDECREF(errorHandler);
3051 Py_XDECREF(exc);
3052 Py_XDECREF(result);
3053 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003054
Tim Peters602f7402002-04-27 18:03:26 +00003055#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056}
3057
Alexander Belopolsky40018472011-02-26 01:02:56 +00003058PyObject *
3059PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 if (!PyUnicode_Check(unicode)) {
3062 PyErr_BadArgument();
3063 return NULL;
3064 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003065 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003066 PyUnicode_GET_SIZE(unicode),
3067 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068}
3069
Walter Dörwald41980ca2007-08-16 21:55:45 +00003070/* --- UTF-32 Codec ------------------------------------------------------- */
3071
3072PyObject *
3073PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003074 Py_ssize_t size,
3075 const char *errors,
3076 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003077{
3078 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3079}
3080
3081PyObject *
3082PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003083 Py_ssize_t size,
3084 const char *errors,
3085 int *byteorder,
3086 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003087{
3088 const char *starts = s;
3089 Py_ssize_t startinpos;
3090 Py_ssize_t endinpos;
3091 Py_ssize_t outpos;
3092 PyUnicodeObject *unicode;
3093 Py_UNICODE *p;
3094#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003095 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003096 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003097#else
3098 const int pairs = 0;
3099#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003100 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003101 int bo = 0; /* assume native ordering by default */
3102 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003103 /* Offsets from q for retrieving bytes in the right order. */
3104#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3105 int iorder[] = {0, 1, 2, 3};
3106#else
3107 int iorder[] = {3, 2, 1, 0};
3108#endif
3109 PyObject *errorHandler = NULL;
3110 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003111
Walter Dörwald41980ca2007-08-16 21:55:45 +00003112 q = (unsigned char *)s;
3113 e = q + size;
3114
3115 if (byteorder)
3116 bo = *byteorder;
3117
3118 /* Check for BOM marks (U+FEFF) in the input and adjust current
3119 byte order setting accordingly. In native mode, the leading BOM
3120 mark is skipped, in all other modes, it is copied to the output
3121 stream as-is (giving a ZWNBSP character). */
3122 if (bo == 0) {
3123 if (size >= 4) {
3124 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003126#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 if (bom == 0x0000FEFF) {
3128 q += 4;
3129 bo = -1;
3130 }
3131 else if (bom == 0xFFFE0000) {
3132 q += 4;
3133 bo = 1;
3134 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003135#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003136 if (bom == 0x0000FEFF) {
3137 q += 4;
3138 bo = 1;
3139 }
3140 else if (bom == 0xFFFE0000) {
3141 q += 4;
3142 bo = -1;
3143 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003144#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003146 }
3147
3148 if (bo == -1) {
3149 /* force LE */
3150 iorder[0] = 0;
3151 iorder[1] = 1;
3152 iorder[2] = 2;
3153 iorder[3] = 3;
3154 }
3155 else if (bo == 1) {
3156 /* force BE */
3157 iorder[0] = 3;
3158 iorder[1] = 2;
3159 iorder[2] = 1;
3160 iorder[3] = 0;
3161 }
3162
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003163 /* On narrow builds we split characters outside the BMP into two
3164 codepoints => count how much extra space we need. */
3165#ifndef Py_UNICODE_WIDE
3166 for (qq = q; qq < e; qq += 4)
3167 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3168 pairs++;
3169#endif
3170
3171 /* This might be one to much, because of a BOM */
3172 unicode = _PyUnicode_New((size+3)/4+pairs);
3173 if (!unicode)
3174 return NULL;
3175 if (size == 0)
3176 return (PyObject *)unicode;
3177
3178 /* Unpack UTF-32 encoded data */
3179 p = unicode->str;
3180
Walter Dörwald41980ca2007-08-16 21:55:45 +00003181 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003182 Py_UCS4 ch;
3183 /* remaining bytes at the end? (size should be divisible by 4) */
3184 if (e-q<4) {
3185 if (consumed)
3186 break;
3187 errmsg = "truncated data";
3188 startinpos = ((const char *)q)-starts;
3189 endinpos = ((const char *)e)-starts;
3190 goto utf32Error;
3191 /* The remaining input chars are ignored if the callback
3192 chooses to skip the input */
3193 }
3194 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3195 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003196
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 if (ch >= 0x110000)
3198 {
3199 errmsg = "codepoint not in range(0x110000)";
3200 startinpos = ((const char *)q)-starts;
3201 endinpos = startinpos+4;
3202 goto utf32Error;
3203 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003204#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 if (ch >= 0x10000)
3206 {
3207 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3208 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3209 }
3210 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003211#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003212 *p++ = ch;
3213 q += 4;
3214 continue;
3215 utf32Error:
3216 outpos = p-PyUnicode_AS_UNICODE(unicode);
3217 if (unicode_decode_call_errorhandler(
3218 errors, &errorHandler,
3219 "utf32", errmsg,
3220 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3221 &unicode, &outpos, &p))
3222 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003223 }
3224
3225 if (byteorder)
3226 *byteorder = bo;
3227
3228 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003230
3231 /* Adjust length */
3232 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3233 goto onError;
3234
3235 Py_XDECREF(errorHandler);
3236 Py_XDECREF(exc);
3237 return (PyObject *)unicode;
3238
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003240 Py_DECREF(unicode);
3241 Py_XDECREF(errorHandler);
3242 Py_XDECREF(exc);
3243 return NULL;
3244}
3245
3246PyObject *
3247PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 Py_ssize_t size,
3249 const char *errors,
3250 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003251{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003252 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003253 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003254 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003255#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003256 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003257#else
3258 const int pairs = 0;
3259#endif
3260 /* Offsets from p for storing byte pairs in the right order. */
3261#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3262 int iorder[] = {0, 1, 2, 3};
3263#else
3264 int iorder[] = {3, 2, 1, 0};
3265#endif
3266
Benjamin Peterson29060642009-01-31 22:14:21 +00003267#define STORECHAR(CH) \
3268 do { \
3269 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3270 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3271 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3272 p[iorder[0]] = (CH) & 0xff; \
3273 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003274 } while(0)
3275
3276 /* In narrow builds we can output surrogate pairs as one codepoint,
3277 so we need less space. */
3278#ifndef Py_UNICODE_WIDE
3279 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3281 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3282 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003283#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003284 nsize = (size - pairs + (byteorder == 0));
3285 bytesize = nsize * 4;
3286 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003287 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003288 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003289 if (v == NULL)
3290 return NULL;
3291
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003292 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003293 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003295 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003296 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297
3298 if (byteorder == -1) {
3299 /* force LE */
3300 iorder[0] = 0;
3301 iorder[1] = 1;
3302 iorder[2] = 2;
3303 iorder[3] = 3;
3304 }
3305 else if (byteorder == 1) {
3306 /* force BE */
3307 iorder[0] = 3;
3308 iorder[1] = 2;
3309 iorder[2] = 1;
3310 iorder[3] = 0;
3311 }
3312
3313 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003315#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3317 Py_UCS4 ch2 = *s;
3318 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3319 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3320 s++;
3321 size--;
3322 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003323 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003324#endif
3325 STORECHAR(ch);
3326 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003327
3328 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003329 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003330#undef STORECHAR
3331}
3332
Alexander Belopolsky40018472011-02-26 01:02:56 +00003333PyObject *
3334PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003335{
3336 if (!PyUnicode_Check(unicode)) {
3337 PyErr_BadArgument();
3338 return NULL;
3339 }
3340 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 PyUnicode_GET_SIZE(unicode),
3342 NULL,
3343 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003344}
3345
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346/* --- UTF-16 Codec ------------------------------------------------------- */
3347
Tim Peters772747b2001-08-09 22:21:55 +00003348PyObject *
3349PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003350 Py_ssize_t size,
3351 const char *errors,
3352 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353{
Walter Dörwald69652032004-09-07 20:24:22 +00003354 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3355}
3356
Antoine Pitrouab868312009-01-10 15:40:25 +00003357/* Two masks for fast checking of whether a C 'long' may contain
3358 UTF16-encoded surrogate characters. This is an efficient heuristic,
3359 assuming that non-surrogate characters with a code point >= 0x8000 are
3360 rare in most input.
3361 FAST_CHAR_MASK is used when the input is in native byte ordering,
3362 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003363*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003364#if (SIZEOF_LONG == 8)
3365# define FAST_CHAR_MASK 0x8000800080008000L
3366# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3367#elif (SIZEOF_LONG == 4)
3368# define FAST_CHAR_MASK 0x80008000L
3369# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3370#else
3371# error C 'long' size should be either 4 or 8!
3372#endif
3373
Walter Dörwald69652032004-09-07 20:24:22 +00003374PyObject *
3375PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003376 Py_ssize_t size,
3377 const char *errors,
3378 int *byteorder,
3379 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003380{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003382 Py_ssize_t startinpos;
3383 Py_ssize_t endinpos;
3384 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 PyUnicodeObject *unicode;
3386 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003387 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003388 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003389 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003390 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003391 /* Offsets from q for retrieving byte pairs in the right order. */
3392#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3393 int ihi = 1, ilo = 0;
3394#else
3395 int ihi = 0, ilo = 1;
3396#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003397 PyObject *errorHandler = NULL;
3398 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399
3400 /* Note: size will always be longer than the resulting Unicode
3401 character count */
3402 unicode = _PyUnicode_New(size);
3403 if (!unicode)
3404 return NULL;
3405 if (size == 0)
3406 return (PyObject *)unicode;
3407
3408 /* Unpack UTF-16 encoded data */
3409 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003410 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003411 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412
3413 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003414 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003415
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003416 /* Check for BOM marks (U+FEFF) in the input and adjust current
3417 byte order setting accordingly. In native mode, the leading BOM
3418 mark is skipped, in all other modes, it is copied to the output
3419 stream as-is (giving a ZWNBSP character). */
3420 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003421 if (size >= 2) {
3422 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 if (bom == 0xFEFF) {
3425 q += 2;
3426 bo = -1;
3427 }
3428 else if (bom == 0xFFFE) {
3429 q += 2;
3430 bo = 1;
3431 }
Tim Petersced69f82003-09-16 20:30:58 +00003432#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003433 if (bom == 0xFEFF) {
3434 q += 2;
3435 bo = 1;
3436 }
3437 else if (bom == 0xFFFE) {
3438 q += 2;
3439 bo = -1;
3440 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003441#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003442 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444
Tim Peters772747b2001-08-09 22:21:55 +00003445 if (bo == -1) {
3446 /* force LE */
3447 ihi = 1;
3448 ilo = 0;
3449 }
3450 else if (bo == 1) {
3451 /* force BE */
3452 ihi = 0;
3453 ilo = 1;
3454 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003455#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3456 native_ordering = ilo < ihi;
3457#else
3458 native_ordering = ilo > ihi;
3459#endif
Tim Peters772747b2001-08-09 22:21:55 +00003460
Antoine Pitrouab868312009-01-10 15:40:25 +00003461 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003462 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003463 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003464 /* First check for possible aligned read of a C 'long'. Unaligned
3465 reads are more expensive, better to defer to another iteration. */
3466 if (!((size_t) q & LONG_PTR_MASK)) {
3467 /* Fast path for runs of non-surrogate chars. */
3468 register const unsigned char *_q = q;
3469 Py_UNICODE *_p = p;
3470 if (native_ordering) {
3471 /* Native ordering is simple: as long as the input cannot
3472 possibly contain a surrogate char, do an unrolled copy
3473 of several 16-bit code points to the target object.
3474 The non-surrogate check is done on several input bytes
3475 at a time (as many as a C 'long' can contain). */
3476 while (_q < aligned_end) {
3477 unsigned long data = * (unsigned long *) _q;
3478 if (data & FAST_CHAR_MASK)
3479 break;
3480 _p[0] = ((unsigned short *) _q)[0];
3481 _p[1] = ((unsigned short *) _q)[1];
3482#if (SIZEOF_LONG == 8)
3483 _p[2] = ((unsigned short *) _q)[2];
3484 _p[3] = ((unsigned short *) _q)[3];
3485#endif
3486 _q += SIZEOF_LONG;
3487 _p += SIZEOF_LONG / 2;
3488 }
3489 }
3490 else {
3491 /* Byteswapped ordering is similar, but we must decompose
3492 the copy bytewise, and take care of zero'ing out the
3493 upper bytes if the target object is in 32-bit units
3494 (that is, in UCS-4 builds). */
3495 while (_q < aligned_end) {
3496 unsigned long data = * (unsigned long *) _q;
3497 if (data & SWAPPED_FAST_CHAR_MASK)
3498 break;
3499 /* Zero upper bytes in UCS-4 builds */
3500#if (Py_UNICODE_SIZE > 2)
3501 _p[0] = 0;
3502 _p[1] = 0;
3503#if (SIZEOF_LONG == 8)
3504 _p[2] = 0;
3505 _p[3] = 0;
3506#endif
3507#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003508 /* Issue #4916; UCS-4 builds on big endian machines must
3509 fill the two last bytes of each 4-byte unit. */
3510#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3511# define OFF 2
3512#else
3513# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003514#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003515 ((unsigned char *) _p)[OFF + 1] = _q[0];
3516 ((unsigned char *) _p)[OFF + 0] = _q[1];
3517 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3518 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3519#if (SIZEOF_LONG == 8)
3520 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3521 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3522 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3523 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3524#endif
3525#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003526 _q += SIZEOF_LONG;
3527 _p += SIZEOF_LONG / 2;
3528 }
3529 }
3530 p = _p;
3531 q = _q;
3532 if (q >= e)
3533 break;
3534 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536
Benjamin Peterson14339b62009-01-31 16:36:08 +00003537 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003538
3539 if (ch < 0xD800 || ch > 0xDFFF) {
3540 *p++ = ch;
3541 continue;
3542 }
3543
3544 /* UTF-16 code pair: */
3545 if (q > e) {
3546 errmsg = "unexpected end of data";
3547 startinpos = (((const char *)q) - 2) - starts;
3548 endinpos = ((const char *)e) + 1 - starts;
3549 goto utf16Error;
3550 }
3551 if (0xD800 <= ch && ch <= 0xDBFF) {
3552 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3553 q += 2;
3554 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003555#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003556 *p++ = ch;
3557 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003558#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003559 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003560#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003561 continue;
3562 }
3563 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003564 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003565 startinpos = (((const char *)q)-4)-starts;
3566 endinpos = startinpos+2;
3567 goto utf16Error;
3568 }
3569
Benjamin Peterson14339b62009-01-31 16:36:08 +00003570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 errmsg = "illegal encoding";
3572 startinpos = (((const char *)q)-2)-starts;
3573 endinpos = startinpos+2;
3574 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003575
Benjamin Peterson29060642009-01-31 22:14:21 +00003576 utf16Error:
3577 outpos = p - PyUnicode_AS_UNICODE(unicode);
3578 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003579 errors,
3580 &errorHandler,
3581 "utf16", errmsg,
3582 &starts,
3583 (const char **)&e,
3584 &startinpos,
3585 &endinpos,
3586 &exc,
3587 (const char **)&q,
3588 &unicode,
3589 &outpos,
3590 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003593 /* remaining byte at the end? (size should be even) */
3594 if (e == q) {
3595 if (!consumed) {
3596 errmsg = "truncated data";
3597 startinpos = ((const char *)q) - starts;
3598 endinpos = ((const char *)e) + 1 - starts;
3599 outpos = p - PyUnicode_AS_UNICODE(unicode);
3600 if (unicode_decode_call_errorhandler(
3601 errors,
3602 &errorHandler,
3603 "utf16", errmsg,
3604 &starts,
3605 (const char **)&e,
3606 &startinpos,
3607 &endinpos,
3608 &exc,
3609 (const char **)&q,
3610 &unicode,
3611 &outpos,
3612 &p))
3613 goto onError;
3614 /* The remaining input chars are ignored if the callback
3615 chooses to skip the input */
3616 }
3617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618
3619 if (byteorder)
3620 *byteorder = bo;
3621
Walter Dörwald69652032004-09-07 20:24:22 +00003622 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003623 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003624
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003626 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627 goto onError;
3628
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629 Py_XDECREF(errorHandler);
3630 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631 return (PyObject *)unicode;
3632
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 Py_XDECREF(errorHandler);
3636 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 return NULL;
3638}
3639
Antoine Pitrouab868312009-01-10 15:40:25 +00003640#undef FAST_CHAR_MASK
3641#undef SWAPPED_FAST_CHAR_MASK
3642
Tim Peters772747b2001-08-09 22:21:55 +00003643PyObject *
3644PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 Py_ssize_t size,
3646 const char *errors,
3647 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003649 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003650 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003651 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003652#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003653 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003654#else
3655 const int pairs = 0;
3656#endif
Tim Peters772747b2001-08-09 22:21:55 +00003657 /* Offsets from p for storing byte pairs in the right order. */
3658#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3659 int ihi = 1, ilo = 0;
3660#else
3661 int ihi = 0, ilo = 1;
3662#endif
3663
Benjamin Peterson29060642009-01-31 22:14:21 +00003664#define STORECHAR(CH) \
3665 do { \
3666 p[ihi] = ((CH) >> 8) & 0xff; \
3667 p[ilo] = (CH) & 0xff; \
3668 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003669 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003671#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003672 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 if (s[i] >= 0x10000)
3674 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003675#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003676 /* 2 * (size + pairs + (byteorder == 0)) */
3677 if (size > PY_SSIZE_T_MAX ||
3678 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003680 nsize = size + pairs + (byteorder == 0);
3681 bytesize = nsize * 2;
3682 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003684 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685 if (v == NULL)
3686 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003688 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003690 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003691 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003692 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003693
3694 if (byteorder == -1) {
3695 /* force LE */
3696 ihi = 1;
3697 ilo = 0;
3698 }
3699 else if (byteorder == 1) {
3700 /* force BE */
3701 ihi = 0;
3702 ilo = 1;
3703 }
3704
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003705 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003706 Py_UNICODE ch = *s++;
3707 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003708#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 if (ch >= 0x10000) {
3710 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3711 ch = 0xD800 | ((ch-0x10000) >> 10);
3712 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003713#endif
Tim Peters772747b2001-08-09 22:21:55 +00003714 STORECHAR(ch);
3715 if (ch2)
3716 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003717 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003718
3719 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003720 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003721#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722}
3723
Alexander Belopolsky40018472011-02-26 01:02:56 +00003724PyObject *
3725PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726{
3727 if (!PyUnicode_Check(unicode)) {
3728 PyErr_BadArgument();
3729 return NULL;
3730 }
3731 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003732 PyUnicode_GET_SIZE(unicode),
3733 NULL,
3734 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735}
3736
3737/* --- Unicode Escape Codec ----------------------------------------------- */
3738
Fredrik Lundh06d12682001-01-24 07:59:11 +00003739static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003740
Alexander Belopolsky40018472011-02-26 01:02:56 +00003741PyObject *
3742PyUnicode_DecodeUnicodeEscape(const char *s,
3743 Py_ssize_t size,
3744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003746 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003747 Py_ssize_t startinpos;
3748 Py_ssize_t endinpos;
3749 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003754 char* message;
3755 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 PyObject *errorHandler = NULL;
3757 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003758
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 /* Escaped strings will always be longer than the resulting
3760 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 length after conversion to the true value.
3762 (but if the error callback returns a long replacement string
3763 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 v = _PyUnicode_New(size);
3765 if (v == NULL)
3766 goto onError;
3767 if (size == 0)
3768 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003772
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 while (s < end) {
3774 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003775 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777
3778 /* Non-escape characters are interpreted as Unicode ordinals */
3779 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003780 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 continue;
3782 }
3783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 /* \ - Escapes */
3786 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003787 c = *s++;
3788 if (s > end)
3789 c = '\0'; /* Invalid after \ */
3790 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791
Benjamin Peterson29060642009-01-31 22:14:21 +00003792 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 case '\n': break;
3794 case '\\': *p++ = '\\'; break;
3795 case '\'': *p++ = '\''; break;
3796 case '\"': *p++ = '\"'; break;
3797 case 'b': *p++ = '\b'; break;
3798 case 'f': *p++ = '\014'; break; /* FF */
3799 case 't': *p++ = '\t'; break;
3800 case 'n': *p++ = '\n'; break;
3801 case 'r': *p++ = '\r'; break;
3802 case 'v': *p++ = '\013'; break; /* VT */
3803 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3804
Benjamin Peterson29060642009-01-31 22:14:21 +00003805 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 case '0': case '1': case '2': case '3':
3807 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003808 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003809 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003810 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003811 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003812 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003814 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 break;
3816
Benjamin Peterson29060642009-01-31 22:14:21 +00003817 /* hex escapes */
3818 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003820 digits = 2;
3821 message = "truncated \\xXX escape";
3822 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003826 digits = 4;
3827 message = "truncated \\uXXXX escape";
3828 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829
Benjamin Peterson29060642009-01-31 22:14:21 +00003830 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003831 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003832 digits = 8;
3833 message = "truncated \\UXXXXXXXX escape";
3834 hexescape:
3835 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 outpos = p-PyUnicode_AS_UNICODE(v);
3837 if (s+digits>end) {
3838 endinpos = size;
3839 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003840 errors, &errorHandler,
3841 "unicodeescape", "end of string in escape sequence",
3842 &starts, &end, &startinpos, &endinpos, &exc, &s,
3843 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 goto onError;
3845 goto nextByte;
3846 }
3847 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003848 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003849 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003850 endinpos = (s+i+1)-starts;
3851 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003852 errors, &errorHandler,
3853 "unicodeescape", message,
3854 &starts, &end, &startinpos, &endinpos, &exc, &s,
3855 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003856 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003858 }
3859 chr = (chr<<4) & ~0xF;
3860 if (c >= '0' && c <= '9')
3861 chr += c - '0';
3862 else if (c >= 'a' && c <= 'f')
3863 chr += 10 + c - 'a';
3864 else
3865 chr += 10 + c - 'A';
3866 }
3867 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003868 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003869 /* _decoding_error will have already written into the
3870 target buffer. */
3871 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003872 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003873 /* when we get here, chr is a 32-bit unicode character */
3874 if (chr <= 0xffff)
3875 /* UCS-2 character */
3876 *p++ = (Py_UNICODE) chr;
3877 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003878 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003879 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003880#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003881 *p++ = chr;
3882#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003883 chr -= 0x10000L;
3884 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003885 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003886#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003887 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 endinpos = s-starts;
3889 outpos = p-PyUnicode_AS_UNICODE(v);
3890 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003891 errors, &errorHandler,
3892 "unicodeescape", "illegal Unicode character",
3893 &starts, &end, &startinpos, &endinpos, &exc, &s,
3894 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003895 goto onError;
3896 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003897 break;
3898
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003900 case 'N':
3901 message = "malformed \\N character escape";
3902 if (ucnhash_CAPI == NULL) {
3903 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003904 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003905 if (ucnhash_CAPI == NULL)
3906 goto ucnhashError;
3907 }
3908 if (*s == '{') {
3909 const char *start = s+1;
3910 /* look for the closing brace */
3911 while (*s != '}' && s < end)
3912 s++;
3913 if (s > start && s < end && *s == '}') {
3914 /* found a name. look it up in the unicode database */
3915 message = "unknown Unicode character name";
3916 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003917 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003918 goto store;
3919 }
3920 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 endinpos = s-starts;
3922 outpos = p-PyUnicode_AS_UNICODE(v);
3923 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 errors, &errorHandler,
3925 "unicodeescape", message,
3926 &starts, &end, &startinpos, &endinpos, &exc, &s,
3927 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003928 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003929 break;
3930
3931 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003932 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 message = "\\ at end of string";
3934 s--;
3935 endinpos = s-starts;
3936 outpos = p-PyUnicode_AS_UNICODE(v);
3937 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003938 errors, &errorHandler,
3939 "unicodeescape", message,
3940 &starts, &end, &startinpos, &endinpos, &exc, &s,
3941 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003942 goto onError;
3943 }
3944 else {
3945 *p++ = '\\';
3946 *p++ = (unsigned char)s[-1];
3947 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003948 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003951 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003953 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003954 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003955 Py_XDECREF(errorHandler);
3956 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003958
Benjamin Peterson29060642009-01-31 22:14:21 +00003959 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003960 PyErr_SetString(
3961 PyExc_UnicodeError,
3962 "\\N escapes not supported (can't load unicodedata module)"
3963 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003964 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 Py_XDECREF(errorHandler);
3966 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003967 return NULL;
3968
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 Py_XDECREF(errorHandler);
3972 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 return NULL;
3974}
3975
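/* Return a Unicode-Escape string version of the Unicode object.

   The `quotes' flag mentioned by older versions of this comment (enclosing
   the string in u"" or u'' quotes as appropriate) is not a parameter of
   PyUnicode_EncodeUnicodeEscape() below, which emits no surrounding quotes.

*/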
3982
Thomas Wouters477c8d52006-05-27 19:21:47 +00003983Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003984 Py_ssize_t size,
3985 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003986{
3987 /* like wcschr, but doesn't stop at NULL characters */
3988
3989 while (size-- > 0) {
3990 if (*s == ch)
3991 return s;
3992 s++;
3993 }
3994
3995 return NULL;
3996}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003997
Walter Dörwald79e913e2007-05-12 11:08:06 +00003998static const char *hexdigits = "0123456789abcdef";
3999
Alexander Belopolsky40018472011-02-26 01:02:56 +00004000PyObject *
4001PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4002 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004004 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004007#ifdef Py_UNICODE_WIDE
4008 const Py_ssize_t expandsize = 10;
4009#else
4010 const Py_ssize_t expandsize = 6;
4011#endif
4012
Thomas Wouters89f507f2006-12-13 04:49:30 +00004013 /* XXX(nnorwitz): rather than over-allocating, it would be
4014 better to choose a different scheme. Perhaps scan the
4015 first N-chars of the string and allocate based on that size.
4016 */
4017 /* Initial allocation is based on the longest-possible unichr
4018 escape.
4019
4020 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4021 unichr, so in this case it's the longest unichr escape. In
4022 narrow (UTF-16) builds this is five chars per source unichr
4023 since there are two unichrs in the surrogate pair, so in narrow
4024 (UTF-16) builds it's not the longest unichr escape.
4025
4026 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4027 so in the narrow (UTF-16) build case it's the longest unichr
4028 escape.
4029 */
4030
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004031 if (size == 0)
4032 return PyBytes_FromStringAndSize(NULL, 0);
4033
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004034 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004036
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004037 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 2
4039 + expandsize*size
4040 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 if (repr == NULL)
4042 return NULL;
4043
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004044 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 while (size-- > 0) {
4047 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004048
Walter Dörwald79e913e2007-05-12 11:08:06 +00004049 /* Escape backslashes */
4050 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 *p++ = '\\';
4052 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004053 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004054 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004055
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004056#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004057 /* Map 21-bit characters to '\U00xxxxxx' */
4058 else if (ch >= 0x10000) {
4059 *p++ = '\\';
4060 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004061 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4062 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4063 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4064 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4065 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4066 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4067 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4068 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004070 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004071#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004072 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4073 else if (ch >= 0xD800 && ch < 0xDC00) {
4074 Py_UNICODE ch2;
4075 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004076
Benjamin Peterson29060642009-01-31 22:14:21 +00004077 ch2 = *s++;
4078 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004079 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004080 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4081 *p++ = '\\';
4082 *p++ = 'U';
4083 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4084 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4085 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4086 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4087 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4088 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4089 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4090 *p++ = hexdigits[ucs & 0x0000000F];
4091 continue;
4092 }
4093 /* Fall through: isolated surrogates are copied as-is */
4094 s--;
4095 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004096 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004097#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004098
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004100 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 *p++ = '\\';
4102 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004103 *p++ = hexdigits[(ch >> 12) & 0x000F];
4104 *p++ = hexdigits[(ch >> 8) & 0x000F];
4105 *p++ = hexdigits[(ch >> 4) & 0x000F];
4106 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004108
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004109 /* Map special whitespace to '\t', \n', '\r' */
4110 else if (ch == '\t') {
4111 *p++ = '\\';
4112 *p++ = 't';
4113 }
4114 else if (ch == '\n') {
4115 *p++ = '\\';
4116 *p++ = 'n';
4117 }
4118 else if (ch == '\r') {
4119 *p++ = '\\';
4120 *p++ = 'r';
4121 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004122
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004123 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004124 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004126 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004127 *p++ = hexdigits[(ch >> 4) & 0x000F];
4128 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004129 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004130
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 /* Copy everything else as-is */
4132 else
4133 *p++ = (char) ch;
4134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004136 assert(p - PyBytes_AS_STRING(repr) > 0);
4137 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4138 return NULL;
4139 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140}
4141
Alexander Belopolsky40018472011-02-26 01:02:56 +00004142PyObject *
4143PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004145 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 if (!PyUnicode_Check(unicode)) {
4147 PyErr_BadArgument();
4148 return NULL;
4149 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004150 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4151 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004152 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153}
4154
4155/* --- Raw Unicode Escape Codec ------------------------------------------- */
4156
Alexander Belopolsky40018472011-02-26 01:02:56 +00004157PyObject *
4158PyUnicode_DecodeRawUnicodeEscape(const char *s,
4159 Py_ssize_t size,
4160 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004163 Py_ssize_t startinpos;
4164 Py_ssize_t endinpos;
4165 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 const char *end;
4169 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 PyObject *errorHandler = NULL;
4171 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004172
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 /* Escaped strings will always be longer than the resulting
4174 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 length after conversion to the true value. (But decoding error
4176 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 v = _PyUnicode_New(size);
4178 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 end = s + size;
4184 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 unsigned char c;
4186 Py_UCS4 x;
4187 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004188 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 /* Non-escape characters are interpreted as Unicode ordinals */
4191 if (*s != '\\') {
4192 *p++ = (unsigned char)*s++;
4193 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 startinpos = s-starts;
4196
4197 /* \u-escapes are only interpreted iff the number of leading
4198 backslashes if odd */
4199 bs = s;
4200 for (;s < end;) {
4201 if (*s != '\\')
4202 break;
4203 *p++ = (unsigned char)*s++;
4204 }
4205 if (((s - bs) & 1) == 0 ||
4206 s >= end ||
4207 (*s != 'u' && *s != 'U')) {
4208 continue;
4209 }
4210 p--;
4211 count = *s=='u' ? 4 : 8;
4212 s++;
4213
4214 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4215 outpos = p-PyUnicode_AS_UNICODE(v);
4216 for (x = 0, i = 0; i < count; ++i, ++s) {
4217 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004218 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004219 endinpos = s-starts;
4220 if (unicode_decode_call_errorhandler(
4221 errors, &errorHandler,
4222 "rawunicodeescape", "truncated \\uXXXX",
4223 &starts, &end, &startinpos, &endinpos, &exc, &s,
4224 &v, &outpos, &p))
4225 goto onError;
4226 goto nextByte;
4227 }
4228 x = (x<<4) & ~0xF;
4229 if (c >= '0' && c <= '9')
4230 x += c - '0';
4231 else if (c >= 'a' && c <= 'f')
4232 x += 10 + c - 'a';
4233 else
4234 x += 10 + c - 'A';
4235 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004236 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 /* UCS-2 character */
4238 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004239 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 /* UCS-4 character. Either store directly, or as
4241 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004242#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004244#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 x -= 0x10000L;
4246 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4247 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004248#endif
4249 } else {
4250 endinpos = s-starts;
4251 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004252 if (unicode_decode_call_errorhandler(
4253 errors, &errorHandler,
4254 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 &starts, &end, &startinpos, &endinpos, &exc, &s,
4256 &v, &outpos, &p))
4257 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004258 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 nextByte:
4260 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004262 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 Py_XDECREF(errorHandler);
4265 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004267
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 Py_XDECREF(errorHandler);
4271 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 return NULL;
4273}
4274
Alexander Belopolsky40018472011-02-26 01:02:56 +00004275PyObject *
4276PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4277 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004279 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 char *p;
4281 char *q;
4282
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004283#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004284 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004285#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004286 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004287#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004288
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004289 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004291
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004292 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 if (repr == NULL)
4294 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004295 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004296 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004298 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 while (size-- > 0) {
4300 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004301#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 /* Map 32-bit characters to '\Uxxxxxxxx' */
4303 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004304 *p++ = '\\';
4305 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004306 *p++ = hexdigits[(ch >> 28) & 0xf];
4307 *p++ = hexdigits[(ch >> 24) & 0xf];
4308 *p++ = hexdigits[(ch >> 20) & 0xf];
4309 *p++ = hexdigits[(ch >> 16) & 0xf];
4310 *p++ = hexdigits[(ch >> 12) & 0xf];
4311 *p++ = hexdigits[(ch >> 8) & 0xf];
4312 *p++ = hexdigits[(ch >> 4) & 0xf];
4313 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004314 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004315 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004316#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004317 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4318 if (ch >= 0xD800 && ch < 0xDC00) {
4319 Py_UNICODE ch2;
4320 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004321
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 ch2 = *s++;
4323 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004324 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004325 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4326 *p++ = '\\';
4327 *p++ = 'U';
4328 *p++ = hexdigits[(ucs >> 28) & 0xf];
4329 *p++ = hexdigits[(ucs >> 24) & 0xf];
4330 *p++ = hexdigits[(ucs >> 20) & 0xf];
4331 *p++ = hexdigits[(ucs >> 16) & 0xf];
4332 *p++ = hexdigits[(ucs >> 12) & 0xf];
4333 *p++ = hexdigits[(ucs >> 8) & 0xf];
4334 *p++ = hexdigits[(ucs >> 4) & 0xf];
4335 *p++ = hexdigits[ucs & 0xf];
4336 continue;
4337 }
4338 /* Fall through: isolated surrogates are copied as-is */
4339 s--;
4340 size++;
4341 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004342#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 /* Map 16-bit characters to '\uxxxx' */
4344 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 *p++ = '\\';
4346 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004347 *p++ = hexdigits[(ch >> 12) & 0xf];
4348 *p++ = hexdigits[(ch >> 8) & 0xf];
4349 *p++ = hexdigits[(ch >> 4) & 0xf];
4350 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 /* Copy everything else as-is */
4353 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 *p++ = (char) ch;
4355 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004356 size = p - q;
4357
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004358 assert(size > 0);
4359 if (_PyBytes_Resize(&repr, size) < 0)
4360 return NULL;
4361 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362}
4363
Alexander Belopolsky40018472011-02-26 01:02:56 +00004364PyObject *
4365PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004367 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004369 PyErr_BadArgument();
4370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004372 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4373 PyUnicode_GET_SIZE(unicode));
4374
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004375 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376}
4377
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004378/* --- Unicode Internal Codec ------------------------------------------- */
4379
Alexander Belopolsky40018472011-02-26 01:02:56 +00004380PyObject *
4381_PyUnicode_DecodeUnicodeInternal(const char *s,
4382 Py_ssize_t size,
4383 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004384{
4385 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004386 Py_ssize_t startinpos;
4387 Py_ssize_t endinpos;
4388 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004389 PyUnicodeObject *v;
4390 Py_UNICODE *p;
4391 const char *end;
4392 const char *reason;
4393 PyObject *errorHandler = NULL;
4394 PyObject *exc = NULL;
4395
Neal Norwitzd43069c2006-01-08 01:12:10 +00004396#ifdef Py_UNICODE_WIDE
4397 Py_UNICODE unimax = PyUnicode_GetMax();
4398#endif
4399
Thomas Wouters89f507f2006-12-13 04:49:30 +00004400 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004401 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4402 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004404 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004406 p = PyUnicode_AS_UNICODE(v);
4407 end = s + size;
4408
4409 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004410 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004411 /* We have to sanity check the raw data, otherwise doom looms for
4412 some malformed UCS-4 data. */
4413 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004414#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004415 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004416#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004417 end-s < Py_UNICODE_SIZE
4418 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004420 startinpos = s - starts;
4421 if (end-s < Py_UNICODE_SIZE) {
4422 endinpos = end-starts;
4423 reason = "truncated input";
4424 }
4425 else {
4426 endinpos = s - starts + Py_UNICODE_SIZE;
4427 reason = "illegal code point (> 0x10FFFF)";
4428 }
4429 outpos = p - PyUnicode_AS_UNICODE(v);
4430 if (unicode_decode_call_errorhandler(
4431 errors, &errorHandler,
4432 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004433 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004434 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004435 goto onError;
4436 }
4437 }
4438 else {
4439 p++;
4440 s += Py_UNICODE_SIZE;
4441 }
4442 }
4443
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004444 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004445 goto onError;
4446 Py_XDECREF(errorHandler);
4447 Py_XDECREF(exc);
4448 return (PyObject *)v;
4449
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004451 Py_XDECREF(v);
4452 Py_XDECREF(errorHandler);
4453 Py_XDECREF(exc);
4454 return NULL;
4455}
4456
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457/* --- Latin-1 Codec ------------------------------------------------------ */
4458
Alexander Belopolsky40018472011-02-26 01:02:56 +00004459PyObject *
4460PyUnicode_DecodeLatin1(const char *s,
4461 Py_ssize_t size,
4462 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463{
4464 PyUnicodeObject *v;
4465 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004466 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004467
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004469 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004470 Py_UNICODE r = *(unsigned char*)s;
4471 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004472 }
4473
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 v = _PyUnicode_New(size);
4475 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004480 e = s + size;
4481 /* Unrolling the copy makes it much faster by reducing the looping
4482 overhead. This is similar to what many memcpy() implementations do. */
4483 unrolled_end = e - 4;
4484 while (s < unrolled_end) {
4485 p[0] = (unsigned char) s[0];
4486 p[1] = (unsigned char) s[1];
4487 p[2] = (unsigned char) s[2];
4488 p[3] = (unsigned char) s[3];
4489 s += 4;
4490 p += 4;
4491 }
4492 while (s < e)
4493 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004495
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 Py_XDECREF(v);
4498 return NULL;
4499}
4500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004502static void
4503make_encode_exception(PyObject **exceptionObject,
4504 const char *encoding,
4505 const Py_UNICODE *unicode, Py_ssize_t size,
4506 Py_ssize_t startpos, Py_ssize_t endpos,
4507 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004510 *exceptionObject = PyUnicodeEncodeError_Create(
4511 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512 }
4513 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4515 goto onError;
4516 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4517 goto onError;
4518 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4519 goto onError;
4520 return;
4521 onError:
4522 Py_DECREF(*exceptionObject);
4523 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 }
4525}
4526
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004528static void
4529raise_encode_exception(PyObject **exceptionObject,
4530 const char *encoding,
4531 const Py_UNICODE *unicode, Py_ssize_t size,
4532 Py_ssize_t startpos, Py_ssize_t endpos,
4533 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534{
4535 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539}
4540
4541/* error handling callback helper:
4542 build arguments, call the callback and check the arguments,
4543 put the result into newpos and return the replacement string, which
4544 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004545static PyObject *
4546unicode_encode_call_errorhandler(const char *errors,
4547 PyObject **errorHandler,
4548 const char *encoding, const char *reason,
4549 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4550 Py_ssize_t startpos, Py_ssize_t endpos,
4551 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004553 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554
4555 PyObject *restuple;
4556 PyObject *resunicode;
4557
4558 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004559 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 }
4563
4564 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004565 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568
4569 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004572 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004574 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 Py_DECREF(restuple);
4576 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004578 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 &resunicode, newpos)) {
4580 Py_DECREF(restuple);
4581 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004583 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4584 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4585 Py_DECREF(restuple);
4586 return NULL;
4587 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004590 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4592 Py_DECREF(restuple);
4593 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004594 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595 Py_INCREF(resunicode);
4596 Py_DECREF(restuple);
4597 return resunicode;
4598}
4599
Alexander Belopolsky40018472011-02-26 01:02:56 +00004600static PyObject *
4601unicode_encode_ucs1(const Py_UNICODE *p,
4602 Py_ssize_t size,
4603 const char *errors,
4604 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605{
4606 /* output object */
4607 PyObject *res;
4608 /* pointers to the beginning and end+1 of input */
4609 const Py_UNICODE *startp = p;
4610 const Py_UNICODE *endp = p + size;
4611 /* pointer to the beginning of the unencodable characters */
4612 /* const Py_UNICODE *badp = NULL; */
4613 /* pointer into the output */
4614 char *str;
4615 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004616 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004617 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4618 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 PyObject *errorHandler = NULL;
4620 PyObject *exc = NULL;
4621 /* the following variable is used for caching string comparisons
4622 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4623 int known_errorHandler = -1;
4624
4625 /* allocate enough for a simple encoding without
4626 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004627 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004628 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004629 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004631 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004632 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 ressize = size;
4634
4635 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 /* can we encode this? */
4639 if (c<limit) {
4640 /* no overflow check, because we know that the space is enough */
4641 *str++ = (char)c;
4642 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004643 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 else {
4645 Py_ssize_t unicodepos = p-startp;
4646 Py_ssize_t requiredsize;
4647 PyObject *repunicode;
4648 Py_ssize_t repsize;
4649 Py_ssize_t newpos;
4650 Py_ssize_t respos;
4651 Py_UNICODE *uni2;
4652 /* startpos for collecting unencodable chars */
4653 const Py_UNICODE *collstart = p;
4654 const Py_UNICODE *collend = p;
4655 /* find all unecodable characters */
4656 while ((collend < endp) && ((*collend)>=limit))
4657 ++collend;
4658 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4659 if (known_errorHandler==-1) {
4660 if ((errors==NULL) || (!strcmp(errors, "strict")))
4661 known_errorHandler = 1;
4662 else if (!strcmp(errors, "replace"))
4663 known_errorHandler = 2;
4664 else if (!strcmp(errors, "ignore"))
4665 known_errorHandler = 3;
4666 else if (!strcmp(errors, "xmlcharrefreplace"))
4667 known_errorHandler = 4;
4668 else
4669 known_errorHandler = 0;
4670 }
4671 switch (known_errorHandler) {
4672 case 1: /* strict */
4673 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4674 goto onError;
4675 case 2: /* replace */
4676 while (collstart++<collend)
4677 *str++ = '?'; /* fall through */
4678 case 3: /* ignore */
4679 p = collend;
4680 break;
4681 case 4: /* xmlcharrefreplace */
4682 respos = str - PyBytes_AS_STRING(res);
4683 /* determine replacement size (temporarily (mis)uses p) */
4684 for (p = collstart, repsize = 0; p < collend; ++p) {
4685 if (*p<10)
4686 repsize += 2+1+1;
4687 else if (*p<100)
4688 repsize += 2+2+1;
4689 else if (*p<1000)
4690 repsize += 2+3+1;
4691 else if (*p<10000)
4692 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004693#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 else
4695 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004696#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 else if (*p<100000)
4698 repsize += 2+5+1;
4699 else if (*p<1000000)
4700 repsize += 2+6+1;
4701 else
4702 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004703#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 }
4705 requiredsize = respos+repsize+(endp-collend);
4706 if (requiredsize > ressize) {
4707 if (requiredsize<2*ressize)
4708 requiredsize = 2*ressize;
4709 if (_PyBytes_Resize(&res, requiredsize))
4710 goto onError;
4711 str = PyBytes_AS_STRING(res) + respos;
4712 ressize = requiredsize;
4713 }
4714 /* generate replacement (temporarily (mis)uses p) */
4715 for (p = collstart; p < collend; ++p) {
4716 str += sprintf(str, "&#%d;", (int)*p);
4717 }
4718 p = collend;
4719 break;
4720 default:
4721 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4722 encoding, reason, startp, size, &exc,
4723 collstart-startp, collend-startp, &newpos);
4724 if (repunicode == NULL)
4725 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004726 if (PyBytes_Check(repunicode)) {
4727 /* Directly copy bytes result to output. */
4728 repsize = PyBytes_Size(repunicode);
4729 if (repsize > 1) {
4730 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004731 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004732 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4733 Py_DECREF(repunicode);
4734 goto onError;
4735 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004736 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004737 ressize += repsize-1;
4738 }
4739 memcpy(str, PyBytes_AsString(repunicode), repsize);
4740 str += repsize;
4741 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004742 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004743 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004744 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004745 /* need more space? (at least enough for what we
4746 have+the replacement+the rest of the string, so
4747 we won't have to check space for encodable characters) */
4748 respos = str - PyBytes_AS_STRING(res);
4749 repsize = PyUnicode_GET_SIZE(repunicode);
4750 requiredsize = respos+repsize+(endp-collend);
4751 if (requiredsize > ressize) {
4752 if (requiredsize<2*ressize)
4753 requiredsize = 2*ressize;
4754 if (_PyBytes_Resize(&res, requiredsize)) {
4755 Py_DECREF(repunicode);
4756 goto onError;
4757 }
4758 str = PyBytes_AS_STRING(res) + respos;
4759 ressize = requiredsize;
4760 }
4761 /* check if there is anything unencodable in the replacement
4762 and copy it to the output */
4763 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4764 c = *uni2;
4765 if (c >= limit) {
4766 raise_encode_exception(&exc, encoding, startp, size,
4767 unicodepos, unicodepos+1, reason);
4768 Py_DECREF(repunicode);
4769 goto onError;
4770 }
4771 *str = (char)c;
4772 }
4773 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004774 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004775 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004776 }
4777 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004778 /* Resize if we allocated to much */
4779 size = str - PyBytes_AS_STRING(res);
4780 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004781 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004782 if (_PyBytes_Resize(&res, size) < 0)
4783 goto onError;
4784 }
4785
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 Py_XDECREF(errorHandler);
4787 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004788 return res;
4789
4790 onError:
4791 Py_XDECREF(res);
4792 Py_XDECREF(errorHandler);
4793 Py_XDECREF(exc);
4794 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795}
4796
Alexander Belopolsky40018472011-02-26 01:02:56 +00004797PyObject *
4798PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4799 Py_ssize_t size,
4800 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803}
4804
Alexander Belopolsky40018472011-02-26 01:02:56 +00004805PyObject *
4806PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807{
4808 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 PyErr_BadArgument();
4810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 }
4812 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 PyUnicode_GET_SIZE(unicode),
4814 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815}
4816
4817/* --- 7-bit ASCII Codec -------------------------------------------------- */
4818
Alexander Belopolsky40018472011-02-26 01:02:56 +00004819PyObject *
4820PyUnicode_DecodeASCII(const char *s,
4821 Py_ssize_t size,
4822 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825 PyUnicodeObject *v;
4826 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004827 Py_ssize_t startinpos;
4828 Py_ssize_t endinpos;
4829 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 const char *e;
4831 PyObject *errorHandler = NULL;
4832 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004833
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004835 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 Py_UNICODE r = *(unsigned char*)s;
4837 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004838 }
Tim Petersced69f82003-09-16 20:30:58 +00004839
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 v = _PyUnicode_New(size);
4841 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 e = s + size;
4847 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 register unsigned char c = (unsigned char)*s;
4849 if (c < 128) {
4850 *p++ = c;
4851 ++s;
4852 }
4853 else {
4854 startinpos = s-starts;
4855 endinpos = startinpos + 1;
4856 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4857 if (unicode_decode_call_errorhandler(
4858 errors, &errorHandler,
4859 "ascii", "ordinal not in range(128)",
4860 &starts, &e, &startinpos, &endinpos, &exc, &s,
4861 &v, &outpos, &p))
4862 goto onError;
4863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004865 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4867 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 Py_XDECREF(errorHandler);
4869 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004871
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004874 Py_XDECREF(errorHandler);
4875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 return NULL;
4877}
4878
Alexander Belopolsky40018472011-02-26 01:02:56 +00004879PyObject *
4880PyUnicode_EncodeASCII(const Py_UNICODE *p,
4881 Py_ssize_t size,
4882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004884 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885}
4886
Alexander Belopolsky40018472011-02-26 01:02:56 +00004887PyObject *
4888PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889{
4890 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 PyErr_BadArgument();
4892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 }
4894 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 PyUnicode_GET_SIZE(unicode),
4896 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897}
4898
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004899#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004900
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004901/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004902
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004903#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004904#define NEED_RETRY
4905#endif
4906
4907/* XXX This code is limited to "true" double-byte encodings, as
4908 a) it assumes an incomplete character consists of a single byte, and
4909 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004911
/* Return nonzero if the byte at s[offset] should be treated as a DBCS
   lead byte.  IsDBCSLeadByte() alone is not enough, so the character
   preceding it (found with CharPrev()) is examined as well. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
4923
4924/*
4925 * Decode MBCS string into unicode object. If 'final' is set, converts
4926 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4927 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004928static int
4929decode_mbcs(PyUnicodeObject **v,
4930 const char *s, /* MBCS string */
4931 int size, /* sizeof MBCS string */
4932 int final,
4933 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004934{
4935 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004936 Py_ssize_t n;
4937 DWORD usize;
4938 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004939
4940 assert(size >= 0);
4941
Victor Stinner554f3f02010-06-16 23:33:54 +00004942 /* check and handle 'errors' arg */
4943 if (errors==NULL || strcmp(errors, "strict")==0)
4944 flags = MB_ERR_INVALID_CHARS;
4945 else if (strcmp(errors, "ignore")==0)
4946 flags = 0;
4947 else {
4948 PyErr_Format(PyExc_ValueError,
4949 "mbcs encoding does not support errors='%s'",
4950 errors);
4951 return -1;
4952 }
4953
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004954 /* Skip trailing lead-byte unless 'final' is set */
4955 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004957
4958 /* First get the size of the result */
4959 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004960 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4961 if (usize==0)
4962 goto mbcs_decode_error;
4963 } else
4964 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004965
4966 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 /* Create unicode object */
4968 *v = _PyUnicode_New(usize);
4969 if (*v == NULL)
4970 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004971 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004972 }
4973 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 /* Extend unicode object */
4975 n = PyUnicode_GET_SIZE(*v);
4976 if (_PyUnicode_Resize(v, n + usize) < 0)
4977 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004978 }
4979
4980 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004981 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004983 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4984 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004986 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004987 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004988
4989mbcs_decode_error:
4990 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4991 we raise a UnicodeDecodeError - else it is a 'generic'
4992 windows error
4993 */
4994 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4995 /* Ideally, we should get reason from FormatMessage - this
4996 is the Windows 2000 English version of the message
4997 */
4998 PyObject *exc = NULL;
4999 const char *reason = "No mapping for the Unicode character exists "
5000 "in the target multi-byte code page.";
5001 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5002 if (exc != NULL) {
5003 PyCodec_StrictErrors(exc);
5004 Py_DECREF(exc);
5005 }
5006 } else {
5007 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5008 }
5009 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005010}
5011
Alexander Belopolsky40018472011-02-26 01:02:56 +00005012PyObject *
5013PyUnicode_DecodeMBCSStateful(const char *s,
5014 Py_ssize_t size,
5015 const char *errors,
5016 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005017{
5018 PyUnicodeObject *v = NULL;
5019 int done;
5020
5021 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005023
5024#ifdef NEED_RETRY
5025 retry:
5026 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005027 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005028 else
5029#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005030 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005031
5032 if (done < 0) {
5033 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005035 }
5036
5037 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005039
5040#ifdef NEED_RETRY
5041 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 s += done;
5043 size -= done;
5044 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005045 }
5046#endif
5047
5048 return (PyObject *)v;
5049}
5050
Alexander Belopolsky40018472011-02-26 01:02:56 +00005051PyObject *
5052PyUnicode_DecodeMBCS(const char *s,
5053 Py_ssize_t size,
5054 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005055{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005056 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5057}
5058
5059/*
5060 * Convert unicode into string object (MBCS).
5061 * Returns 0 if succeed, -1 otherwise.
5062 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005063static int
5064encode_mbcs(PyObject **repr,
5065 const Py_UNICODE *p, /* unicode */
5066 int size, /* size of unicode */
5067 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005068{
Victor Stinner554f3f02010-06-16 23:33:54 +00005069 BOOL usedDefaultChar = FALSE;
5070 BOOL *pusedDefaultChar;
5071 int mbcssize;
5072 Py_ssize_t n;
5073 PyObject *exc = NULL;
5074 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005075
5076 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005077
Victor Stinner554f3f02010-06-16 23:33:54 +00005078 /* check and handle 'errors' arg */
5079 if (errors==NULL || strcmp(errors, "strict")==0) {
5080 flags = WC_NO_BEST_FIT_CHARS;
5081 pusedDefaultChar = &usedDefaultChar;
5082 } else if (strcmp(errors, "replace")==0) {
5083 flags = 0;
5084 pusedDefaultChar = NULL;
5085 } else {
5086 PyErr_Format(PyExc_ValueError,
5087 "mbcs encoding does not support errors='%s'",
5088 errors);
5089 return -1;
5090 }
5091
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005092 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005093 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005094 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5095 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 if (mbcssize == 0) {
5097 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5098 return -1;
5099 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005100 /* If we used a default char, then we failed! */
5101 if (pusedDefaultChar && *pusedDefaultChar)
5102 goto mbcs_encode_error;
5103 } else {
5104 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005105 }
5106
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005107 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 /* Create string object */
5109 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5110 if (*repr == NULL)
5111 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005112 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005113 }
5114 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 /* Extend string object */
5116 n = PyBytes_Size(*repr);
5117 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5118 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005119 }
5120
5121 /* Do the conversion */
5122 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005123 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005124 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5125 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5127 return -1;
5128 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005129 if (pusedDefaultChar && *pusedDefaultChar)
5130 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005131 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005132 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005133
5134mbcs_encode_error:
5135 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5136 Py_XDECREF(exc);
5137 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005138}
5139
Alexander Belopolsky40018472011-02-26 01:02:56 +00005140PyObject *
5141PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5142 Py_ssize_t size,
5143 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005144{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005145 PyObject *repr = NULL;
5146 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005147
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005148#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005150 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005151 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005152 else
5153#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005154 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005155
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005156 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 Py_XDECREF(repr);
5158 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005159 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005160
5161#ifdef NEED_RETRY
5162 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 p += INT_MAX;
5164 size -= INT_MAX;
5165 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005166 }
5167#endif
5168
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005169 return repr;
5170}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005171
Alexander Belopolsky40018472011-02-26 01:02:56 +00005172PyObject *
5173PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005174{
5175 if (!PyUnicode_Check(unicode)) {
5176 PyErr_BadArgument();
5177 return NULL;
5178 }
5179 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 PyUnicode_GET_SIZE(unicode),
5181 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005182}
5183
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005184#undef NEED_RETRY
5185
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005186#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005187
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188/* --- Character Mapping Codec -------------------------------------------- */
5189
Alexander Belopolsky40018472011-02-26 01:02:56 +00005190PyObject *
5191PyUnicode_DecodeCharmap(const char *s,
5192 Py_ssize_t size,
5193 PyObject *mapping,
5194 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005197 Py_ssize_t startinpos;
5198 Py_ssize_t endinpos;
5199 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005200 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 PyUnicodeObject *v;
5202 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005204 PyObject *errorHandler = NULL;
5205 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005206 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005207 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005208
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 /* Default to Latin-1 */
5210 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
5213 v = _PyUnicode_New(size);
5214 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005219 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005220 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 mapstring = PyUnicode_AS_UNICODE(mapping);
5222 maplen = PyUnicode_GET_SIZE(mapping);
5223 while (s < e) {
5224 unsigned char ch = *s;
5225 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 if (ch < maplen)
5228 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 if (x == 0xfffe) {
5231 /* undefined mapping */
5232 outpos = p-PyUnicode_AS_UNICODE(v);
5233 startinpos = s-starts;
5234 endinpos = startinpos+1;
5235 if (unicode_decode_call_errorhandler(
5236 errors, &errorHandler,
5237 "charmap", "character maps to <undefined>",
5238 &starts, &e, &startinpos, &endinpos, &exc, &s,
5239 &v, &outpos, &p)) {
5240 goto onError;
5241 }
5242 continue;
5243 }
5244 *p++ = x;
5245 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005246 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005247 }
5248 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 while (s < e) {
5250 unsigned char ch = *s;
5251 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005252
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5254 w = PyLong_FromLong((long)ch);
5255 if (w == NULL)
5256 goto onError;
5257 x = PyObject_GetItem(mapping, w);
5258 Py_DECREF(w);
5259 if (x == NULL) {
5260 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5261 /* No mapping found means: mapping is undefined. */
5262 PyErr_Clear();
5263 x = Py_None;
5264 Py_INCREF(x);
5265 } else
5266 goto onError;
5267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005268
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 /* Apply mapping */
5270 if (PyLong_Check(x)) {
5271 long value = PyLong_AS_LONG(x);
5272 if (value < 0 || value > 65535) {
5273 PyErr_SetString(PyExc_TypeError,
5274 "character mapping must be in range(65536)");
5275 Py_DECREF(x);
5276 goto onError;
5277 }
5278 *p++ = (Py_UNICODE)value;
5279 }
5280 else if (x == Py_None) {
5281 /* undefined mapping */
5282 outpos = p-PyUnicode_AS_UNICODE(v);
5283 startinpos = s-starts;
5284 endinpos = startinpos+1;
5285 if (unicode_decode_call_errorhandler(
5286 errors, &errorHandler,
5287 "charmap", "character maps to <undefined>",
5288 &starts, &e, &startinpos, &endinpos, &exc, &s,
5289 &v, &outpos, &p)) {
5290 Py_DECREF(x);
5291 goto onError;
5292 }
5293 Py_DECREF(x);
5294 continue;
5295 }
5296 else if (PyUnicode_Check(x)) {
5297 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005298
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 if (targetsize == 1)
5300 /* 1-1 mapping */
5301 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005302
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 else if (targetsize > 1) {
5304 /* 1-n mapping */
5305 if (targetsize > extrachars) {
5306 /* resize first */
5307 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5308 Py_ssize_t needed = (targetsize - extrachars) + \
5309 (targetsize << 2);
5310 extrachars += needed;
5311 /* XXX overflow detection missing */
5312 if (_PyUnicode_Resize(&v,
5313 PyUnicode_GET_SIZE(v) + needed) < 0) {
5314 Py_DECREF(x);
5315 goto onError;
5316 }
5317 p = PyUnicode_AS_UNICODE(v) + oldpos;
5318 }
5319 Py_UNICODE_COPY(p,
5320 PyUnicode_AS_UNICODE(x),
5321 targetsize);
5322 p += targetsize;
5323 extrachars -= targetsize;
5324 }
5325 /* 1-0 mapping: skip the character */
5326 }
5327 else {
5328 /* wrong return value */
5329 PyErr_SetString(PyExc_TypeError,
5330 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005331 Py_DECREF(x);
5332 goto onError;
5333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 Py_DECREF(x);
5335 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 }
5338 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5340 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005341 Py_XDECREF(errorHandler);
5342 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005344
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 Py_XDECREF(errorHandler);
5347 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 Py_XDECREF(v);
5349 return NULL;
5350}
5351
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005352/* Charmap encoding: the lookup table */
5353
Alexander Belopolsky40018472011-02-26 01:02:56 +00005354struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 PyObject_HEAD
5356 unsigned char level1[32];
5357 int count2, count3;
5358 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005359};
5360
5361static PyObject*
5362encoding_map_size(PyObject *obj, PyObject* args)
5363{
5364 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005365 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005367}
5368
5369static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005370 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 PyDoc_STR("Return the size (in bytes) of this object") },
5372 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005373};
5374
5375static void
5376encoding_map_dealloc(PyObject* o)
5377{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005378 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005379}
5380
5381static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005382 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 "EncodingMap", /*tp_name*/
5384 sizeof(struct encoding_map), /*tp_basicsize*/
5385 0, /*tp_itemsize*/
5386 /* methods */
5387 encoding_map_dealloc, /*tp_dealloc*/
5388 0, /*tp_print*/
5389 0, /*tp_getattr*/
5390 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005391 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 0, /*tp_repr*/
5393 0, /*tp_as_number*/
5394 0, /*tp_as_sequence*/
5395 0, /*tp_as_mapping*/
5396 0, /*tp_hash*/
5397 0, /*tp_call*/
5398 0, /*tp_str*/
5399 0, /*tp_getattro*/
5400 0, /*tp_setattro*/
5401 0, /*tp_as_buffer*/
5402 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5403 0, /*tp_doc*/
5404 0, /*tp_traverse*/
5405 0, /*tp_clear*/
5406 0, /*tp_richcompare*/
5407 0, /*tp_weaklistoffset*/
5408 0, /*tp_iter*/
5409 0, /*tp_iternext*/
5410 encoding_map_methods, /*tp_methods*/
5411 0, /*tp_members*/
5412 0, /*tp_getset*/
5413 0, /*tp_base*/
5414 0, /*tp_dict*/
5415 0, /*tp_descr_get*/
5416 0, /*tp_descr_set*/
5417 0, /*tp_dictoffset*/
5418 0, /*tp_init*/
5419 0, /*tp_alloc*/
5420 0, /*tp_new*/
5421 0, /*tp_free*/
5422 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005423};
5424
5425PyObject*
5426PyUnicode_BuildEncodingMap(PyObject* string)
5427{
5428 Py_UNICODE *decode;
5429 PyObject *result;
5430 struct encoding_map *mresult;
5431 int i;
5432 int need_dict = 0;
5433 unsigned char level1[32];
5434 unsigned char level2[512];
5435 unsigned char *mlevel1, *mlevel2, *mlevel3;
5436 int count2 = 0, count3 = 0;
5437
5438 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5439 PyErr_BadArgument();
5440 return NULL;
5441 }
5442 decode = PyUnicode_AS_UNICODE(string);
5443 memset(level1, 0xFF, sizeof level1);
5444 memset(level2, 0xFF, sizeof level2);
5445
5446 /* If there isn't a one-to-one mapping of NULL to \0,
5447 or if there are non-BMP characters, we need to use
5448 a mapping dictionary. */
5449 if (decode[0] != 0)
5450 need_dict = 1;
5451 for (i = 1; i < 256; i++) {
5452 int l1, l2;
5453 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005454#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005455 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005456#endif
5457 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005458 need_dict = 1;
5459 break;
5460 }
5461 if (decode[i] == 0xFFFE)
5462 /* unmapped character */
5463 continue;
5464 l1 = decode[i] >> 11;
5465 l2 = decode[i] >> 7;
5466 if (level1[l1] == 0xFF)
5467 level1[l1] = count2++;
5468 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005469 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005470 }
5471
5472 if (count2 >= 0xFF || count3 >= 0xFF)
5473 need_dict = 1;
5474
5475 if (need_dict) {
5476 PyObject *result = PyDict_New();
5477 PyObject *key, *value;
5478 if (!result)
5479 return NULL;
5480 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005481 key = PyLong_FromLong(decode[i]);
5482 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005483 if (!key || !value)
5484 goto failed1;
5485 if (PyDict_SetItem(result, key, value) == -1)
5486 goto failed1;
5487 Py_DECREF(key);
5488 Py_DECREF(value);
5489 }
5490 return result;
5491 failed1:
5492 Py_XDECREF(key);
5493 Py_XDECREF(value);
5494 Py_DECREF(result);
5495 return NULL;
5496 }
5497
5498 /* Create a three-level trie */
5499 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5500 16*count2 + 128*count3 - 1);
5501 if (!result)
5502 return PyErr_NoMemory();
5503 PyObject_Init(result, &EncodingMapType);
5504 mresult = (struct encoding_map*)result;
5505 mresult->count2 = count2;
5506 mresult->count3 = count3;
5507 mlevel1 = mresult->level1;
5508 mlevel2 = mresult->level23;
5509 mlevel3 = mresult->level23 + 16*count2;
5510 memcpy(mlevel1, level1, 32);
5511 memset(mlevel2, 0xFF, 16*count2);
5512 memset(mlevel3, 0, 128*count3);
5513 count3 = 0;
5514 for (i = 1; i < 256; i++) {
5515 int o1, o2, o3, i2, i3;
5516 if (decode[i] == 0xFFFE)
5517 /* unmapped character */
5518 continue;
5519 o1 = decode[i]>>11;
5520 o2 = (decode[i]>>7) & 0xF;
5521 i2 = 16*mlevel1[o1] + o2;
5522 if (mlevel2[i2] == 0xFF)
5523 mlevel2[i2] = count3++;
5524 o3 = decode[i] & 0x7F;
5525 i3 = 128*mlevel2[i2] + o3;
5526 mlevel3[i3] = i;
5527 }
5528 return result;
5529}
5530
5531static int
5532encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5533{
5534 struct encoding_map *map = (struct encoding_map*)mapping;
5535 int l1 = c>>11;
5536 int l2 = (c>>7) & 0xF;
5537 int l3 = c & 0x7F;
5538 int i;
5539
5540#ifdef Py_UNICODE_WIDE
5541 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005543 }
5544#endif
5545 if (c == 0)
5546 return 0;
5547 /* level 1*/
5548 i = map->level1[l1];
5549 if (i == 0xFF) {
5550 return -1;
5551 }
5552 /* level 2*/
5553 i = map->level23[16*i+l2];
5554 if (i == 0xFF) {
5555 return -1;
5556 }
5557 /* level 3 */
5558 i = map->level23[16*map->count2 + 128*i + l3];
5559 if (i == 0) {
5560 return -1;
5561 }
5562 return i;
5563}
5564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565/* Lookup the character ch in the mapping. If the character
5566 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005567 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005568static PyObject *
5569charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570{
Christian Heimes217cfd12007-12-02 14:31:20 +00005571 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 PyObject *x;
5573
5574 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 x = PyObject_GetItem(mapping, w);
5577 Py_DECREF(w);
5578 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5580 /* No mapping found means: mapping is undefined. */
5581 PyErr_Clear();
5582 x = Py_None;
5583 Py_INCREF(x);
5584 return x;
5585 } else
5586 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005588 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005590 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 long value = PyLong_AS_LONG(x);
5592 if (value < 0 || value > 255) {
5593 PyErr_SetString(PyExc_TypeError,
5594 "character mapping must be in range(256)");
5595 Py_DECREF(x);
5596 return NULL;
5597 }
5598 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005600 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 /* wrong return value */
5604 PyErr_Format(PyExc_TypeError,
5605 "character mapping must return integer, bytes or None, not %.400s",
5606 x->ob_type->tp_name);
5607 Py_DECREF(x);
5608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 }
5610}
5611
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005612static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005613charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005614{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005615 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5616 /* exponentially overallocate to minimize reallocations */
5617 if (requiredsize < 2*outsize)
5618 requiredsize = 2*outsize;
5619 if (_PyBytes_Resize(outobj, requiredsize))
5620 return -1;
5621 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005622}
5623
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005628 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 space is available. Return a new reference to the object that
5630 was put in the output buffer, or Py_None, if the mapping was undefined
5631 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005632 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005633static charmapencode_result
5634charmapencode_output(Py_UNICODE c, PyObject *mapping,
5635 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005637 PyObject *rep;
5638 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005639 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640
Christian Heimes90aa7642007-12-19 02:45:37 +00005641 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005642 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005643 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005644 if (res == -1)
5645 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 if (outsize<requiredsize)
5647 if (charmapencode_resize(outobj, outpos, requiredsize))
5648 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005649 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 outstart[(*outpos)++] = (char)res;
5651 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005652 }
5653
5654 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005657 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 Py_DECREF(rep);
5659 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005660 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 if (PyLong_Check(rep)) {
5662 Py_ssize_t requiredsize = *outpos+1;
5663 if (outsize<requiredsize)
5664 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5665 Py_DECREF(rep);
5666 return enc_EXCEPTION;
5667 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005668 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005670 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 else {
5672 const char *repchars = PyBytes_AS_STRING(rep);
5673 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5674 Py_ssize_t requiredsize = *outpos+repsize;
5675 if (outsize<requiredsize)
5676 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5677 Py_DECREF(rep);
5678 return enc_EXCEPTION;
5679 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005680 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 memcpy(outstart + *outpos, repchars, repsize);
5682 *outpos += repsize;
5683 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005685 Py_DECREF(rep);
5686 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687}
5688
5689/* handle an error in PyUnicode_EncodeCharmap
5690 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005691static int
5692charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005693 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005695 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005696 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697{
5698 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005699 Py_ssize_t repsize;
5700 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 Py_UNICODE *uni2;
5702 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005703 Py_ssize_t collstartpos = *inpos;
5704 Py_ssize_t collendpos = *inpos+1;
5705 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706 char *encoding = "charmap";
5707 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005708 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 /* find all unencodable characters */
5711 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005712 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005713 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 int res = encoding_map_lookup(p[collendpos], mapping);
5715 if (res != -1)
5716 break;
5717 ++collendpos;
5718 continue;
5719 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005720
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 rep = charmapencode_lookup(p[collendpos], mapping);
5722 if (rep==NULL)
5723 return -1;
5724 else if (rep!=Py_None) {
5725 Py_DECREF(rep);
5726 break;
5727 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005728 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730 }
5731 /* cache callback name lookup
5732 * (if not done yet, i.e. it's the first error) */
5733 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 if ((errors==NULL) || (!strcmp(errors, "strict")))
5735 *known_errorHandler = 1;
5736 else if (!strcmp(errors, "replace"))
5737 *known_errorHandler = 2;
5738 else if (!strcmp(errors, "ignore"))
5739 *known_errorHandler = 3;
5740 else if (!strcmp(errors, "xmlcharrefreplace"))
5741 *known_errorHandler = 4;
5742 else
5743 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 }
5745 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005746 case 1: /* strict */
5747 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5748 return -1;
5749 case 2: /* replace */
5750 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 x = charmapencode_output('?', mapping, res, respos);
5752 if (x==enc_EXCEPTION) {
5753 return -1;
5754 }
5755 else if (x==enc_FAILED) {
5756 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5757 return -1;
5758 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005759 }
5760 /* fall through */
5761 case 3: /* ignore */
5762 *inpos = collendpos;
5763 break;
5764 case 4: /* xmlcharrefreplace */
5765 /* generate replacement (temporarily (mis)uses p) */
5766 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 char buffer[2+29+1+1];
5768 char *cp;
5769 sprintf(buffer, "&#%d;", (int)p[collpos]);
5770 for (cp = buffer; *cp; ++cp) {
5771 x = charmapencode_output(*cp, mapping, res, respos);
5772 if (x==enc_EXCEPTION)
5773 return -1;
5774 else if (x==enc_FAILED) {
5775 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5776 return -1;
5777 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005778 }
5779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005780 *inpos = collendpos;
5781 break;
5782 default:
5783 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 encoding, reason, p, size, exceptionObject,
5785 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005786 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005788 if (PyBytes_Check(repunicode)) {
5789 /* Directly copy bytes result to output. */
5790 Py_ssize_t outsize = PyBytes_Size(*res);
5791 Py_ssize_t requiredsize;
5792 repsize = PyBytes_Size(repunicode);
5793 requiredsize = *respos + repsize;
5794 if (requiredsize > outsize)
5795 /* Make room for all additional bytes. */
5796 if (charmapencode_resize(res, respos, requiredsize)) {
5797 Py_DECREF(repunicode);
5798 return -1;
5799 }
5800 memcpy(PyBytes_AsString(*res) + *respos,
5801 PyBytes_AsString(repunicode), repsize);
5802 *respos += repsize;
5803 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005804 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005805 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005806 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005807 /* generate replacement */
5808 repsize = PyUnicode_GET_SIZE(repunicode);
5809 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 x = charmapencode_output(*uni2, mapping, res, respos);
5811 if (x==enc_EXCEPTION) {
5812 return -1;
5813 }
5814 else if (x==enc_FAILED) {
5815 Py_DECREF(repunicode);
5816 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5817 return -1;
5818 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005819 }
5820 *inpos = newpos;
5821 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005822 }
5823 return 0;
5824}
5825
Alexander Belopolsky40018472011-02-26 01:02:56 +00005826PyObject *
5827PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5828 Py_ssize_t size,
5829 PyObject *mapping,
5830 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005832 /* output object */
5833 PyObject *res = NULL;
5834 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005835 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005836 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005837 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005838 PyObject *errorHandler = NULL;
5839 PyObject *exc = NULL;
5840 /* the following variable is used for caching string comparisons
5841 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5842 * 3=ignore, 4=xmlcharrefreplace */
5843 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
5845 /* Default to Latin-1 */
5846 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 /* allocate enough for a simple encoding without
5850 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005851 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 if (res == NULL)
5853 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005854 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 /* try to encode it */
5859 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5860 if (x==enc_EXCEPTION) /* error */
5861 goto onError;
5862 if (x==enc_FAILED) { /* unencodable character */
5863 if (charmap_encoding_error(p, size, &inpos, mapping,
5864 &exc,
5865 &known_errorHandler, &errorHandler, errors,
5866 &res, &respos)) {
5867 goto onError;
5868 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005869 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 else
5871 /* done with this character => adjust input position */
5872 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005876 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005877 if (_PyBytes_Resize(&res, respos) < 0)
5878 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 Py_XDECREF(exc);
5881 Py_XDECREF(errorHandler);
5882 return res;
5883
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885 Py_XDECREF(res);
5886 Py_XDECREF(exc);
5887 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 return NULL;
5889}
5890
Alexander Belopolsky40018472011-02-26 01:02:56 +00005891PyObject *
5892PyUnicode_AsCharmapString(PyObject *unicode,
5893 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894{
5895 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 PyErr_BadArgument();
5897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 }
5899 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 PyUnicode_GET_SIZE(unicode),
5901 mapping,
5902 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903}
5904
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005906static void
5907make_translate_exception(PyObject **exceptionObject,
5908 const Py_UNICODE *unicode, Py_ssize_t size,
5909 Py_ssize_t startpos, Py_ssize_t endpos,
5910 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005912 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005913 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 }
5916 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5918 goto onError;
5919 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5920 goto onError;
5921 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5922 goto onError;
5923 return;
5924 onError:
5925 Py_DECREF(*exceptionObject);
5926 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 }
5928}
5929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005931static void
5932raise_translate_exception(PyObject **exceptionObject,
5933 const Py_UNICODE *unicode, Py_ssize_t size,
5934 Py_ssize_t startpos, Py_ssize_t endpos,
5935 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936{
5937 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005939 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941}
5942
5943/* error handling callback helper:
5944 build arguments, call the callback and check the arguments,
5945 put the result into newpos and return the replacement string, which
5946 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005947static PyObject *
5948unicode_translate_call_errorhandler(const char *errors,
5949 PyObject **errorHandler,
5950 const char *reason,
5951 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5952 Py_ssize_t startpos, Py_ssize_t endpos,
5953 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005954{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005955 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005957 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958 PyObject *restuple;
5959 PyObject *resunicode;
5960
5961 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005965 }
5966
5967 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005969 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005971
5972 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005974 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005977 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 Py_DECREF(restuple);
5979 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980 }
5981 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 &resunicode, &i_newpos)) {
5983 Py_DECREF(restuple);
5984 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005986 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005988 else
5989 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005990 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5992 Py_DECREF(restuple);
5993 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005994 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005995 Py_INCREF(resunicode);
5996 Py_DECREF(restuple);
5997 return resunicode;
5998}
5999
6000/* Lookup the character ch in the mapping and put the result in result,
6001 which must be decrefed by the caller.
6002 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006003static int
6004charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005{
Christian Heimes217cfd12007-12-02 14:31:20 +00006006 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 PyObject *x;
6008
6009 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 x = PyObject_GetItem(mapping, w);
6012 Py_DECREF(w);
6013 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6015 /* No mapping found means: use 1:1 mapping. */
6016 PyErr_Clear();
6017 *result = NULL;
6018 return 0;
6019 } else
6020 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 }
6022 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 *result = x;
6024 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006025 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006026 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 long value = PyLong_AS_LONG(x);
6028 long max = PyUnicode_GetMax();
6029 if (value < 0 || value > max) {
6030 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006031 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 Py_DECREF(x);
6033 return -1;
6034 }
6035 *result = x;
6036 return 0;
6037 }
6038 else if (PyUnicode_Check(x)) {
6039 *result = x;
6040 return 0;
6041 }
6042 else {
6043 /* wrong return value */
6044 PyErr_SetString(PyExc_TypeError,
6045 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006046 Py_DECREF(x);
6047 return -1;
6048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049}
6050/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 if not reallocate and adjust various state variables.
6052 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006053static int
6054charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006057 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006058 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 /* remember old output position */
6060 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6061 /* exponentially overallocate to minimize reallocations */
6062 if (requiredsize < 2 * oldsize)
6063 requiredsize = 2 * oldsize;
6064 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6065 return -1;
6066 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067 }
6068 return 0;
6069}
6070/* lookup the character, put the result in the output string and adjust
6071 various state variables. Return a new reference to the object that
6072 was put in the output buffer in *result, or Py_None, if the mapping was
6073 undefined (in which case no character was written).
6074 The called must decref result.
6075 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006076static int
6077charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6078 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6079 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080{
Walter Dörwald4894c302003-10-24 14:25:28 +00006081 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 /* not found => default to 1:1 mapping */
6085 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 }
6087 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006089 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 /* no overflow check, because we know that the space is enough */
6091 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 }
6093 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6095 if (repsize==1) {
6096 /* no overflow check, because we know that the space is enough */
6097 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6098 }
6099 else if (repsize!=0) {
6100 /* more than one character */
6101 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6102 (insize - (curinp-startinp)) +
6103 repsize - 1;
6104 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6105 return -1;
6106 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6107 *outp += repsize;
6108 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006109 }
6110 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 return 0;
6113}
6114
Alexander Belopolsky40018472011-02-26 01:02:56 +00006115PyObject *
6116PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6117 Py_ssize_t size,
6118 PyObject *mapping,
6119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 /* output object */
6122 PyObject *res = NULL;
6123 /* pointers to the beginning and end+1 of input */
6124 const Py_UNICODE *startp = p;
6125 const Py_UNICODE *endp = p + size;
6126 /* pointer into the output */
6127 Py_UNICODE *str;
6128 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006130 char *reason = "character maps to <undefined>";
6131 PyObject *errorHandler = NULL;
6132 PyObject *exc = NULL;
6133 /* the following variable is used for caching string comparisons
6134 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6135 * 3=ignore, 4=xmlcharrefreplace */
6136 int known_errorHandler = -1;
6137
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 PyErr_BadArgument();
6140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006142
6143 /* allocate enough for a simple 1:1 translation without
6144 replacements, if we need more, we'll resize */
6145 res = PyUnicode_FromUnicode(NULL, size);
6146 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006150 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 /* try to encode it */
6154 PyObject *x = NULL;
6155 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6156 Py_XDECREF(x);
6157 goto onError;
6158 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006159 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 if (x!=Py_None) /* it worked => adjust input pointer */
6161 ++p;
6162 else { /* untranslatable character */
6163 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6164 Py_ssize_t repsize;
6165 Py_ssize_t newpos;
6166 Py_UNICODE *uni2;
6167 /* startpos for collecting untranslatable chars */
6168 const Py_UNICODE *collstart = p;
6169 const Py_UNICODE *collend = p+1;
6170 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 /* find all untranslatable characters */
6173 while (collend < endp) {
6174 if (charmaptranslate_lookup(*collend, mapping, &x))
6175 goto onError;
6176 Py_XDECREF(x);
6177 if (x!=Py_None)
6178 break;
6179 ++collend;
6180 }
6181 /* cache callback name lookup
6182 * (if not done yet, i.e. it's the first error) */
6183 if (known_errorHandler==-1) {
6184 if ((errors==NULL) || (!strcmp(errors, "strict")))
6185 known_errorHandler = 1;
6186 else if (!strcmp(errors, "replace"))
6187 known_errorHandler = 2;
6188 else if (!strcmp(errors, "ignore"))
6189 known_errorHandler = 3;
6190 else if (!strcmp(errors, "xmlcharrefreplace"))
6191 known_errorHandler = 4;
6192 else
6193 known_errorHandler = 0;
6194 }
6195 switch (known_errorHandler) {
6196 case 1: /* strict */
6197 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006198 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 case 2: /* replace */
6200 /* No need to check for space, this is a 1:1 replacement */
6201 for (coll = collstart; coll<collend; ++coll)
6202 *str++ = '?';
6203 /* fall through */
6204 case 3: /* ignore */
6205 p = collend;
6206 break;
6207 case 4: /* xmlcharrefreplace */
6208 /* generate replacement (temporarily (mis)uses p) */
6209 for (p = collstart; p < collend; ++p) {
6210 char buffer[2+29+1+1];
6211 char *cp;
6212 sprintf(buffer, "&#%d;", (int)*p);
6213 if (charmaptranslate_makespace(&res, &str,
6214 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6215 goto onError;
6216 for (cp = buffer; *cp; ++cp)
6217 *str++ = *cp;
6218 }
6219 p = collend;
6220 break;
6221 default:
6222 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6223 reason, startp, size, &exc,
6224 collstart-startp, collend-startp, &newpos);
6225 if (repunicode == NULL)
6226 goto onError;
6227 /* generate replacement */
6228 repsize = PyUnicode_GET_SIZE(repunicode);
6229 if (charmaptranslate_makespace(&res, &str,
6230 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6231 Py_DECREF(repunicode);
6232 goto onError;
6233 }
6234 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6235 *str++ = *uni2;
6236 p = startp + newpos;
6237 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006238 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006239 }
6240 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006241 /* Resize if we allocated to much */
6242 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006243 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 if (PyUnicode_Resize(&res, respos) < 0)
6245 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246 }
6247 Py_XDECREF(exc);
6248 Py_XDECREF(errorHandler);
6249 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006252 Py_XDECREF(res);
6253 Py_XDECREF(exc);
6254 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 return NULL;
6256}
6257
Alexander Belopolsky40018472011-02-26 01:02:56 +00006258PyObject *
6259PyUnicode_Translate(PyObject *str,
6260 PyObject *mapping,
6261 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262{
6263 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006264
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 str = PyUnicode_FromObject(str);
6266 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 PyUnicode_GET_SIZE(str),
6270 mapping,
6271 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 Py_DECREF(str);
6273 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006274
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 Py_XDECREF(str);
6277 return NULL;
6278}
Tim Petersced69f82003-09-16 20:30:58 +00006279
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006280PyObject *
6281PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6282 Py_ssize_t length)
6283{
6284 PyObject *result;
6285 Py_UNICODE *p; /* write pointer into result */
6286 Py_ssize_t i;
6287 /* Copy to a new string */
6288 result = (PyObject *)_PyUnicode_New(length);
6289 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6290 if (result == NULL)
6291 return result;
6292 p = PyUnicode_AS_UNICODE(result);
6293 /* Iterate over code points */
6294 for (i = 0; i < length; i++) {
6295 Py_UNICODE ch =s[i];
6296 if (ch > 127) {
6297 int decimal = Py_UNICODE_TODECIMAL(ch);
6298 if (decimal >= 0)
6299 p[i] = '0' + decimal;
6300 }
6301 }
6302 return result;
6303}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006304/* --- Decimal Encoder ---------------------------------------------------- */
6305
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306int
6307PyUnicode_EncodeDecimal(Py_UNICODE *s,
6308 Py_ssize_t length,
6309 char *output,
6310 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006311{
6312 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006313 PyObject *errorHandler = NULL;
6314 PyObject *exc = NULL;
6315 const char *encoding = "decimal";
6316 const char *reason = "invalid decimal Unicode string";
6317 /* the following variable is used for caching string comparisons
6318 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6319 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006320
6321 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 PyErr_BadArgument();
6323 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006324 }
6325
6326 p = s;
6327 end = s + length;
6328 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 register Py_UNICODE ch = *p;
6330 int decimal;
6331 PyObject *repunicode;
6332 Py_ssize_t repsize;
6333 Py_ssize_t newpos;
6334 Py_UNICODE *uni2;
6335 Py_UNICODE *collstart;
6336 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006337
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006339 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 ++p;
6341 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006342 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 decimal = Py_UNICODE_TODECIMAL(ch);
6344 if (decimal >= 0) {
6345 *output++ = '0' + decimal;
6346 ++p;
6347 continue;
6348 }
6349 if (0 < ch && ch < 256) {
6350 *output++ = (char)ch;
6351 ++p;
6352 continue;
6353 }
6354 /* All other characters are considered unencodable */
6355 collstart = p;
6356 collend = p+1;
6357 while (collend < end) {
6358 if ((0 < *collend && *collend < 256) ||
6359 !Py_UNICODE_ISSPACE(*collend) ||
6360 Py_UNICODE_TODECIMAL(*collend))
6361 break;
6362 }
6363 /* cache callback name lookup
6364 * (if not done yet, i.e. it's the first error) */
6365 if (known_errorHandler==-1) {
6366 if ((errors==NULL) || (!strcmp(errors, "strict")))
6367 known_errorHandler = 1;
6368 else if (!strcmp(errors, "replace"))
6369 known_errorHandler = 2;
6370 else if (!strcmp(errors, "ignore"))
6371 known_errorHandler = 3;
6372 else if (!strcmp(errors, "xmlcharrefreplace"))
6373 known_errorHandler = 4;
6374 else
6375 known_errorHandler = 0;
6376 }
6377 switch (known_errorHandler) {
6378 case 1: /* strict */
6379 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6380 goto onError;
6381 case 2: /* replace */
6382 for (p = collstart; p < collend; ++p)
6383 *output++ = '?';
6384 /* fall through */
6385 case 3: /* ignore */
6386 p = collend;
6387 break;
6388 case 4: /* xmlcharrefreplace */
6389 /* generate replacement (temporarily (mis)uses p) */
6390 for (p = collstart; p < collend; ++p)
6391 output += sprintf(output, "&#%d;", (int)*p);
6392 p = collend;
6393 break;
6394 default:
6395 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6396 encoding, reason, s, length, &exc,
6397 collstart-s, collend-s, &newpos);
6398 if (repunicode == NULL)
6399 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006400 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006401 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006402 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6403 Py_DECREF(repunicode);
6404 goto onError;
6405 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 /* generate replacement */
6407 repsize = PyUnicode_GET_SIZE(repunicode);
6408 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6409 Py_UNICODE ch = *uni2;
6410 if (Py_UNICODE_ISSPACE(ch))
6411 *output++ = ' ';
6412 else {
6413 decimal = Py_UNICODE_TODECIMAL(ch);
6414 if (decimal >= 0)
6415 *output++ = '0' + decimal;
6416 else if (0 < ch && ch < 256)
6417 *output++ = (char)ch;
6418 else {
6419 Py_DECREF(repunicode);
6420 raise_encode_exception(&exc, encoding,
6421 s, length, collstart-s, collend-s, reason);
6422 goto onError;
6423 }
6424 }
6425 }
6426 p = s + newpos;
6427 Py_DECREF(repunicode);
6428 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006429 }
6430 /* 0-terminate the output string */
6431 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 Py_XDECREF(exc);
6433 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006434 return 0;
6435
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 Py_XDECREF(exc);
6438 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006439 return -1;
6440}
6441
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442/* --- Helpers ------------------------------------------------------------ */
6443
Eric Smith8c663262007-08-25 02:26:07 +00006444#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006445#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006446
Thomas Wouters477c8d52006-05-27 19:21:47 +00006447#include "stringlib/count.h"
6448#include "stringlib/find.h"
6449#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006450#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006451
Eric Smith5807c412008-05-11 21:00:57 +00006452#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006453#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006454#include "stringlib/localeutil.h"
6455
/* Helper macro to fix up start/end slice values against a length len:
   end is clipped to len; a negative start or end has len added once and
   is clamped to 0 if it is still negative.
   start and end must be modifiable lvalues, and every argument is
   evaluated more than once.  The macro expands to a single statement, so
   it needs a trailing semicolon and is safe in an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006470
Alexander Belopolsky40018472011-02-26 01:02:56 +00006471Py_ssize_t
6472PyUnicode_Count(PyObject *str,
6473 PyObject *substr,
6474 Py_ssize_t start,
6475 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006477 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478 PyUnicodeObject* str_obj;
6479 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006480
Thomas Wouters477c8d52006-05-27 19:21:47 +00006481 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6482 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006484 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6485 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 Py_DECREF(str_obj);
6487 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 }
Tim Petersced69f82003-09-16 20:30:58 +00006489
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006490 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006491 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006492 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6493 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006494 );
6495
6496 Py_DECREF(sub_obj);
6497 Py_DECREF(str_obj);
6498
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 return result;
6500}
6501
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502Py_ssize_t
6503PyUnicode_Find(PyObject *str,
6504 PyObject *sub,
6505 Py_ssize_t start,
6506 Py_ssize_t end,
6507 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006509 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006510
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006514 sub = PyUnicode_FromObject(sub);
6515 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 Py_DECREF(str);
6517 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518 }
Tim Petersced69f82003-09-16 20:30:58 +00006519
Thomas Wouters477c8d52006-05-27 19:21:47 +00006520 if (direction > 0)
6521 result = stringlib_find_slice(
6522 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6523 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6524 start, end
6525 );
6526 else
6527 result = stringlib_rfind_slice(
6528 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6529 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6530 start, end
6531 );
6532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006534 Py_DECREF(sub);
6535
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 return result;
6537}
6538
Alexander Belopolsky40018472011-02-26 01:02:56 +00006539static int
6540tailmatch(PyUnicodeObject *self,
6541 PyUnicodeObject *substring,
6542 Py_ssize_t start,
6543 Py_ssize_t end,
6544 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 if (substring->length == 0)
6547 return 1;
6548
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006549 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 end -= substring->length;
6551 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553
6554 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 if (Py_UNICODE_MATCH(self, end, substring))
6556 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 } else {
6558 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 }
6561
6562 return 0;
6563}
6564
Alexander Belopolsky40018472011-02-26 01:02:56 +00006565Py_ssize_t
6566PyUnicode_Tailmatch(PyObject *str,
6567 PyObject *substr,
6568 Py_ssize_t start,
6569 Py_ssize_t end,
6570 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006572 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006573
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 str = PyUnicode_FromObject(str);
6575 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 substr = PyUnicode_FromObject(substr);
6578 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 Py_DECREF(str);
6580 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 }
Tim Petersced69f82003-09-16 20:30:58 +00006582
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 (PyUnicodeObject *)substr,
6585 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 Py_DECREF(str);
6587 Py_DECREF(substr);
6588 return result;
6589}
6590
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591/* Apply fixfct filter to the Unicode object self and return a
6592 reference to the modified object */
6593
Alexander Belopolsky40018472011-02-26 01:02:56 +00006594static PyObject *
6595fixup(PyUnicodeObject *self,
6596 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597{
6598
6599 PyUnicodeObject *u;
6600
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006601 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006604
6605 Py_UNICODE_COPY(u->str, self->str, self->length);
6606
Tim Peters7a29bd52001-09-12 03:03:31 +00006607 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 /* fixfct should return TRUE if it modified the buffer. If
6609 FALSE, return a reference to the original buffer instead
6610 (to save space, not time) */
6611 Py_INCREF(self);
6612 Py_DECREF(u);
6613 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 }
6615 return (PyObject*) u;
6616}
6617
Alexander Belopolsky40018472011-02-26 01:02:56 +00006618static int
6619fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006621 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 Py_UNICODE *s = self->str;
6623 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006624
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006627
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 ch = Py_UNICODE_TOUPPER(*s);
6629 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 *s = ch;
6632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 s++;
6634 }
6635
6636 return status;
6637}
6638
Alexander Belopolsky40018472011-02-26 01:02:56 +00006639static int
6640fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006642 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 Py_UNICODE *s = self->str;
6644 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006645
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006648
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 ch = Py_UNICODE_TOLOWER(*s);
6650 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 *s = ch;
6653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 s++;
6655 }
6656
6657 return status;
6658}
6659
Alexander Belopolsky40018472011-02-26 01:02:56 +00006660static int
6661fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006663 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 Py_UNICODE *s = self->str;
6665 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006666
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 while (len-- > 0) {
6668 if (Py_UNICODE_ISUPPER(*s)) {
6669 *s = Py_UNICODE_TOLOWER(*s);
6670 status = 1;
6671 } else if (Py_UNICODE_ISLOWER(*s)) {
6672 *s = Py_UNICODE_TOUPPER(*s);
6673 status = 1;
6674 }
6675 s++;
6676 }
6677
6678 return status;
6679}
6680
Alexander Belopolsky40018472011-02-26 01:02:56 +00006681static int
6682fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006684 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006685 Py_UNICODE *s = self->str;
6686 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006687
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006688 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006690 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 *s = Py_UNICODE_TOUPPER(*s);
6692 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006694 s++;
6695 while (--len > 0) {
6696 if (Py_UNICODE_ISUPPER(*s)) {
6697 *s = Py_UNICODE_TOLOWER(*s);
6698 status = 1;
6699 }
6700 s++;
6701 }
6702 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703}
6704
Alexander Belopolsky40018472011-02-26 01:02:56 +00006705static int
6706fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707{
6708 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6709 register Py_UNICODE *e;
6710 int previous_is_cased;
6711
6712 /* Shortcut for single character strings */
6713 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6715 if (*p != ch) {
6716 *p = ch;
6717 return 1;
6718 }
6719 else
6720 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 }
Tim Petersced69f82003-09-16 20:30:58 +00006722
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 e = p + PyUnicode_GET_SIZE(self);
6724 previous_is_cased = 0;
6725 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006727
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 if (previous_is_cased)
6729 *p = Py_UNICODE_TOLOWER(ch);
6730 else
6731 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006732
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 if (Py_UNICODE_ISLOWER(ch) ||
6734 Py_UNICODE_ISUPPER(ch) ||
6735 Py_UNICODE_ISTITLE(ch))
6736 previous_is_cased = 1;
6737 else
6738 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 }
6740 return 1;
6741}
6742
Tim Peters8ce9f162004-08-27 01:49:32 +00006743PyObject *
6744PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745{
Skip Montanaro6543b452004-09-16 03:28:13 +00006746 const Py_UNICODE blank = ' ';
6747 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006748 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006749 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006750 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6751 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006752 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6753 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006754 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006755 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756
Tim Peters05eba1f2004-08-27 21:32:02 +00006757 fseq = PySequence_Fast(seq, "");
6758 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006759 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006760 }
6761
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006762 /* NOTE: the following code can't call back into Python code,
6763 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006764 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006765
Tim Peters05eba1f2004-08-27 21:32:02 +00006766 seqlen = PySequence_Fast_GET_SIZE(fseq);
6767 /* If empty sequence, return u"". */
6768 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006769 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6770 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006771 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006772 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006773 /* If singleton sequence with an exact Unicode, return that. */
6774 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 item = items[0];
6776 if (PyUnicode_CheckExact(item)) {
6777 Py_INCREF(item);
6778 res = (PyUnicodeObject *)item;
6779 goto Done;
6780 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006781 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006782 else {
6783 /* Set up sep and seplen */
6784 if (separator == NULL) {
6785 sep = &blank;
6786 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006787 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006788 else {
6789 if (!PyUnicode_Check(separator)) {
6790 PyErr_Format(PyExc_TypeError,
6791 "separator: expected str instance,"
6792 " %.80s found",
6793 Py_TYPE(separator)->tp_name);
6794 goto onError;
6795 }
6796 sep = PyUnicode_AS_UNICODE(separator);
6797 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006798 }
6799 }
6800
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006801 /* There are at least two things to join, or else we have a subclass
6802 * of str in the sequence.
6803 * Do a pre-pass to figure out the total amount of space we'll
6804 * need (sz), and see whether all argument are strings.
6805 */
6806 sz = 0;
6807 for (i = 0; i < seqlen; i++) {
6808 const Py_ssize_t old_sz = sz;
6809 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 if (!PyUnicode_Check(item)) {
6811 PyErr_Format(PyExc_TypeError,
6812 "sequence item %zd: expected str instance,"
6813 " %.80s found",
6814 i, Py_TYPE(item)->tp_name);
6815 goto onError;
6816 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006817 sz += PyUnicode_GET_SIZE(item);
6818 if (i != 0)
6819 sz += seplen;
6820 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6821 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006823 goto onError;
6824 }
6825 }
Tim Petersced69f82003-09-16 20:30:58 +00006826
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006827 res = _PyUnicode_New(sz);
6828 if (res == NULL)
6829 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006830
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006831 /* Catenate everything. */
6832 res_p = PyUnicode_AS_UNICODE(res);
6833 for (i = 0; i < seqlen; ++i) {
6834 Py_ssize_t itemlen;
6835 item = items[i];
6836 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 /* Copy item, and maybe the separator. */
6838 if (i) {
6839 Py_UNICODE_COPY(res_p, sep, seplen);
6840 res_p += seplen;
6841 }
6842 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6843 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006844 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006845
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006847 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 return (PyObject *)res;
6849
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006851 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006852 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 return NULL;
6854}
6855
Alexander Belopolsky40018472011-02-26 01:02:56 +00006856static PyUnicodeObject *
6857pad(PyUnicodeObject *self,
6858 Py_ssize_t left,
6859 Py_ssize_t right,
6860 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861{
6862 PyUnicodeObject *u;
6863
6864 if (left < 0)
6865 left = 0;
6866 if (right < 0)
6867 right = 0;
6868
Tim Peters7a29bd52001-09-12 03:03:31 +00006869 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 Py_INCREF(self);
6871 return self;
6872 }
6873
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006874 if (left > PY_SSIZE_T_MAX - self->length ||
6875 right > PY_SSIZE_T_MAX - (left + self->length)) {
6876 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6877 return NULL;
6878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 u = _PyUnicode_New(left + self->length + right);
6880 if (u) {
6881 if (left)
6882 Py_UNICODE_FILL(u->str, fill, left);
6883 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6884 if (right)
6885 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6886 }
6887
6888 return u;
6889}
6890
Alexander Belopolsky40018472011-02-26 01:02:56 +00006891PyObject *
6892PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895
6896 string = PyUnicode_FromObject(string);
6897 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006900 list = stringlib_splitlines(
6901 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6902 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903
6904 Py_DECREF(string);
6905 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906}
6907
Alexander Belopolsky40018472011-02-26 01:02:56 +00006908static PyObject *
6909split(PyUnicodeObject *self,
6910 PyUnicodeObject *substring,
6911 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006914 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006917 return stringlib_split_whitespace(
6918 (PyObject*) self, self->str, self->length, maxcount
6919 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006921 return stringlib_split(
6922 (PyObject*) self, self->str, self->length,
6923 substring->str, substring->length,
6924 maxcount
6925 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926}
6927
Alexander Belopolsky40018472011-02-26 01:02:56 +00006928static PyObject *
6929rsplit(PyUnicodeObject *self,
6930 PyUnicodeObject *substring,
6931 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006932{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006933 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006934 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006935
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006936 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006937 return stringlib_rsplit_whitespace(
6938 (PyObject*) self, self->str, self->length, maxcount
6939 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006940
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006941 return stringlib_rsplit(
6942 (PyObject*) self, self->str, self->length,
6943 substring->str, substring->length,
6944 maxcount
6945 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006946}
6947
Alexander Belopolsky40018472011-02-26 01:02:56 +00006948static PyObject *
6949replace(PyUnicodeObject *self,
6950 PyUnicodeObject *str1,
6951 PyUnicodeObject *str2,
6952 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953{
6954 PyUnicodeObject *u;
6955
6956 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006958 else if (maxcount == 0 || self->length == 0)
6959 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
Thomas Wouters477c8d52006-05-27 19:21:47 +00006961 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006962 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006963 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006964 if (str1->length == 0)
6965 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006966 if (str1->length == 1) {
6967 /* replace characters */
6968 Py_UNICODE u1, u2;
6969 if (!findchar(self->str, self->length, str1->str[0]))
6970 goto nothing;
6971 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6972 if (!u)
6973 return NULL;
6974 Py_UNICODE_COPY(u->str, self->str, self->length);
6975 u1 = str1->str[0];
6976 u2 = str2->str[0];
6977 for (i = 0; i < u->length; i++)
6978 if (u->str[i] == u1) {
6979 if (--maxcount < 0)
6980 break;
6981 u->str[i] = u2;
6982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006984 i = stringlib_find(
6985 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006987 if (i < 0)
6988 goto nothing;
6989 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6990 if (!u)
6991 return NULL;
6992 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006993
6994 /* change everything in-place, starting with this one */
6995 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6996 i += str1->length;
6997
6998 while ( --maxcount > 0) {
6999 i = stringlib_find(self->str+i, self->length-i,
7000 str1->str, str1->length,
7001 i);
7002 if (i == -1)
7003 break;
7004 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7005 i += str1->length;
7006 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007007 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009
Brett Cannonb94767f2011-02-22 20:15:44 +00007010 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007011 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 Py_UNICODE *p;
7013
7014 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007015 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7016 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007017 if (n == 0)
7018 goto nothing;
7019 /* new_size = self->length + n * (str2->length - str1->length)); */
7020 delta = (str2->length - str1->length);
7021 if (delta == 0) {
7022 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007024 product = n * (str2->length - str1->length);
7025 if ((product / (str2->length - str1->length)) != n) {
7026 PyErr_SetString(PyExc_OverflowError,
7027 "replace string is too long");
7028 return NULL;
7029 }
7030 new_size = self->length + product;
7031 if (new_size < 0) {
7032 PyErr_SetString(PyExc_OverflowError,
7033 "replace string is too long");
7034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 }
7036 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007037 u = _PyUnicode_New(new_size);
7038 if (!u)
7039 return NULL;
7040 i = 0;
7041 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007042 if (str1->length > 0) {
7043 while (n-- > 0) {
7044 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007045 j = stringlib_find(self->str+i, self->length-i,
7046 str1->str, str1->length,
7047 i);
7048 if (j == -1)
7049 break;
7050 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007051 /* copy unchanged part [i:j] */
7052 Py_UNICODE_COPY(p, self->str+i, j-i);
7053 p += j - i;
7054 }
7055 /* copy substitution string */
7056 if (str2->length > 0) {
7057 Py_UNICODE_COPY(p, str2->str, str2->length);
7058 p += str2->length;
7059 }
7060 i = j + str1->length;
7061 }
7062 if (i < self->length)
7063 /* copy tail [i:] */
7064 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7065 } else {
7066 /* interleave */
7067 while (n > 0) {
7068 Py_UNICODE_COPY(p, str2->str, str2->length);
7069 p += str2->length;
7070 if (--n <= 0)
7071 break;
7072 *p++ = self->str[i++];
7073 }
7074 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007078
Benjamin Peterson29060642009-01-31 22:14:21 +00007079 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007080 /* nothing to replace; return original string (when possible) */
7081 if (PyUnicode_CheckExact(self)) {
7082 Py_INCREF(self);
7083 return (PyObject *) self;
7084 }
7085 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086}
7087
7088/* --- Unicode Object Methods --------------------------------------------- */
7089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007090PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092\n\
7093Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007094characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095
7096static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007097unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 return fixup(self, fixtitle);
7100}
7101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007102PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104\n\
7105Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007106have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107
7108static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007109unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111 return fixup(self, fixcapitalize);
7112}
7113
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self on whitespace, capitalize every word via
   fixup()/fixcapitalize, and join the words again with the default
   separator of PyUnicode_Join().  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                 fixcapitalize);
        if (capped == NULL)
            goto cleanup;       /* joined is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  cleanup:
    Py_DECREF(words);
    return joined;
}
#endif
7151
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007152/* Argument converter. Coerces to a single unicode character */
7153
7154static int
7155convert_uc(PyObject *obj, void *addr)
7156{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007157 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7158 PyObject *uniobj;
7159 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007160
Benjamin Peterson14339b62009-01-31 16:36:08 +00007161 uniobj = PyUnicode_FromObject(obj);
7162 if (uniobj == NULL) {
7163 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007165 return 0;
7166 }
7167 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7168 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007170 Py_DECREF(uniobj);
7171 return 0;
7172 }
7173 unistr = PyUnicode_AS_UNICODE(uniobj);
7174 *fillcharloc = unistr[0];
7175 Py_DECREF(uniobj);
7176 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007177}
7178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007179PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007182Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007183done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184
7185static PyObject *
7186unicode_center(PyUnicodeObject *self, PyObject *args)
7187{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007188 Py_ssize_t marg, left;
7189 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007190 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
Thomas Woutersde017742006-02-16 19:34:37 +00007192 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 return NULL;
7194
Tim Peters7a29bd52001-09-12 03:03:31 +00007195 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196 Py_INCREF(self);
7197 return (PyObject*) self;
7198 }
7199
7200 marg = width - self->length;
7201 left = marg / 2 + (marg & width & 1);
7202
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007203 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204}
7205
Marc-André Lemburge5034372000-08-08 08:04:29 +00007206#if 0
7207
7208/* This code should go into some future Unicode collation support
7209 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007210 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007211
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007212/* speedy UTF-16 code point order comparison */
7213/* gleaned from: */
7214/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7215
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007216static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007217{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007218 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007219 0, 0, 0, 0, 0, 0, 0, 0,
7220 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007221 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007222};
7223
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224static int
7225unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7226{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007227 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007228
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 Py_UNICODE *s1 = str1->str;
7230 Py_UNICODE *s2 = str2->str;
7231
7232 len1 = str1->length;
7233 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007234
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007236 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007237
7238 c1 = *s1++;
7239 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007240
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 if (c1 > (1<<11) * 26)
7242 c1 += utf16Fixup[c1>>11];
7243 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007244 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007245 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007246
7247 if (c1 != c2)
7248 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007249
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007250 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 }
7252
7253 return (len1 < len2) ? -1 : (len1 != len2);
7254}
7255
Marc-André Lemburge5034372000-08-08 08:04:29 +00007256#else
7257
7258static int
7259unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7260{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007261 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007262
7263 Py_UNICODE *s1 = str1->str;
7264 Py_UNICODE *s2 = str2->str;
7265
7266 len1 = str1->length;
7267 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007268
Marc-André Lemburge5034372000-08-08 08:04:29 +00007269 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007270 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007271
Fredrik Lundh45714e92001-06-26 16:39:36 +00007272 c1 = *s1++;
7273 c2 = *s2++;
7274
7275 if (c1 != c2)
7276 return (c1 < c2) ? -1 : 1;
7277
Marc-André Lemburge5034372000-08-08 08:04:29 +00007278 len1--; len2--;
7279 }
7280
7281 return (len1 < len2) ? -1 : (len1 != len2);
7282}
7283
7284#endif
7285
Alexander Belopolsky40018472011-02-26 01:02:56 +00007286int
7287PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007289 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7290 return unicode_compare((PyUnicodeObject *)left,
7291 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007292 PyErr_Format(PyExc_TypeError,
7293 "Can't compare %.100s and %.100s",
7294 left->ob_type->tp_name,
7295 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296 return -1;
7297}
7298
Martin v. Löwis5b222132007-06-10 09:51:05 +00007299int
7300PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7301{
7302 int i;
7303 Py_UNICODE *id;
7304 assert(PyUnicode_Check(uni));
7305 id = PyUnicode_AS_UNICODE(uni);
7306 /* Compare Unicode string and source character set string */
7307 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 if (id[i] != str[i])
7309 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007310 /* This check keeps Python strings that end in '\0' from comparing equal
7311 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007312 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007314 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007316 return 0;
7317}
7318
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007319
Benjamin Peterson29060642009-01-31 22:14:21 +00007320#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007321 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007322
Alexander Belopolsky40018472011-02-26 01:02:56 +00007323PyObject *
7324PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007325{
7326 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007327
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007328 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7329 PyObject *v;
7330 if (((PyUnicodeObject *) left)->length !=
7331 ((PyUnicodeObject *) right)->length) {
7332 if (op == Py_EQ) {
7333 Py_INCREF(Py_False);
7334 return Py_False;
7335 }
7336 if (op == Py_NE) {
7337 Py_INCREF(Py_True);
7338 return Py_True;
7339 }
7340 }
7341 if (left == right)
7342 result = 0;
7343 else
7344 result = unicode_compare((PyUnicodeObject *)left,
7345 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007346
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007347 /* Convert the return value to a Boolean */
7348 switch (op) {
7349 case Py_EQ:
7350 v = TEST_COND(result == 0);
7351 break;
7352 case Py_NE:
7353 v = TEST_COND(result != 0);
7354 break;
7355 case Py_LE:
7356 v = TEST_COND(result <= 0);
7357 break;
7358 case Py_GE:
7359 v = TEST_COND(result >= 0);
7360 break;
7361 case Py_LT:
7362 v = TEST_COND(result == -1);
7363 break;
7364 case Py_GT:
7365 v = TEST_COND(result == 1);
7366 break;
7367 default:
7368 PyErr_BadArgument();
7369 return NULL;
7370 }
7371 Py_INCREF(v);
7372 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007373 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007374
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007375 Py_INCREF(Py_NotImplemented);
7376 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007377}
7378
Alexander Belopolsky40018472011-02-26 01:02:56 +00007379int
7380PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007381{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007382 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007383 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007384
7385 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007386 sub = PyUnicode_FromObject(element);
7387 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 PyErr_Format(PyExc_TypeError,
7389 "'in <string>' requires string as left operand, not %s",
7390 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007391 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007392 }
7393
Thomas Wouters477c8d52006-05-27 19:21:47 +00007394 str = PyUnicode_FromObject(container);
7395 if (!str) {
7396 Py_DECREF(sub);
7397 return -1;
7398 }
7399
7400 result = stringlib_contains_obj(str, sub);
7401
7402 Py_DECREF(str);
7403 Py_DECREF(sub);
7404
Guido van Rossum403d68b2000-03-13 15:55:09 +00007405 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007406}
7407
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408/* Concat to string or Unicode object giving a new Unicode object. */
7409
Alexander Belopolsky40018472011-02-26 01:02:56 +00007410PyObject *
7411PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412{
7413 PyUnicodeObject *u = NULL, *v = NULL, *w;
7414
7415 /* Coerce the two arguments */
7416 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7417 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7420 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422
7423 /* Shortcuts */
7424 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 Py_DECREF(v);
7426 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427 }
7428 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 Py_DECREF(u);
7430 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 }
7432
7433 /* Concat the two Unicode strings */
7434 w = _PyUnicode_New(u->length + v->length);
7435 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 Py_UNICODE_COPY(w->str, u->str, u->length);
7438 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7439
7440 Py_DECREF(u);
7441 Py_DECREF(v);
7442 return (PyObject *)w;
7443
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 Py_XDECREF(u);
7446 Py_XDECREF(v);
7447 return NULL;
7448}
7449
Walter Dörwald1ab83302007-05-18 17:15:44 +00007450void
7451PyUnicode_Append(PyObject **pleft, PyObject *right)
7452{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007453 PyObject *new;
7454 if (*pleft == NULL)
7455 return;
7456 if (right == NULL || !PyUnicode_Check(*pleft)) {
7457 Py_DECREF(*pleft);
7458 *pleft = NULL;
7459 return;
7460 }
7461 new = PyUnicode_Concat(*pleft, right);
7462 Py_DECREF(*pleft);
7463 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007464}
7465
7466void
7467PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7468{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007469 PyUnicode_Append(pleft, right);
7470 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007471}
7472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007473PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007476Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007477string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007478interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
7480static PyObject *
7481unicode_count(PyUnicodeObject *self, PyObject *args)
7482{
7483 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007484 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007485 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 PyObject *result;
7487
Guido van Rossumb8872e62000-05-09 14:14:27 +00007488 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 return NULL;
7491
7492 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007493 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007496
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007497 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007498 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007499 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007500 substring->str, substring->length,
7501 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007502 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503
7504 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007505
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 return result;
7507}
7508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007509PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007510 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007512Encode S using the codec registered for encoding. Default encoding\n\
7513is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007514handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007515a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7516'xmlcharrefreplace' as well as any other name registered with\n\
7517codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518
7519static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007520unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007522 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523 char *encoding = NULL;
7524 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007525
Benjamin Peterson308d6372009-09-18 21:42:35 +00007526 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7527 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007529 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007530}
7531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007532PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534\n\
7535Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007536If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537
7538static PyObject*
7539unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7540{
7541 Py_UNICODE *e;
7542 Py_UNICODE *p;
7543 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007544 Py_UNICODE *qe;
7545 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 PyUnicodeObject *u;
7547 int tabsize = 8;
7548
7549 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551
Thomas Wouters7e474022000-07-16 12:04:32 +00007552 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007553 i = 0; /* chars up to and including most recent \n or \r */
7554 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7555 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 for (p = self->str; p < e; p++)
7557 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 if (tabsize > 0) {
7559 incr = tabsize - (j % tabsize); /* cannot overflow */
7560 if (j > PY_SSIZE_T_MAX - incr)
7561 goto overflow1;
7562 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007563 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 if (j > PY_SSIZE_T_MAX - 1)
7567 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 j++;
7569 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 if (i > PY_SSIZE_T_MAX - j)
7571 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007573 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 }
7575 }
7576
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007577 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007579
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 /* Second pass: create output string and fill it */
7581 u = _PyUnicode_New(i + j);
7582 if (!u)
7583 return NULL;
7584
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007585 j = 0; /* same as in first pass */
7586 q = u->str; /* next output char */
7587 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
7589 for (p = self->str; p < e; p++)
7590 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 if (tabsize > 0) {
7592 i = tabsize - (j % tabsize);
7593 j += i;
7594 while (i--) {
7595 if (q >= qe)
7596 goto overflow2;
7597 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007598 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 else {
7602 if (q >= qe)
7603 goto overflow2;
7604 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007605 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 if (*p == '\n' || *p == '\r')
7607 j = 0;
7608 }
7609
7610 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007611
7612 overflow2:
7613 Py_DECREF(u);
7614 overflow1:
7615 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617}
7618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007619PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621\n\
7622Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007623such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624arguments start and end are interpreted as in slice notation.\n\
7625\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007626Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
7628static PyObject *
7629unicode_find(PyUnicodeObject *self, PyObject *args)
7630{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007631 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007632 Py_ssize_t start;
7633 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007634 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635
Christian Heimes9cd17752007-11-18 19:35:23 +00007636 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638
Thomas Wouters477c8d52006-05-27 19:21:47 +00007639 result = stringlib_find_slice(
7640 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7641 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7642 start, end
7643 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644
7645 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007646
Christian Heimes217cfd12007-12-02 14:31:20 +00007647 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648}
7649
7650static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007651unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652{
7653 if (index < 0 || index >= self->length) {
7654 PyErr_SetString(PyExc_IndexError, "string index out of range");
7655 return NULL;
7656 }
7657
7658 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7659}
7660
Guido van Rossumc2504932007-09-18 19:42:40 +00007661/* Believe it or not, this produces the same value for ASCII strings
7662 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007663static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007664unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665{
Guido van Rossumc2504932007-09-18 19:42:40 +00007666 Py_ssize_t len;
7667 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007668 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007669
7670 if (self->hash != -1)
7671 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007672 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007673 p = self->str;
7674 x = *p << 7;
7675 while (--len >= 0)
7676 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007677 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007678 if (x == -1)
7679 x = -2;
7680 self->hash = x;
7681 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682}
7683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007684PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007687Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688
7689static PyObject *
7690unicode_index(PyUnicodeObject *self, PyObject *args)
7691{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007692 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007693 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007694 Py_ssize_t start;
7695 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696
Christian Heimes9cd17752007-11-18 19:35:23 +00007697 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699
Thomas Wouters477c8d52006-05-27 19:21:47 +00007700 result = stringlib_find_slice(
7701 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7702 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7703 start, end
7704 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705
7706 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007707
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 if (result < 0) {
7709 PyErr_SetString(PyExc_ValueError, "substring not found");
7710 return NULL;
7711 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007712
Christian Heimes217cfd12007-12-02 14:31:20 +00007713 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714}
7715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007716PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007719Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007720at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721
7722static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007723unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724{
7725 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7726 register const Py_UNICODE *e;
7727 int cased;
7728
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 /* Shortcut for single character strings */
7730 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007733 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007734 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007736
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 e = p + PyUnicode_GET_SIZE(self);
7738 cased = 0;
7739 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007741
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7743 return PyBool_FromLong(0);
7744 else if (!cased && Py_UNICODE_ISLOWER(ch))
7745 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007747 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748}
7749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007750PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007753Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007754at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007757unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758{
7759 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7760 register const Py_UNICODE *e;
7761 int cased;
7762
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 /* Shortcut for single character strings */
7764 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007767 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007768 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007770
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 e = p + PyUnicode_GET_SIZE(self);
7772 cased = 0;
7773 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007775
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7777 return PyBool_FromLong(0);
7778 else if (!cased && Py_UNICODE_ISUPPER(ch))
7779 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007781 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782}
7783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007784PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007787Return True if S is a titlecased string and there is at least one\n\
7788character in S, i.e. upper- and titlecase characters may only\n\
7789follow uncased characters and lowercase characters only cased ones.\n\
7790Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791
7792static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007793unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794{
7795 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7796 register const Py_UNICODE *e;
7797 int cased, previous_is_cased;
7798
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 /* Shortcut for single character strings */
7800 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7802 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007804 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007805 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007807
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 e = p + PyUnicode_GET_SIZE(self);
7809 cased = 0;
7810 previous_is_cased = 0;
7811 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007813
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7815 if (previous_is_cased)
7816 return PyBool_FromLong(0);
7817 previous_is_cased = 1;
7818 cased = 1;
7819 }
7820 else if (Py_UNICODE_ISLOWER(ch)) {
7821 if (!previous_is_cased)
7822 return PyBool_FromLong(0);
7823 previous_is_cased = 1;
7824 cased = 1;
7825 }
7826 else
7827 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007829 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830}
7831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007832PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007835Return True if all characters in S are whitespace\n\
7836and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837
7838static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007839unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840{
7841 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7842 register const Py_UNICODE *e;
7843
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 /* Shortcut for single character strings */
7845 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 Py_UNICODE_ISSPACE(*p))
7847 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007849 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007850 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007852
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 e = p + PyUnicode_GET_SIZE(self);
7854 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 if (!Py_UNICODE_ISSPACE(*p))
7856 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007858 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859}
7860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007861PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007863\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007864Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007865and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007866
7867static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007868unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007869{
7870 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7871 register const Py_UNICODE *e;
7872
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007873 /* Shortcut for single character strings */
7874 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 Py_UNICODE_ISALPHA(*p))
7876 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007877
7878 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007879 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007881
7882 e = p + PyUnicode_GET_SIZE(self);
7883 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 if (!Py_UNICODE_ISALPHA(*p))
7885 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007886 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007887 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007888}
7889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007890PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007892\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007893Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007894and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007895
7896static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007897unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007898{
7899 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7900 register const Py_UNICODE *e;
7901
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007902 /* Shortcut for single character strings */
7903 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 Py_UNICODE_ISALNUM(*p))
7905 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007906
7907 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007908 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007910
7911 e = p + PyUnicode_GET_SIZE(self);
7912 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 if (!Py_UNICODE_ISALNUM(*p))
7914 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007915 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007916 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007917}
7918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007919PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007922Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007923False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924
7925static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007926unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927{
7928 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7929 register const Py_UNICODE *e;
7930
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931 /* Shortcut for single character strings */
7932 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 Py_UNICODE_ISDECIMAL(*p))
7934 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007936 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007937 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007939
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 e = p + PyUnicode_GET_SIZE(self);
7941 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 if (!Py_UNICODE_ISDECIMAL(*p))
7943 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007945 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946}
7947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007948PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007951Return True if all characters in S are digits\n\
7952and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953
7954static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007955unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956{
7957 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7958 register const Py_UNICODE *e;
7959
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 /* Shortcut for single character strings */
7961 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 Py_UNICODE_ISDIGIT(*p))
7963 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007965 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007966 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007968
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 e = p + PyUnicode_GET_SIZE(self);
7970 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 if (!Py_UNICODE_ISDIGIT(*p))
7972 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007974 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975}
7976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007977PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007980Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007981False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982
7983static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007984unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985{
7986 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7987 register const Py_UNICODE *e;
7988
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 /* Shortcut for single character strings */
7990 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 Py_UNICODE_ISNUMERIC(*p))
7992 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007994 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007995 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007997
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 e = p + PyUnicode_GET_SIZE(self);
7999 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 if (!Py_UNICODE_ISNUMERIC(*p))
8001 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008003 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004}
8005
Martin v. Löwis47383402007-08-15 07:32:56 +00008006int
8007PyUnicode_IsIdentifier(PyObject *self)
8008{
8009 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8010 register const Py_UNICODE *e;
8011
8012 /* Special case for empty strings */
8013 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008015
8016 /* PEP 3131 says that the first character must be in
8017 XID_Start and subsequent characters in XID_Continue,
8018 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008020 letters, digits, underscore). However, given the current
8021 definition of XID_Start and XID_Continue, it is sufficient
8022 to check just for these, except that _ must be allowed
8023 as starting an identifier. */
8024 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8025 return 0;
8026
8027 e = p + PyUnicode_GET_SIZE(self);
8028 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 if (!_PyUnicode_IsXidContinue(*p))
8030 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008031 }
8032 return 1;
8033}
8034
8035PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008037\n\
8038Return True if S is a valid identifier according\n\
8039to the language definition.");
8040
8041static PyObject*
8042unicode_isidentifier(PyObject *self)
8043{
8044 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8045}
8046
Georg Brandl559e5d72008-06-11 18:37:52 +00008047PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008049\n\
8050Return True if all characters in S are considered\n\
8051printable in repr() or S is empty, False otherwise.");
8052
8053static PyObject*
8054unicode_isprintable(PyObject *self)
8055{
8056 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8057 register const Py_UNICODE *e;
8058
8059 /* Shortcut for single character strings */
8060 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8061 Py_RETURN_TRUE;
8062 }
8063
8064 e = p + PyUnicode_GET_SIZE(self);
8065 for (; p < e; p++) {
8066 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8067 Py_RETURN_FALSE;
8068 }
8069 }
8070 Py_RETURN_TRUE;
8071}
8072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008073PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008074 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075\n\
8076Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008077iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078
8079static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008080unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008082 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083}
8084
Martin v. Löwis18e16552006-02-15 17:27:45 +00008085static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086unicode_length(PyUnicodeObject *self)
8087{
8088 return self->length;
8089}
8090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008091PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008094Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008095done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096
8097static PyObject *
8098unicode_ljust(PyUnicodeObject *self, PyObject *args)
8099{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008100 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008101 Py_UNICODE fillchar = ' ';
8102
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008103 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 return NULL;
8105
Tim Peters7a29bd52001-09-12 03:03:31 +00008106 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 Py_INCREF(self);
8108 return (PyObject*) self;
8109 }
8110
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008111 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112}
8113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008114PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008117Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
8119static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008120unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122 return fixup(self, fixlower);
8123}
8124
/* Strip modes: which end(s) of the string to trim.  The values double as
   indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above.  Both the
   strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8133
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008134/* externally visible for str.strip(unicode) */
8135PyObject *
8136_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8137{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008138 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8139 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8140 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8141 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8142 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008143
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008145
Benjamin Peterson14339b62009-01-31 16:36:08 +00008146 i = 0;
8147 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8149 i++;
8150 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008151 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008152
Benjamin Peterson14339b62009-01-31 16:36:08 +00008153 j = len;
8154 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 do {
8156 j--;
8157 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8158 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008159 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008160
Benjamin Peterson14339b62009-01-31 16:36:08 +00008161 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 Py_INCREF(self);
8163 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 }
8165 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008167}
8168
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
8170static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008171do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8174 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008175
Benjamin Peterson14339b62009-01-31 16:36:08 +00008176 i = 0;
8177 if (striptype != RIGHTSTRIP) {
8178 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8179 i++;
8180 }
8181 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008182
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 j = len;
8184 if (striptype != LEFTSTRIP) {
8185 do {
8186 j--;
8187 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8188 j++;
8189 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008190
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8192 Py_INCREF(self);
8193 return (PyObject*)self;
8194 }
8195 else
8196 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197}
8198
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008199
8200static PyObject *
8201do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8202{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008204
Benjamin Peterson14339b62009-01-31 16:36:08 +00008205 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8206 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008207
Benjamin Peterson14339b62009-01-31 16:36:08 +00008208 if (sep != NULL && sep != Py_None) {
8209 if (PyUnicode_Check(sep))
8210 return _PyUnicode_XStrip(self, striptype, sep);
8211 else {
8212 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 "%s arg must be None or str",
8214 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008215 return NULL;
8216 }
8217 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008218
Benjamin Peterson14339b62009-01-31 16:36:08 +00008219 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008220}
8221
8222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008223PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008225\n\
8226Return a copy of the string S with leading and trailing\n\
8227whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008228If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008229
8230static PyObject *
8231unicode_strip(PyUnicodeObject *self, PyObject *args)
8232{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008233 if (PyTuple_GET_SIZE(args) == 0)
8234 return do_strip(self, BOTHSTRIP); /* Common case */
8235 else
8236 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008237}
8238
8239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008240PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008242\n\
8243Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008244If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008245
8246static PyObject *
8247unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8248{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008249 if (PyTuple_GET_SIZE(args) == 0)
8250 return do_strip(self, LEFTSTRIP); /* Common case */
8251 else
8252 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008253}
8254
8255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008256PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008258\n\
8259Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008260If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008261
8262static PyObject *
8263unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8264{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008265 if (PyTuple_GET_SIZE(args) == 0)
8266 return do_strip(self, RIGHTSTRIP); /* Common case */
8267 else
8268 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008269}
8270
8271
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008273unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274{
8275 PyUnicodeObject *u;
8276 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008277 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008278 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279
Georg Brandl222de0f2009-04-12 12:01:50 +00008280 if (len < 1) {
8281 Py_INCREF(unicode_empty);
8282 return (PyObject *)unicode_empty;
8283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284
Tim Peters7a29bd52001-09-12 03:03:31 +00008285 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 /* no repeat, return original string */
8287 Py_INCREF(str);
8288 return (PyObject*) str;
8289 }
Tim Peters8f422462000-09-09 06:13:41 +00008290
8291 /* ensure # of chars needed doesn't overflow int and # of bytes
8292 * needed doesn't overflow size_t
8293 */
8294 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008295 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008296 PyErr_SetString(PyExc_OverflowError,
8297 "repeated string is too long");
8298 return NULL;
8299 }
8300 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8301 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8302 PyErr_SetString(PyExc_OverflowError,
8303 "repeated string is too long");
8304 return NULL;
8305 }
8306 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 if (!u)
8308 return NULL;
8309
8310 p = u->str;
8311
Georg Brandl222de0f2009-04-12 12:01:50 +00008312 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008313 Py_UNICODE_FILL(p, str->str[0], len);
8314 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008315 Py_ssize_t done = str->length; /* number of characters copied this far */
8316 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008318 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008319 Py_UNICODE_COPY(p+done, p, n);
8320 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 }
8323
8324 return (PyObject*) u;
8325}
8326
Alexander Belopolsky40018472011-02-26 01:02:56 +00008327PyObject *
8328PyUnicode_Replace(PyObject *obj,
8329 PyObject *subobj,
8330 PyObject *replobj,
8331 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332{
8333 PyObject *self;
8334 PyObject *str1;
8335 PyObject *str2;
8336 PyObject *result;
8337
8338 self = PyUnicode_FromObject(obj);
8339 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 str1 = PyUnicode_FromObject(subobj);
8342 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 Py_DECREF(self);
8344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 }
8346 str2 = PyUnicode_FromObject(replobj);
8347 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 Py_DECREF(self);
8349 Py_DECREF(str1);
8350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 }
Tim Petersced69f82003-09-16 20:30:58 +00008352 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 (PyUnicodeObject *)str1,
8354 (PyUnicodeObject *)str2,
8355 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 Py_DECREF(self);
8357 Py_DECREF(str1);
8358 Py_DECREF(str2);
8359 return result;
8360}
8361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008362PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008363 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364\n\
8365Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008366old replaced by new. If the optional argument count is\n\
8367given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368
8369static PyObject*
8370unicode_replace(PyUnicodeObject *self, PyObject *args)
8371{
8372 PyUnicodeObject *str1;
8373 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008374 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 PyObject *result;
8376
Martin v. Löwis18e16552006-02-15 17:27:45 +00008377 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 return NULL;
8379 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8380 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008383 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 Py_DECREF(str1);
8385 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387
8388 result = replace(self, str1, str2, maxcount);
8389
8390 Py_DECREF(str1);
8391 Py_DECREF(str2);
8392 return result;
8393}
8394
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395static PyObject *
8396unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008398 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008399 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008400 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8401 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8402
8403 /* XXX(nnorwitz): rather than over-allocating, it would be
8404 better to choose a different scheme. Perhaps scan the
8405 first N-chars of the string and allocate based on that size.
8406 */
8407 /* Initial allocation is based on the longest-possible unichr
8408 escape.
8409
8410 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8411 unichr, so in this case it's the longest unichr escape. In
8412 narrow (UTF-16) builds this is five chars per source unichr
8413 since there are two unichrs in the surrogate pair, so in narrow
8414 (UTF-16) builds it's not the longest unichr escape.
8415
8416 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8417 so in the narrow (UTF-16) build case it's the longest unichr
8418 escape.
8419 */
8420
Walter Dörwald1ab83302007-05-18 17:15:44 +00008421 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008423#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008425#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008427#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008429 if (repr == NULL)
8430 return NULL;
8431
Walter Dörwald1ab83302007-05-18 17:15:44 +00008432 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008433
8434 /* Add quote */
8435 *p++ = (findchar(s, size, '\'') &&
8436 !findchar(s, size, '"')) ? '"' : '\'';
8437 while (size-- > 0) {
8438 Py_UNICODE ch = *s++;
8439
8440 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008441 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008442 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008443 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008444 continue;
8445 }
8446
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008448 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008449 *p++ = '\\';
8450 *p++ = 't';
8451 }
8452 else if (ch == '\n') {
8453 *p++ = '\\';
8454 *p++ = 'n';
8455 }
8456 else if (ch == '\r') {
8457 *p++ = '\\';
8458 *p++ = 'r';
8459 }
8460
8461 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008462 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008463 *p++ = '\\';
8464 *p++ = 'x';
8465 *p++ = hexdigits[(ch >> 4) & 0x000F];
8466 *p++ = hexdigits[ch & 0x000F];
8467 }
8468
Georg Brandl559e5d72008-06-11 18:37:52 +00008469 /* Copy ASCII characters as-is */
8470 else if (ch < 0x7F) {
8471 *p++ = ch;
8472 }
8473
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008475 else {
8476 Py_UCS4 ucs = ch;
8477
8478#ifndef Py_UNICODE_WIDE
8479 Py_UNICODE ch2 = 0;
8480 /* Get code point from surrogate pair */
8481 if (size > 0) {
8482 ch2 = *s;
8483 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008488 size--;
8489 }
8490 }
8491#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008492 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008493 (categories Z* and C* except ASCII space)
8494 */
8495 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8496 /* Map 8-bit characters to '\xhh' */
8497 if (ucs <= 0xff) {
8498 *p++ = '\\';
8499 *p++ = 'x';
8500 *p++ = hexdigits[(ch >> 4) & 0x000F];
8501 *p++ = hexdigits[ch & 0x000F];
8502 }
8503 /* Map 21-bit characters to '\U00xxxxxx' */
8504 else if (ucs >= 0x10000) {
8505 *p++ = '\\';
8506 *p++ = 'U';
8507 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8508 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8509 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8510 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8511 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8512 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8513 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8514 *p++ = hexdigits[ucs & 0x0000000F];
8515 }
8516 /* Map 16-bit characters to '\uxxxx' */
8517 else {
8518 *p++ = '\\';
8519 *p++ = 'u';
8520 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8521 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8522 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8523 *p++ = hexdigits[ucs & 0x000F];
8524 }
8525 }
8526 /* Copy characters as-is */
8527 else {
8528 *p++ = ch;
8529#ifndef Py_UNICODE_WIDE
8530 if (ucs >= 0x10000)
8531 *p++ = ch2;
8532#endif
8533 }
8534 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008535 }
8536 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008537 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008538
8539 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008540 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008541 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542}
8543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008544PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546\n\
8547Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008548such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549arguments start and end are interpreted as in slice notation.\n\
8550\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008551Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
8553static PyObject *
8554unicode_rfind(PyUnicodeObject *self, PyObject *args)
8555{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008556 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008557 Py_ssize_t start;
8558 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008559 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560
Christian Heimes9cd17752007-11-18 19:35:23 +00008561 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563
Thomas Wouters477c8d52006-05-27 19:21:47 +00008564 result = stringlib_rfind_slice(
8565 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8566 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8567 start, end
8568 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569
8570 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008571
Christian Heimes217cfd12007-12-02 14:31:20 +00008572 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573}
8574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008575PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008578Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579
8580static PyObject *
8581unicode_rindex(PyUnicodeObject *self, PyObject *args)
8582{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008583 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008584 Py_ssize_t start;
8585 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008586 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587
Christian Heimes9cd17752007-11-18 19:35:23 +00008588 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590
Thomas Wouters477c8d52006-05-27 19:21:47 +00008591 result = stringlib_rfind_slice(
8592 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8593 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8594 start, end
8595 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
8597 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008598
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 if (result < 0) {
8600 PyErr_SetString(PyExc_ValueError, "substring not found");
8601 return NULL;
8602 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008603 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604}
8605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008606PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008609Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008610done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
8612static PyObject *
8613unicode_rjust(PyUnicodeObject *self, PyObject *args)
8614{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008615 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008616 Py_UNICODE fillchar = ' ';
8617
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008618 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 return NULL;
8620
Tim Peters7a29bd52001-09-12 03:03:31 +00008621 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 Py_INCREF(self);
8623 return (PyObject*) self;
8624 }
8625
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008626 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627}
8628
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629PyObject *
8630PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631{
8632 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008633
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 s = PyUnicode_FromObject(s);
8635 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008636 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 if (sep != NULL) {
8638 sep = PyUnicode_FromObject(sep);
8639 if (sep == NULL) {
8640 Py_DECREF(s);
8641 return NULL;
8642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 }
8644
8645 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8646
8647 Py_DECREF(s);
8648 Py_XDECREF(sep);
8649 return result;
8650}
8651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008652PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654\n\
8655Return a list of the words in S, using sep as the\n\
8656delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008657splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008658whitespace string is a separator and empty strings are\n\
8659removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660
8661static PyObject*
8662unicode_split(PyUnicodeObject *self, PyObject *args)
8663{
8664 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008665 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666
Martin v. Löwis18e16552006-02-15 17:27:45 +00008667 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 return NULL;
8669
8670 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676}
8677
Thomas Wouters477c8d52006-05-27 19:21:47 +00008678PyObject *
8679PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8680{
8681 PyObject* str_obj;
8682 PyObject* sep_obj;
8683 PyObject* out;
8684
8685 str_obj = PyUnicode_FromObject(str_in);
8686 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008688 sep_obj = PyUnicode_FromObject(sep_in);
8689 if (!sep_obj) {
8690 Py_DECREF(str_obj);
8691 return NULL;
8692 }
8693
8694 out = stringlib_partition(
8695 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8696 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8697 );
8698
8699 Py_DECREF(sep_obj);
8700 Py_DECREF(str_obj);
8701
8702 return out;
8703}
8704
8705
8706PyObject *
8707PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8708{
8709 PyObject* str_obj;
8710 PyObject* sep_obj;
8711 PyObject* out;
8712
8713 str_obj = PyUnicode_FromObject(str_in);
8714 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008716 sep_obj = PyUnicode_FromObject(sep_in);
8717 if (!sep_obj) {
8718 Py_DECREF(str_obj);
8719 return NULL;
8720 }
8721
8722 out = stringlib_rpartition(
8723 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8724 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8725 );
8726
8727 Py_DECREF(sep_obj);
8728 Py_DECREF(str_obj);
8729
8730 return out;
8731}
8732
8733PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008735\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008736Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008737the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008738found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008739
8740static PyObject*
8741unicode_partition(PyUnicodeObject *self, PyObject *separator)
8742{
8743 return PyUnicode_Partition((PyObject *)self, separator);
8744}
8745
8746PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008747 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008748\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008749Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008750the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008751separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008752
8753static PyObject*
8754unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8755{
8756 return PyUnicode_RPartition((PyObject *)self, separator);
8757}
8758
Alexander Belopolsky40018472011-02-26 01:02:56 +00008759PyObject *
8760PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008761{
8762 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008763
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008764 s = PyUnicode_FromObject(s);
8765 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008766 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 if (sep != NULL) {
8768 sep = PyUnicode_FromObject(sep);
8769 if (sep == NULL) {
8770 Py_DECREF(s);
8771 return NULL;
8772 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008773 }
8774
8775 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8776
8777 Py_DECREF(s);
8778 Py_XDECREF(sep);
8779 return result;
8780}
8781
8782PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008784\n\
8785Return a list of the words in S, using sep as the\n\
8786delimiter string, starting at the end of the string and\n\
8787working to the front. If maxsplit is given, at most maxsplit\n\
8788splits are done. If sep is not specified, any whitespace string\n\
8789is a separator.");
8790
8791static PyObject*
8792unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8793{
8794 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008795 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008796
Martin v. Löwis18e16552006-02-15 17:27:45 +00008797 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008798 return NULL;
8799
8800 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008802 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008804 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008806}
8807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008808PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810\n\
8811Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008812Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008813is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814
8815static PyObject*
8816unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8817{
Guido van Rossum86662912000-04-11 15:38:46 +00008818 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819
Guido van Rossum86662912000-04-11 15:38:46 +00008820 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 return NULL;
8822
Guido van Rossum86662912000-04-11 15:38:46 +00008823 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824}
8825
8826static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008827PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828{
Walter Dörwald346737f2007-05-31 10:44:43 +00008829 if (PyUnicode_CheckExact(self)) {
8830 Py_INCREF(self);
8831 return self;
8832 } else
8833 /* Subtype -- return genuine unicode string with the same value. */
8834 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8835 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836}
8837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008838PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840\n\
8841Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008842and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843
8844static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008845unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 return fixup(self, fixswapcase);
8848}
8849
Georg Brandlceee0772007-11-27 23:48:05 +00008850PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008852\n\
8853Return a translation table usable for str.translate().\n\
8854If there is only one argument, it must be a dictionary mapping Unicode\n\
8855ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008856Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008857If there are two arguments, they must be strings of equal length, and\n\
8858in the resulting dictionary, each character in x will be mapped to the\n\
8859character at the same position in y. If there is a third argument, it\n\
8860must be a string, whose characters will be mapped to None in the result.");
8861
8862static PyObject*
8863unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8864{
8865 PyObject *x, *y = NULL, *z = NULL;
8866 PyObject *new = NULL, *key, *value;
8867 Py_ssize_t i = 0;
8868 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008869
Georg Brandlceee0772007-11-27 23:48:05 +00008870 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8871 return NULL;
8872 new = PyDict_New();
8873 if (!new)
8874 return NULL;
8875 if (y != NULL) {
8876 /* x must be a string too, of equal length */
8877 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8878 if (!PyUnicode_Check(x)) {
8879 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8880 "be a string if there is a second argument");
8881 goto err;
8882 }
8883 if (PyUnicode_GET_SIZE(x) != ylen) {
8884 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8885 "arguments must have equal length");
8886 goto err;
8887 }
8888 /* create entries for translating chars in x to those in y */
8889 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008890 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8891 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008892 if (!key || !value)
8893 goto err;
8894 res = PyDict_SetItem(new, key, value);
8895 Py_DECREF(key);
8896 Py_DECREF(value);
8897 if (res < 0)
8898 goto err;
8899 }
8900 /* create entries for deleting chars in z */
8901 if (z != NULL) {
8902 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008903 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008904 if (!key)
8905 goto err;
8906 res = PyDict_SetItem(new, key, Py_None);
8907 Py_DECREF(key);
8908 if (res < 0)
8909 goto err;
8910 }
8911 }
8912 } else {
8913 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008914 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008915 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8916 "to maketrans it must be a dict");
8917 goto err;
8918 }
8919 /* copy entries into the new dict, converting string keys to int keys */
8920 while (PyDict_Next(x, &i, &key, &value)) {
8921 if (PyUnicode_Check(key)) {
8922 /* convert string keys to integer keys */
8923 PyObject *newkey;
8924 if (PyUnicode_GET_SIZE(key) != 1) {
8925 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8926 "table must be of length 1");
8927 goto err;
8928 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008929 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008930 if (!newkey)
8931 goto err;
8932 res = PyDict_SetItem(new, newkey, value);
8933 Py_DECREF(newkey);
8934 if (res < 0)
8935 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008936 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008937 /* just keep integer keys */
8938 if (PyDict_SetItem(new, key, value) < 0)
8939 goto err;
8940 } else {
8941 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8942 "be strings or integers");
8943 goto err;
8944 }
8945 }
8946 }
8947 return new;
8948 err:
8949 Py_DECREF(new);
8950 return NULL;
8951}
8952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008953PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955\n\
8956Return a copy of the string S, where all characters have been mapped\n\
8957through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008958Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008959Unmapped characters are left untouched. Characters mapped to None\n\
8960are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961
8962static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008963unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964{
Georg Brandlceee0772007-11-27 23:48:05 +00008965 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966}
8967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008968PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008971Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972
8973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008974unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 return fixup(self, fixupper);
8977}
8978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008979PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008982Pad a numeric string S with zeros on the left, to fill a field\n\
8983of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984
8985static PyObject *
8986unicode_zfill(PyUnicodeObject *self, PyObject *args)
8987{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008988 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 PyUnicodeObject *u;
8990
Martin v. Löwis18e16552006-02-15 17:27:45 +00008991 Py_ssize_t width;
8992 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 return NULL;
8994
8995 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008996 if (PyUnicode_CheckExact(self)) {
8997 Py_INCREF(self);
8998 return (PyObject*) self;
8999 }
9000 else
9001 return PyUnicode_FromUnicode(
9002 PyUnicode_AS_UNICODE(self),
9003 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 }
9006
9007 fill = width - self->length;
9008
9009 u = pad(self, fill, 0, '0');
9010
Walter Dörwald068325e2002-04-15 13:36:47 +00009011 if (u == NULL)
9012 return NULL;
9013
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 if (u->str[fill] == '+' || u->str[fill] == '-') {
9015 /* move sign to beginning of string */
9016 u->str[0] = u->str[fill];
9017 u->str[fill] = '0';
9018 }
9019
9020 return (PyObject*) u;
9021}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022
#if 0
/* Compiled out.  Returns the value of numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}

/* Compiled out.  Runs PyUnicode_TransformDecimalToASCII() over self's
   buffer and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
9037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009038PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009039 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009041Return True if S starts with the specified prefix, False otherwise.\n\
9042With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009043With optional end, stop comparing S at that position.\n\
9044prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045
9046static PyObject *
9047unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009048 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009050 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009052 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009053 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009054 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009056 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9058 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009059 if (PyTuple_Check(subobj)) {
9060 Py_ssize_t i;
9061 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9062 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009064 if (substring == NULL)
9065 return NULL;
9066 result = tailmatch(self, substring, start, end, -1);
9067 Py_DECREF(substring);
9068 if (result) {
9069 Py_RETURN_TRUE;
9070 }
9071 }
9072 /* nothing matched */
9073 Py_RETURN_FALSE;
9074 }
9075 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009078 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009080 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081}
9082
9083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009084PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009087Return True if S ends with the specified suffix, False otherwise.\n\
9088With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009089With optional end, stop comparing S at that position.\n\
9090suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091
9092static PyObject *
9093unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009096 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009098 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009099 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009100 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009102 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9104 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009105 if (PyTuple_Check(subobj)) {
9106 Py_ssize_t i;
9107 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9108 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009110 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009112 result = tailmatch(self, substring, start, end, +1);
9113 Py_DECREF(substring);
9114 if (result) {
9115 Py_RETURN_TRUE;
9116 }
9117 }
9118 Py_RETURN_FALSE;
9119 }
9120 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009124 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009126 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127}
9128
Eric Smith8c663262007-08-25 02:26:07 +00009129#include "stringlib/string_format.h"
9130
9131PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009133\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009134Return a formatted version of S, using substitutions from args and kwargs.\n\
9135The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009136
Eric Smith27bbca62010-11-04 17:06:58 +00009137PyDoc_STRVAR(format_map__doc__,
9138 "S.format_map(mapping) -> str\n\
9139\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009140Return a formatted version of S, using substitutions from mapping.\n\
9141The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009142
Eric Smith4a7d76d2008-05-30 18:10:19 +00009143static PyObject *
9144unicode__format__(PyObject* self, PyObject* args)
9145{
9146 PyObject *format_spec;
9147
9148 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9149 return NULL;
9150
9151 return _PyUnicode_FormatAdvanced(self,
9152 PyUnicode_AS_UNICODE(format_spec),
9153 PyUnicode_GET_SIZE(format_spec));
9154}
9155
Eric Smith8c663262007-08-25 02:26:07 +00009156PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009158\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009159Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009160
9161static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009162unicode__sizeof__(PyUnicodeObject *v)
9163{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009164 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9165 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009166}
9167
9168PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009170
9171static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009172unicode_getnewargs(PyUnicodeObject *v)
9173{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009174 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009175}
9176
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177static PyMethodDef unicode_methods[] = {
9178
9179 /* Order is according to common usage: often used methods should
9180 appear first, since lookup is done sequentially. */
9181
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009182 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009183 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9184 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009185 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009186 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9187 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9188 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9189 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9190 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9191 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9192 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009193 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009194 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9195 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9196 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009197 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009198 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9199 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9200 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009201 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009202 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009203 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009204 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009205 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9206 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9207 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9208 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9209 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9210 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9211 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9212 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9213 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9214 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9215 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9216 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9217 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9218 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009219 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009220 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009221 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009222 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009223 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009224 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009225 {"maketrans", (PyCFunction) unicode_maketrans,
9226 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009227 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009228#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009229 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230#endif
9231
9232#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009233 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009234 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009235 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236#endif
9237
Benjamin Peterson14339b62009-01-31 16:36:08 +00009238 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239 {NULL, NULL}
9240};
9241
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009242static PyObject *
9243unicode_mod(PyObject *v, PyObject *w)
9244{
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 if (!PyUnicode_Check(v)) {
9246 Py_INCREF(Py_NotImplemented);
9247 return Py_NotImplemented;
9248 }
9249 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009250}
9251
9252static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009253 0, /*nb_add*/
9254 0, /*nb_subtract*/
9255 0, /*nb_multiply*/
9256 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009257};
9258
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009260 (lenfunc) unicode_length, /* sq_length */
9261 PyUnicode_Concat, /* sq_concat */
9262 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9263 (ssizeargfunc) unicode_getitem, /* sq_item */
9264 0, /* sq_slice */
9265 0, /* sq_ass_item */
9266 0, /* sq_ass_slice */
9267 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268};
9269
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009270static PyObject*
9271unicode_subscript(PyUnicodeObject* self, PyObject* item)
9272{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009273 if (PyIndex_Check(item)) {
9274 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009275 if (i == -1 && PyErr_Occurred())
9276 return NULL;
9277 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009278 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009279 return unicode_getitem(self, i);
9280 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009281 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009282 Py_UNICODE* source_buf;
9283 Py_UNICODE* result_buf;
9284 PyObject* result;
9285
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009286 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009287 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009288 return NULL;
9289 }
9290
9291 if (slicelength <= 0) {
9292 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009293 } else if (start == 0 && step == 1 && slicelength == self->length &&
9294 PyUnicode_CheckExact(self)) {
9295 Py_INCREF(self);
9296 return (PyObject *)self;
9297 } else if (step == 1) {
9298 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009299 } else {
9300 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009301 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9302 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009303
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 if (result_buf == NULL)
9305 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009306
9307 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9308 result_buf[i] = source_buf[cur];
9309 }
Tim Petersced69f82003-09-16 20:30:58 +00009310
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009311 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009312 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009313 return result;
9314 }
9315 } else {
9316 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9317 return NULL;
9318 }
9319}
9320
9321static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009322 (lenfunc)unicode_length, /* mp_length */
9323 (binaryfunc)unicode_subscript, /* mp_subscript */
9324 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009325};
9326
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328/* Helpers for PyUnicode_Format() */
9329
9330static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009331getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009333 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009335 (*p_argidx)++;
9336 if (arglen < 0)
9337 return args;
9338 else
9339 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340 }
9341 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343 return NULL;
9344}
9345
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009346/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009348static PyObject *
9349formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009351 char *p;
9352 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009354
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355 x = PyFloat_AsDouble(v);
9356 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009357 return NULL;
9358
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009360 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009361
Eric Smith0923d1d2009-04-16 20:16:10 +00009362 p = PyOS_double_to_string(x, type, prec,
9363 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009364 if (p == NULL)
9365 return NULL;
9366 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009367 PyMem_Free(p);
9368 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369}
9370
Tim Peters38fd5b62000-09-21 05:43:11 +00009371static PyObject*
9372formatlong(PyObject *val, int flags, int prec, int type)
9373{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009374 char *buf;
9375 int len;
9376 PyObject *str; /* temporary string object. */
9377 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009378
Benjamin Peterson14339b62009-01-31 16:36:08 +00009379 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9380 if (!str)
9381 return NULL;
9382 result = PyUnicode_FromStringAndSize(buf, len);
9383 Py_DECREF(str);
9384 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009385}
9386
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387static int
9388formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009389 size_t buflen,
9390 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009392 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009393 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 if (PyUnicode_GET_SIZE(v) == 1) {
9395 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9396 buf[1] = '\0';
9397 return 1;
9398 }
9399#ifndef Py_UNICODE_WIDE
9400 if (PyUnicode_GET_SIZE(v) == 2) {
9401 /* Decode a valid surrogate pair */
9402 int c0 = PyUnicode_AS_UNICODE(v)[0];
9403 int c1 = PyUnicode_AS_UNICODE(v)[1];
9404 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9405 0xDC00 <= c1 && c1 <= 0xDFFF) {
9406 buf[0] = c0;
9407 buf[1] = c1;
9408 buf[2] = '\0';
9409 return 2;
9410 }
9411 }
9412#endif
9413 goto onError;
9414 }
9415 else {
9416 /* Integer input truncated to a character */
9417 long x;
9418 x = PyLong_AsLong(v);
9419 if (x == -1 && PyErr_Occurred())
9420 goto onError;
9421
9422 if (x < 0 || x > 0x10ffff) {
9423 PyErr_SetString(PyExc_OverflowError,
9424 "%c arg not in range(0x110000)");
9425 return -1;
9426 }
9427
9428#ifndef Py_UNICODE_WIDE
9429 if (x > 0xffff) {
9430 x -= 0x10000;
9431 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9432 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9433 return 2;
9434 }
9435#endif
9436 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009437 buf[1] = '\0';
9438 return 1;
9439 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009440
Benjamin Peterson29060642009-01-31 22:14:21 +00009441 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009442 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009443 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009444 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445}
9446
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009447/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009448 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009449*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009450#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009451
Alexander Belopolsky40018472011-02-26 01:02:56 +00009452PyObject *
9453PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454{
9455 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009456 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457 int args_owned = 0;
9458 PyUnicodeObject *result = NULL;
9459 PyObject *dict = NULL;
9460 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009461
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009463 PyErr_BadInternalCall();
9464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465 }
9466 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009467 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 fmt = PyUnicode_AS_UNICODE(uformat);
9470 fmtcnt = PyUnicode_GET_SIZE(uformat);
9471
9472 reslen = rescnt = fmtcnt + 100;
9473 result = _PyUnicode_New(reslen);
9474 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 res = PyUnicode_AS_UNICODE(result);
9477
9478 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009479 arglen = PyTuple_Size(args);
9480 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 }
9482 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 arglen = -1;
9484 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009486 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009487 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009488 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489
9490 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009491 if (*fmt != '%') {
9492 if (--rescnt < 0) {
9493 rescnt = fmtcnt + 100;
9494 reslen += rescnt;
9495 if (_PyUnicode_Resize(&result, reslen) < 0)
9496 goto onError;
9497 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9498 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009499 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009501 }
9502 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 /* Got a format specifier */
9504 int flags = 0;
9505 Py_ssize_t width = -1;
9506 int prec = -1;
9507 Py_UNICODE c = '\0';
9508 Py_UNICODE fill;
9509 int isnumok;
9510 PyObject *v = NULL;
9511 PyObject *temp = NULL;
9512 Py_UNICODE *pbuf;
9513 Py_UNICODE sign;
9514 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009515 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 fmt++;
9518 if (*fmt == '(') {
9519 Py_UNICODE *keystart;
9520 Py_ssize_t keylen;
9521 PyObject *key;
9522 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009523
Benjamin Peterson29060642009-01-31 22:14:21 +00009524 if (dict == NULL) {
9525 PyErr_SetString(PyExc_TypeError,
9526 "format requires a mapping");
9527 goto onError;
9528 }
9529 ++fmt;
9530 --fmtcnt;
9531 keystart = fmt;
9532 /* Skip over balanced parentheses */
9533 while (pcount > 0 && --fmtcnt >= 0) {
9534 if (*fmt == ')')
9535 --pcount;
9536 else if (*fmt == '(')
9537 ++pcount;
9538 fmt++;
9539 }
9540 keylen = fmt - keystart - 1;
9541 if (fmtcnt < 0 || pcount > 0) {
9542 PyErr_SetString(PyExc_ValueError,
9543 "incomplete format key");
9544 goto onError;
9545 }
9546#if 0
9547 /* keys are converted to strings using UTF-8 and
9548 then looked up since Python uses strings to hold
9549 variables names etc. in its namespaces and we
9550 wouldn't want to break common idioms. */
9551 key = PyUnicode_EncodeUTF8(keystart,
9552 keylen,
9553 NULL);
9554#else
9555 key = PyUnicode_FromUnicode(keystart, keylen);
9556#endif
9557 if (key == NULL)
9558 goto onError;
9559 if (args_owned) {
9560 Py_DECREF(args);
9561 args_owned = 0;
9562 }
9563 args = PyObject_GetItem(dict, key);
9564 Py_DECREF(key);
9565 if (args == NULL) {
9566 goto onError;
9567 }
9568 args_owned = 1;
9569 arglen = -1;
9570 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 while (--fmtcnt >= 0) {
9573 switch (c = *fmt++) {
9574 case '-': flags |= F_LJUST; continue;
9575 case '+': flags |= F_SIGN; continue;
9576 case ' ': flags |= F_BLANK; continue;
9577 case '#': flags |= F_ALT; continue;
9578 case '0': flags |= F_ZERO; continue;
9579 }
9580 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009581 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 if (c == '*') {
9583 v = getnextarg(args, arglen, &argidx);
9584 if (v == NULL)
9585 goto onError;
9586 if (!PyLong_Check(v)) {
9587 PyErr_SetString(PyExc_TypeError,
9588 "* wants int");
9589 goto onError;
9590 }
9591 width = PyLong_AsLong(v);
9592 if (width == -1 && PyErr_Occurred())
9593 goto onError;
9594 if (width < 0) {
9595 flags |= F_LJUST;
9596 width = -width;
9597 }
9598 if (--fmtcnt >= 0)
9599 c = *fmt++;
9600 }
9601 else if (c >= '0' && c <= '9') {
9602 width = c - '0';
9603 while (--fmtcnt >= 0) {
9604 c = *fmt++;
9605 if (c < '0' || c > '9')
9606 break;
9607 if ((width*10) / 10 != width) {
9608 PyErr_SetString(PyExc_ValueError,
9609 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009610 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 }
9612 width = width*10 + (c - '0');
9613 }
9614 }
9615 if (c == '.') {
9616 prec = 0;
9617 if (--fmtcnt >= 0)
9618 c = *fmt++;
9619 if (c == '*') {
9620 v = getnextarg(args, arglen, &argidx);
9621 if (v == NULL)
9622 goto onError;
9623 if (!PyLong_Check(v)) {
9624 PyErr_SetString(PyExc_TypeError,
9625 "* wants int");
9626 goto onError;
9627 }
9628 prec = PyLong_AsLong(v);
9629 if (prec == -1 && PyErr_Occurred())
9630 goto onError;
9631 if (prec < 0)
9632 prec = 0;
9633 if (--fmtcnt >= 0)
9634 c = *fmt++;
9635 }
9636 else if (c >= '0' && c <= '9') {
9637 prec = c - '0';
9638 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009639 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009640 if (c < '0' || c > '9')
9641 break;
9642 if ((prec*10) / 10 != prec) {
9643 PyErr_SetString(PyExc_ValueError,
9644 "prec too big");
9645 goto onError;
9646 }
9647 prec = prec*10 + (c - '0');
9648 }
9649 }
9650 } /* prec */
9651 if (fmtcnt >= 0) {
9652 if (c == 'h' || c == 'l' || c == 'L') {
9653 if (--fmtcnt >= 0)
9654 c = *fmt++;
9655 }
9656 }
9657 if (fmtcnt < 0) {
9658 PyErr_SetString(PyExc_ValueError,
9659 "incomplete format");
9660 goto onError;
9661 }
9662 if (c != '%') {
9663 v = getnextarg(args, arglen, &argidx);
9664 if (v == NULL)
9665 goto onError;
9666 }
9667 sign = 0;
9668 fill = ' ';
9669 switch (c) {
9670
9671 case '%':
9672 pbuf = formatbuf;
9673 /* presume that buffer length is at least 1 */
9674 pbuf[0] = '%';
9675 len = 1;
9676 break;
9677
9678 case 's':
9679 case 'r':
9680 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009681 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 temp = v;
9683 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009684 }
9685 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009686 if (c == 's')
9687 temp = PyObject_Str(v);
9688 else if (c == 'r')
9689 temp = PyObject_Repr(v);
9690 else
9691 temp = PyObject_ASCII(v);
9692 if (temp == NULL)
9693 goto onError;
9694 if (PyUnicode_Check(temp))
9695 /* nothing to do */;
9696 else {
9697 Py_DECREF(temp);
9698 PyErr_SetString(PyExc_TypeError,
9699 "%s argument has non-string str()");
9700 goto onError;
9701 }
9702 }
9703 pbuf = PyUnicode_AS_UNICODE(temp);
9704 len = PyUnicode_GET_SIZE(temp);
9705 if (prec >= 0 && len > prec)
9706 len = prec;
9707 break;
9708
9709 case 'i':
9710 case 'd':
9711 case 'u':
9712 case 'o':
9713 case 'x':
9714 case 'X':
9715 if (c == 'i')
9716 c = 'd';
9717 isnumok = 0;
9718 if (PyNumber_Check(v)) {
9719 PyObject *iobj=NULL;
9720
9721 if (PyLong_Check(v)) {
9722 iobj = v;
9723 Py_INCREF(iobj);
9724 }
9725 else {
9726 iobj = PyNumber_Long(v);
9727 }
9728 if (iobj!=NULL) {
9729 if (PyLong_Check(iobj)) {
9730 isnumok = 1;
9731 temp = formatlong(iobj, flags, prec, c);
9732 Py_DECREF(iobj);
9733 if (!temp)
9734 goto onError;
9735 pbuf = PyUnicode_AS_UNICODE(temp);
9736 len = PyUnicode_GET_SIZE(temp);
9737 sign = 1;
9738 }
9739 else {
9740 Py_DECREF(iobj);
9741 }
9742 }
9743 }
9744 if (!isnumok) {
9745 PyErr_Format(PyExc_TypeError,
9746 "%%%c format: a number is required, "
9747 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9748 goto onError;
9749 }
9750 if (flags & F_ZERO)
9751 fill = '0';
9752 break;
9753
9754 case 'e':
9755 case 'E':
9756 case 'f':
9757 case 'F':
9758 case 'g':
9759 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009760 temp = formatfloat(v, flags, prec, c);
9761 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009762 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009763 pbuf = PyUnicode_AS_UNICODE(temp);
9764 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009765 sign = 1;
9766 if (flags & F_ZERO)
9767 fill = '0';
9768 break;
9769
9770 case 'c':
9771 pbuf = formatbuf;
9772 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9773 if (len < 0)
9774 goto onError;
9775 break;
9776
9777 default:
9778 PyErr_Format(PyExc_ValueError,
9779 "unsupported format character '%c' (0x%x) "
9780 "at index %zd",
9781 (31<=c && c<=126) ? (char)c : '?',
9782 (int)c,
9783 (Py_ssize_t)(fmt - 1 -
9784 PyUnicode_AS_UNICODE(uformat)));
9785 goto onError;
9786 }
9787 if (sign) {
9788 if (*pbuf == '-' || *pbuf == '+') {
9789 sign = *pbuf++;
9790 len--;
9791 }
9792 else if (flags & F_SIGN)
9793 sign = '+';
9794 else if (flags & F_BLANK)
9795 sign = ' ';
9796 else
9797 sign = 0;
9798 }
9799 if (width < len)
9800 width = len;
9801 if (rescnt - (sign != 0) < width) {
9802 reslen -= rescnt;
9803 rescnt = width + fmtcnt + 100;
9804 reslen += rescnt;
9805 if (reslen < 0) {
9806 Py_XDECREF(temp);
9807 PyErr_NoMemory();
9808 goto onError;
9809 }
9810 if (_PyUnicode_Resize(&result, reslen) < 0) {
9811 Py_XDECREF(temp);
9812 goto onError;
9813 }
9814 res = PyUnicode_AS_UNICODE(result)
9815 + reslen - rescnt;
9816 }
9817 if (sign) {
9818 if (fill != ' ')
9819 *res++ = sign;
9820 rescnt--;
9821 if (width > len)
9822 width--;
9823 }
9824 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9825 assert(pbuf[0] == '0');
9826 assert(pbuf[1] == c);
9827 if (fill != ' ') {
9828 *res++ = *pbuf++;
9829 *res++ = *pbuf++;
9830 }
9831 rescnt -= 2;
9832 width -= 2;
9833 if (width < 0)
9834 width = 0;
9835 len -= 2;
9836 }
9837 if (width > len && !(flags & F_LJUST)) {
9838 do {
9839 --rescnt;
9840 *res++ = fill;
9841 } while (--width > len);
9842 }
9843 if (fill == ' ') {
9844 if (sign)
9845 *res++ = sign;
9846 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9847 assert(pbuf[0] == '0');
9848 assert(pbuf[1] == c);
9849 *res++ = *pbuf++;
9850 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009851 }
9852 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 Py_UNICODE_COPY(res, pbuf, len);
9854 res += len;
9855 rescnt -= len;
9856 while (--width >= len) {
9857 --rescnt;
9858 *res++ = ' ';
9859 }
9860 if (dict && (argidx < arglen) && c != '%') {
9861 PyErr_SetString(PyExc_TypeError,
9862 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009863 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009864 goto onError;
9865 }
9866 Py_XDECREF(temp);
9867 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868 } /* until end */
9869 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009870 PyErr_SetString(PyExc_TypeError,
9871 "not all arguments converted during string formatting");
9872 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 }
9874
Thomas Woutersa96affe2006-03-12 00:29:36 +00009875 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009876 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009878 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879 }
9880 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881 return (PyObject *)result;
9882
Benjamin Peterson29060642009-01-31 22:14:21 +00009883 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884 Py_XDECREF(result);
9885 Py_DECREF(uformat);
9886 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009887 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888 }
9889 return NULL;
9890}
9891
Jeremy Hylton938ace62002-07-17 16:30:39 +00009892static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009893unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9894
Tim Peters6d6c1a32001-08-02 04:15:00 +00009895static PyObject *
9896unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9897{
Benjamin Peterson29060642009-01-31 22:14:21 +00009898 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009899 static char *kwlist[] = {"object", "encoding", "errors", 0};
9900 char *encoding = NULL;
9901 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009902
Benjamin Peterson14339b62009-01-31 16:36:08 +00009903 if (type != &PyUnicode_Type)
9904 return unicode_subtype_new(type, args, kwds);
9905 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009906 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009907 return NULL;
9908 if (x == NULL)
9909 return (PyObject *)_PyUnicode_New(0);
9910 if (encoding == NULL && errors == NULL)
9911 return PyObject_Str(x);
9912 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009914}
9915
Guido van Rossume023fe02001-08-30 03:12:59 +00009916static PyObject *
9917unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9918{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009919 PyUnicodeObject *tmp, *pnew;
9920 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009921
Benjamin Peterson14339b62009-01-31 16:36:08 +00009922 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9923 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9924 if (tmp == NULL)
9925 return NULL;
9926 assert(PyUnicode_Check(tmp));
9927 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9928 if (pnew == NULL) {
9929 Py_DECREF(tmp);
9930 return NULL;
9931 }
9932 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9933 if (pnew->str == NULL) {
9934 _Py_ForgetReference((PyObject *)pnew);
9935 PyObject_Del(pnew);
9936 Py_DECREF(tmp);
9937 return PyErr_NoMemory();
9938 }
9939 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9940 pnew->length = n;
9941 pnew->hash = tmp->hash;
9942 Py_DECREF(tmp);
9943 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009944}
9945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009946PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009948\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009949Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009950encoding defaults to the current default string encoding.\n\
9951errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009952
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009953static PyObject *unicode_iter(PyObject *seq);
9954
/* Type object for str.  Slots are listed positionally; each line's comment
   names the slot it fills.  A 0 leaves the slot empty. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009956 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009957 "str", /* tp_name */
9958 sizeof(PyUnicodeObject), /* tp_basicsize */
9959 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009961 (destructor)unicode_dealloc, /* tp_dealloc */
9962 0, /* tp_print */
9963 0, /* tp_getattr */
9964 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009965 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009966 unicode_repr, /* tp_repr */
9967 &unicode_as_number, /* tp_as_number */
9968 &unicode_as_sequence, /* tp_as_sequence */
9969 &unicode_as_mapping, /* tp_as_mapping */
9970 (hashfunc) unicode_hash, /* tp_hash*/
9971 0, /* tp_call*/
9972 (reprfunc) unicode_str, /* tp_str */
9973 PyObject_GenericGetAttr, /* tp_getattro */
9974 0, /* tp_setattro */
9975 0, /* tp_as_buffer */
9976 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009977 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009978 unicode_doc, /* tp_doc */
9979 0, /* tp_traverse */
9980 0, /* tp_clear */
9981 PyUnicode_RichCompare, /* tp_richcompare */
9982 0, /* tp_weaklistoffset */
9983 unicode_iter, /* tp_iter */
9984 0, /* tp_iternext */
9985 unicode_methods, /* tp_methods */
9986 0, /* tp_members */
9987 0, /* tp_getset */
9988 &PyBaseObject_Type, /* tp_base */
9989 0, /* tp_dict */
9990 0, /* tp_descr_get */
9991 0, /* tp_descr_set */
9992 0, /* tp_dictoffset */
9993 0, /* tp_init */
9994 0, /* tp_alloc */
9995 unicode_new, /* tp_new */
9996 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997};
9998
9999/* Initialize the Unicode implementation */
10000
Thomas Wouters78890102000-07-22 19:25:51 +000010001void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010003 int i;
10004
Thomas Wouters477c8d52006-05-27 19:21:47 +000010005 /* XXX - move this array to unicodectype.c ? */
10006 Py_UNICODE linebreak[] = {
10007 0x000A, /* LINE FEED */
10008 0x000D, /* CARRIAGE RETURN */
10009 0x001C, /* FILE SEPARATOR */
10010 0x001D, /* GROUP SEPARATOR */
10011 0x001E, /* RECORD SEPARATOR */
10012 0x0085, /* NEXT LINE */
10013 0x2028, /* LINE SEPARATOR */
10014 0x2029, /* PARAGRAPH SEPARATOR */
10015 };
10016
Fred Drakee4315f52000-05-09 19:53:39 +000010017 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010018 free_list = NULL;
10019 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010021 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010022 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010023
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010024 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010026 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010027 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010028
10029 /* initialize the linebreak bloom filter */
10030 bloom_linebreak = make_bloom_mask(
10031 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10032 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010033
10034 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035}
10036
10037/* Finalize the Unicode implementation */
10038
Christian Heimesa156e092008-02-16 07:38:31 +000010039int
10040PyUnicode_ClearFreeList(void)
10041{
10042 int freelist_size = numfree;
10043 PyUnicodeObject *u;
10044
10045 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010046 PyUnicodeObject *v = u;
10047 u = *(PyUnicodeObject **)u;
10048 if (v->str)
10049 PyObject_DEL(v->str);
10050 Py_XDECREF(v->defenc);
10051 PyObject_Del(v);
10052 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010053 }
10054 free_list = NULL;
10055 assert(numfree == 0);
10056 return freelist_size;
10057}
10058
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059void
Thomas Wouters78890102000-07-22 19:25:51 +000010060_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010062 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010064 Py_XDECREF(unicode_empty);
10065 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010066
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010067 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010068 if (unicode_latin1[i]) {
10069 Py_DECREF(unicode_latin1[i]);
10070 unicode_latin1[i] = NULL;
10071 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010072 }
Christian Heimesa156e092008-02-16 07:38:31 +000010073 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010075
Walter Dörwald16807132007-05-25 13:52:07 +000010076void
10077PyUnicode_InternInPlace(PyObject **p)
10078{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010079 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10080 PyObject *t;
10081 if (s == NULL || !PyUnicode_Check(s))
10082 Py_FatalError(
10083 "PyUnicode_InternInPlace: unicode strings only please!");
10084 /* If it's a subclass, we don't really know what putting
10085 it in the interned dict might do. */
10086 if (!PyUnicode_CheckExact(s))
10087 return;
10088 if (PyUnicode_CHECK_INTERNED(s))
10089 return;
10090 if (interned == NULL) {
10091 interned = PyDict_New();
10092 if (interned == NULL) {
10093 PyErr_Clear(); /* Don't leave an exception */
10094 return;
10095 }
10096 }
10097 /* It might be that the GetItem call fails even
10098 though the key is present in the dictionary,
10099 namely when this happens during a stack overflow. */
10100 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010101 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010102 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010103
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 if (t) {
10105 Py_INCREF(t);
10106 Py_DECREF(*p);
10107 *p = t;
10108 return;
10109 }
Walter Dörwald16807132007-05-25 13:52:07 +000010110
Benjamin Peterson14339b62009-01-31 16:36:08 +000010111 PyThreadState_GET()->recursion_critical = 1;
10112 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10113 PyErr_Clear();
10114 PyThreadState_GET()->recursion_critical = 0;
10115 return;
10116 }
10117 PyThreadState_GET()->recursion_critical = 0;
10118 /* The two references in interned are not counted by refcnt.
10119 The deallocator will take care of this */
10120 Py_REFCNT(s) -= 2;
10121 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010122}
10123
10124void
10125PyUnicode_InternImmortal(PyObject **p)
10126{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010127 PyUnicode_InternInPlace(p);
10128 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10129 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10130 Py_INCREF(*p);
10131 }
Walter Dörwald16807132007-05-25 13:52:07 +000010132}
10133
10134PyObject *
10135PyUnicode_InternFromString(const char *cp)
10136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010137 PyObject *s = PyUnicode_FromString(cp);
10138 if (s == NULL)
10139 return NULL;
10140 PyUnicode_InternInPlace(&s);
10141 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010142}
10143
Alexander Belopolsky40018472011-02-26 01:02:56 +000010144void
10145_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010146{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010147 PyObject *keys;
10148 PyUnicodeObject *s;
10149 Py_ssize_t i, n;
10150 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010151
Benjamin Peterson14339b62009-01-31 16:36:08 +000010152 if (interned == NULL || !PyDict_Check(interned))
10153 return;
10154 keys = PyDict_Keys(interned);
10155 if (keys == NULL || !PyList_Check(keys)) {
10156 PyErr_Clear();
10157 return;
10158 }
Walter Dörwald16807132007-05-25 13:52:07 +000010159
Benjamin Peterson14339b62009-01-31 16:36:08 +000010160 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10161 detector, interned unicode strings are not forcibly deallocated;
10162 rather, we give them their stolen references back, and then clear
10163 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010164
Benjamin Peterson14339b62009-01-31 16:36:08 +000010165 n = PyList_GET_SIZE(keys);
10166 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010167 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010168 for (i = 0; i < n; i++) {
10169 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10170 switch (s->state) {
10171 case SSTATE_NOT_INTERNED:
10172 /* XXX Shouldn't happen */
10173 break;
10174 case SSTATE_INTERNED_IMMORTAL:
10175 Py_REFCNT(s) += 1;
10176 immortal_size += s->length;
10177 break;
10178 case SSTATE_INTERNED_MORTAL:
10179 Py_REFCNT(s) += 2;
10180 mortal_size += s->length;
10181 break;
10182 default:
10183 Py_FatalError("Inconsistent interned string state.");
10184 }
10185 s->state = SSTATE_NOT_INTERNED;
10186 }
10187 fprintf(stderr, "total size of all interned strings: "
10188 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10189 "mortal/immortal\n", mortal_size, immortal_size);
10190 Py_DECREF(keys);
10191 PyDict_Clear(interned);
10192 Py_DECREF(interned);
10193 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010194}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010195
10196
10197/********************* Unicode Iterator **************************/
10198
/* State of a str iterator: the string being walked and the index of the
   next code unit to hand out. */
10199typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010200 PyObject_HEAD
/* index of the next item to return */
10201 Py_ssize_t it_index;
10202 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010203} unicodeiterobject;
10204
10205static void
10206unicodeiter_dealloc(unicodeiterobject *it)
10207{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010208 _PyObject_GC_UNTRACK(it);
10209 Py_XDECREF(it->it_seq);
10210 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010211}
10212
10213static int
10214unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10215{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010216 Py_VISIT(it->it_seq);
10217 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010218}
10219
10220static PyObject *
10221unicodeiter_next(unicodeiterobject *it)
10222{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010223 PyUnicodeObject *seq;
10224 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010225
Benjamin Peterson14339b62009-01-31 16:36:08 +000010226 assert(it != NULL);
10227 seq = it->it_seq;
10228 if (seq == NULL)
10229 return NULL;
10230 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010231
Benjamin Peterson14339b62009-01-31 16:36:08 +000010232 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10233 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010234 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010235 if (item != NULL)
10236 ++it->it_index;
10237 return item;
10238 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010239
Benjamin Peterson14339b62009-01-31 16:36:08 +000010240 Py_DECREF(seq);
10241 it->it_seq = NULL;
10242 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010243}
10244
10245static PyObject *
10246unicodeiter_len(unicodeiterobject *it)
10247{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010248 Py_ssize_t len = 0;
10249 if (it->it_seq)
10250 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10251 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010252}
10253
/* Docstring for the iterator's __length_hint__ method. */
10254PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10255
/* Method table of the str iterator type; it exposes only
   __length_hint__, implemented by unicodeiter_len(). */
10256static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010258 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010259 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010260};
10261
/* Type object for str iterators ("str_iterator").  Slots are listed
   positionally; the remaining slots after the last entry are left to
   zero-initialization. */
10262PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010263 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10264 "str_iterator", /* tp_name */
10265 sizeof(unicodeiterobject), /* tp_basicsize */
10266 0, /* tp_itemsize */
10267 /* methods */
10268 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10269 0, /* tp_print */
10270 0, /* tp_getattr */
10271 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010272 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010273 0, /* tp_repr */
10274 0, /* tp_as_number */
10275 0, /* tp_as_sequence */
10276 0, /* tp_as_mapping */
10277 0, /* tp_hash */
10278 0, /* tp_call */
10279 0, /* tp_str */
10280 PyObject_GenericGetAttr, /* tp_getattro */
10281 0, /* tp_setattro */
10282 0, /* tp_as_buffer */
10283 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10284 0, /* tp_doc */
10285 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10286 0, /* tp_clear */
10287 0, /* tp_richcompare */
10288 0, /* tp_weaklistoffset */
10289 PyObject_SelfIter, /* tp_iter */
10290 (iternextfunc)unicodeiter_next, /* tp_iternext */
10291 unicodeiter_methods, /* tp_methods */
10292 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010293};
10294
10295static PyObject *
10296unicode_iter(PyObject *seq)
10297{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010298 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010299
Benjamin Peterson14339b62009-01-31 16:36:08 +000010300 if (!PyUnicode_Check(seq)) {
10301 PyErr_BadInternalCall();
10302 return NULL;
10303 }
10304 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10305 if (it == NULL)
10306 return NULL;
10307 it->it_index = 0;
10308 Py_INCREF(seq);
10309 it->it_seq = (PyUnicodeObject *)seq;
10310 _PyObject_GC_TRACK(it);
10311 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010312}
10313
Martin v. Löwis5b222132007-06-10 09:51:05 +000010314size_t
10315Py_UNICODE_strlen(const Py_UNICODE *u)
10316{
10317 int res = 0;
10318 while(*u++)
10319 res++;
10320 return res;
10321}
10322
10323Py_UNICODE*
10324Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10325{
10326 Py_UNICODE *u = s1;
10327 while ((*u++ = *s2++));
10328 return s1;
10329}
10330
10331Py_UNICODE*
10332Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10333{
10334 Py_UNICODE *u = s1;
10335 while ((*u++ = *s2++))
10336 if (n-- == 0)
10337 break;
10338 return s1;
10339}
10340
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010341Py_UNICODE*
10342Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10343{
10344 Py_UNICODE *u1 = s1;
10345 u1 += Py_UNICODE_strlen(u1);
10346 Py_UNICODE_strcpy(u1, s2);
10347 return s1;
10348}
10349
Martin v. Löwis5b222132007-06-10 09:51:05 +000010350int
10351Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10352{
10353 while (*s1 && *s2 && *s1 == *s2)
10354 s1++, s2++;
10355 if (*s1 && *s2)
10356 return (*s1 < *s2) ? -1 : +1;
10357 if (*s1)
10358 return 1;
10359 if (*s2)
10360 return -1;
10361 return 0;
10362}
10363
Victor Stinneref8d95c2010-08-16 22:03:11 +000010364int
10365Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10366{
10367 register Py_UNICODE u1, u2;
10368 for (; n != 0; n--) {
10369 u1 = *s1;
10370 u2 = *s2;
10371 if (u1 != u2)
10372 return (u1 < u2) ? -1 : +1;
10373 if (u1 == '\0')
10374 return 0;
10375 s1++;
10376 s2++;
10377 }
10378 return 0;
10379}
10380
Martin v. Löwis5b222132007-06-10 09:51:05 +000010381Py_UNICODE*
10382Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10383{
10384 const Py_UNICODE *p;
10385 for (p = s; *p; p++)
10386 if (*p == c)
10387 return (Py_UNICODE*)p;
10388 return NULL;
10389}
10390
Victor Stinner331ea922010-08-10 16:37:20 +000010391Py_UNICODE*
10392Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10393{
10394 const Py_UNICODE *p;
10395 p = s + Py_UNICODE_strlen(s);
10396 while (p != s) {
10397 p--;
10398 if (*p == c)
10399 return (Py_UNICODE*)p;
10400 }
10401 return NULL;
10402}
10403
Victor Stinner71133ff2010-09-01 23:43:53 +000010404Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010405PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010406{
10407 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10408 Py_UNICODE *copy;
10409 Py_ssize_t size;
10410
10411 /* Ensure we won't overflow the size. */
10412 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10413 PyErr_NoMemory();
10414 return NULL;
10415 }
10416 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10417 size *= sizeof(Py_UNICODE);
10418 copy = PyMem_Malloc(size);
10419 if (copy == NULL) {
10420 PyErr_NoMemory();
10421 return NULL;
10422 }
10423 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10424 return copy;
10425}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010426
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
10429
10430static PyMethodDef _string_methods[] = {
10431 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10432 METH_O, PyDoc_STR("split the argument as a field name")},
10433 {"formatter_parser", (PyCFunction) formatter_parser,
10434 METH_O, PyDoc_STR("parse the argument as a format string")},
10435 {NULL, NULL}
10436};
10437
10438static struct PyModuleDef _string_module = {
10439 PyModuleDef_HEAD_INIT,
10440 "_string",
10441 PyDoc_STR("string helper module"),
10442 0,
10443 _string_methods,
10444 NULL,
10445 NULL,
10446 NULL,
10447 NULL
10448};
10449
10450PyMODINIT_FUNC
10451PyInit__string(void)
10452{
10453 return PyModule_Create(&_string_module);
10454}
10455
10456
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010457#ifdef __cplusplus
10458}
10459#endif