blob: 95823ad827a0b3b6a6a0204e87169f612b6fbc91 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Maximum number of deallocated Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Size limit for the free list "keep-alive" optimization.

   Objects placed on the free list keep their character buffer allocated
   as long as their length is below this limit, which saves malloc()
   overhead when small Unicode objects are recycled.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused memory lying around.

   A limit of 0 effectively turns the optimization off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte-order switches; little endian unless WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c is 1 when the 7-bit character c is whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149 PyObject **errorHandler,const char *encoding, const char *reason,
150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
Victor Stinner31be90b2010-04-22 19:38:16 +0000153static void raise_encode_exception(PyObject **exceptionObject,
154 const char *encoding,
155 const Py_UNICODE *unicode, Py_ssize_t size,
156 Py_ssize_t startpos, Py_ssize_t endpos,
157 const char *reason);
158
/* Same kind of lookup table for linebreaks: entry c is 1 when the 7-bit
   character c is a line break. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low bits of each
   Unicode character (ch & (BLOOM_WIDTH - 1)) select the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
207
Antoine Pitrouf068f942010-01-13 14:19:12 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
/* BLOOM_ADD(mask, ch) sets, and BLOOM(mask, ch) tests, the bit of mask
   selected by the low bits of ch (ch & (BLOOM_WIDTH - 1)).

   The mask argument is parenthesized in the expansion so that an
   arbitrary expression such as (a | b) can be passed to BLOOM without
   operator precedence changing the result.  BLOOM_ADD assigns to mask,
   so it still needs a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if ch is a line break.  Characters below 128 are looked up
   directly in ascii_linebreak; for all others bloom_linebreak is tested
   first and Py_UNICODE_ISLINEBREAK is only evaluated when the bloom bit
   is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252}
253
/* Non-zero if chr is a member of set (setlen characters).  The bloom
   mask is tested first so that unicode_member() only runs when the
   bloom bit for chr is set.  The whole expansion is parenthesized so
   the macro behaves as a single operand (e.g. under a leading '!'). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000278 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
/* We allocate room for one more Py_UNICODE than requested to make sure
   the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
316
317static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319{
320 register PyUnicodeObject *unicode;
321
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 if (length == 0 && unicode_empty != NULL) {
324 Py_INCREF(unicode_empty);
325 return unicode_empty;
326 }
327
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000328 /* Ensure we won't overflow the size. */
329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330 return (PyUnicodeObject *)PyErr_NoMemory();
331 }
332
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000334 if (free_list) {
335 unicode = free_list;
336 free_list = *(PyUnicodeObject **)unicode;
337 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000338 if (unicode->str) {
339 /* Keep-Alive optimization: we only upsize the buffer,
340 never downsize it. */
341 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000342 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000343 PyObject_DEL(unicode->str);
344 unicode->str = NULL;
345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000346 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000350 }
351 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 if (unicode == NULL)
357 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 }
361
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 PyErr_NoMemory();
364 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000365 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000366 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000367 * the caller fails before initializing str -- unicode_resize()
368 * reads str[0], and the Keep-Alive optimization can keep memory
369 * allocated for str alive across a call to unicode_dealloc(unicode).
370 * We don't want unicode_resize to read uninitialized memory in
371 * that case.
372 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000373 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000377 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000378 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380
Benjamin Peterson29060642009-01-31 22:14:21 +0000381 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000382 /* XXX UNREF/NEWREF interface should be more symmetrical */
383 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000385 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389static
Guido van Rossum9475a232001-10-05 20:51:39 +0000390void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391{
Walter Dörwald16807132007-05-25 13:52:07 +0000392 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_NOT_INTERNED:
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_MORTAL:
397 /* revive dead object temporarily for DelItem */
398 Py_REFCNT(unicode) = 3;
399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400 Py_FatalError(
401 "deletion of interned string failed");
402 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 case SSTATE_INTERNED_IMMORTAL:
405 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406
Benjamin Peterson29060642009-01-31 22:14:21 +0000407 default:
408 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000409 }
410
Guido van Rossum604ddf82001-12-06 20:03:56 +0000411 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000413 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415 PyObject_DEL(unicode->str);
416 unicode->str = NULL;
417 unicode->length = 0;
418 }
419 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000420 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 }
422 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000423 *(PyUnicodeObject **)unicode = free_list;
424 free_list = unicode;
425 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426 }
427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyObject_DEL(unicode->str);
429 Py_XDECREF(unicode->defenc);
430 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432}
433
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436{
437 register PyUnicodeObject *v;
438
439 /* Argument checks */
440 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 PyErr_BadInternalCall();
442 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
449
450 /* Resizing unicode_empty and single character objects is not
451 possible since these are being shared. We simply return a fresh
452 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000453 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000454 (v == unicode_empty || v->length == 1)) {
455 PyUnicodeObject *w = _PyUnicode_New(length);
456 if (w == NULL)
457 return -1;
458 Py_UNICODE_COPY(w->str, v->str,
459 length < v->length ? length : v->length);
460 Py_DECREF(*unicode);
461 *unicode = w;
462 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 }
464
465 /* Note that we don't have to modify *unicode for unshared Unicode
466 objects, since we can modify them in-place. */
467 return unicode_resize(v, length);
468}
469
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477{
478 PyUnicodeObject *unicode;
479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects. */
482 if (u != NULL) {
483
Benjamin Peterson29060642009-01-31 22:14:21 +0000484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000489
490 /* Single character Unicode objects in the Latin-1 range are
491 shared when using this constructor */
492 if (size == 1 && *u < 256) {
493 unicode = unicode_latin1[*u];
494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = *u;
499 unicode_latin1[*u] = unicode;
500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 /* Copy the Unicode data into the new object */
511 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000512 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 return (PyObject *)unicode;
515}
516
Walter Dörwaldd2034312007-05-18 16:29:38 +0000517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518{
519 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000520
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 if (size < 0) {
522 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 return NULL;
525 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000526
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000528 some optimizations which share commonly used objects.
529 Also, this means the input must be UTF-8, so fall back to the
530 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (u != NULL) {
532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533 /* Optimization for empty strings */
534 if (size == 0 && unicode_empty != NULL) {
535 Py_INCREF(unicode_empty);
536 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000537 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000538
539 /* Single characters are shared when using this constructor.
540 Restrict to ASCII, since the input must be UTF-8. */
541 if (size == 1 && Py_CHARMASK(*u) < 128) {
542 unicode = unicode_latin1[Py_CHARMASK(*u)];
543 if (!unicode) {
544 unicode = _PyUnicode_New(1);
545 if (!unicode)
546 return NULL;
547 unicode->str[0] = Py_CHARMASK(*u);
548 unicode_latin1[Py_CHARMASK(*u)] = unicode;
549 }
550 Py_INCREF(unicode);
551 return (PyObject *)unicode;
552 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000553
554 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 }
556
Walter Dörwald55507312007-05-18 13:12:10 +0000557 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 if (!unicode)
559 return NULL;
560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 return (PyObject *)unicode;
562}
563
Walter Dörwaldd2034312007-05-18 16:29:38 +0000564PyObject *PyUnicode_FromString(const char *u)
565{
566 size_t size = strlen(u);
567 if (size > PY_SSIZE_T_MAX) {
568 PyErr_SetString(PyExc_OverflowError, "input too long");
569 return NULL;
570 }
571
572 return PyUnicode_FromStringAndSize(u, size);
573}
574
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575#ifdef HAVE_WCHAR_H
576
Mark Dickinson081dfee2009-03-18 14:47:41 +0000577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584 to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
588{
589 PyUnicodeObject *unicode;
590 register Py_ssize_t i;
591 Py_ssize_t alloc;
592 const wchar_t *orig_w;
593
594 if (w == NULL) {
595 if (size == 0)
596 return PyUnicode_FromStringAndSize(NULL, 0);
597 PyErr_BadInternalCall();
598 return NULL;
599 }
600
601 if (size == -1) {
602 size = wcslen(w);
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000643 if (size == 0)
644 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_BadInternalCall();
646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 }
648
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == -1) {
650 size = wcslen(w);
651 }
652
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 unicode = _PyUnicode_New(size);
654 if (!unicode)
655 return NULL;
656
657 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000660#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000662 register Py_UNICODE *u;
663 register Py_ssize_t i;
664 u = PyUnicode_AS_UNICODE(unicode);
665 for (i = size; i > 0; i--)
666 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668#endif
669
670 return (PyObject *)unicode;
671}
672
Mark Dickinson081dfee2009-03-18 14:47:41 +0000673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
Walter Dörwald346737f2007-05-31 10:44:43 +0000677static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000680{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000681 *fmt++ = '%';
682 if (width) {
683 if (zeropad)
684 *fmt++ = '0';
685 fmt += sprintf(fmt, "%d", width);
686 }
687 if (precision)
688 fmt += sprintf(fmt, ".%d", precision);
689 if (longflag)
690 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000691 else if (longlongflag) {
692 /* longlongflag should only ever be nonzero on machines with
693 HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695 char *f = PY_FORMAT_LONG_LONG;
696 while (*f)
697 *fmt++ = *f++;
698#else
699 /* we shouldn't ever get here */
700 assert(0);
701 *fmt++ = 'l';
702#endif
703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000704 else if (size_tflag) {
705 char *f = PY_FORMAT_SIZE_T;
706 while (*f)
707 *fmt++ = *f++;
708 }
709 *fmt++ = c;
710 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000711}
712
Walter Dörwaldd2034312007-05-18 16:29:38 +0000713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
714
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000715/* size of fixed-size buffer for formatting single arguments */
716#define ITEM_BUFFER_LEN 21
717/* maximum number of characters required for output of %ld. 21 characters
718 allows for 64-bit integers (in decimal) and an optional sign. */
719#define MAX_LONG_CHARS 21
720/* maximum number of characters required for output of %lld.
721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000728 va_list count;
729 Py_ssize_t callcount = 0;
730 PyObject **callresults = NULL;
731 PyObject **callresult = NULL;
732 Py_ssize_t n = 0;
733 int width = 0;
734 int precision = 0;
735 int zeropad;
736 const char* f;
737 Py_UNICODE *s;
738 PyObject *string;
739 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000740 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000741 /* use abuffer instead of buffer, if we need more space
742 * (which can happen if there's a format specifier with width). */
743 char *abuffer = NULL;
744 char *realbuffer;
745 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000749 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000767 }
768 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000770 if (callcount) {
771 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
772 if (!callresults) {
773 PyErr_NoMemory();
774 return NULL;
775 }
776 callresult = callresults;
777 }
778 /* step 3: figure out how large a buffer we need */
779 for (f = format; *f; f++) {
780 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000781#ifdef HAVE_LONG_LONG
782 int longlongflag = 0;
783#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000784 const char* p = f;
785 width = 0;
786 while (ISDIGIT((unsigned)*f))
787 width = (width*10) + *f++ - '0';
788 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
789 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000790
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
792 * they don't affect the amount of space we reserve.
793 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000794 if (*f == 'l') {
795 if (f[1] == 'd' || f[1] == 'u') {
796 ++f;
797 }
798#ifdef HAVE_LONG_LONG
799 else if (f[1] == 'l' &&
800 (f[2] == 'd' || f[2] == 'u')) {
801 longlongflag = 1;
802 f += 2;
803 }
804#endif
805 }
806 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000807 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000808 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 switch (*f) {
811 case 'c':
812 (void)va_arg(count, int);
813 /* fall through... */
814 case '%':
815 n++;
816 break;
817 case 'd': case 'u': case 'i': case 'x':
818 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000819#ifdef HAVE_LONG_LONG
820 if (longlongflag) {
821 if (width < MAX_LONG_LONG_CHARS)
822 width = MAX_LONG_LONG_CHARS;
823 }
824 else
825#endif
826 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
827 including sign. Decimal takes the most space. This
828 isn't enough for octal. If a width is specified we
829 need more (which we allocate later). */
830 if (width < MAX_LONG_CHARS)
831 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000832 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000833 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000834 if (abuffersize < width)
835 abuffersize = width;
836 break;
837 case 's':
838 {
839 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000840 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000841 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
842 if (!str)
843 goto fail;
844 n += PyUnicode_GET_SIZE(str);
845 /* Remember the str and switch to the next slot */
846 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 break;
848 }
849 case 'U':
850 {
851 PyObject *obj = va_arg(count, PyObject *);
852 assert(obj && PyUnicode_Check(obj));
853 n += PyUnicode_GET_SIZE(obj);
854 break;
855 }
856 case 'V':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 const char *str = va_arg(count, const char *);
860 assert(obj || str);
861 assert(!obj || PyUnicode_Check(obj));
862 if (obj)
863 n += PyUnicode_GET_SIZE(obj);
864 else
865 n += strlen(str);
866 break;
867 }
868 case 'S':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 PyObject *str;
872 assert(obj);
873 str = PyObject_Str(obj);
874 if (!str)
875 goto fail;
876 n += PyUnicode_GET_SIZE(str);
877 /* Remember the str and switch to the next slot */
878 *callresult++ = str;
879 break;
880 }
881 case 'R':
882 {
883 PyObject *obj = va_arg(count, PyObject *);
884 PyObject *repr;
885 assert(obj);
886 repr = PyObject_Repr(obj);
887 if (!repr)
888 goto fail;
889 n += PyUnicode_GET_SIZE(repr);
890 /* Remember the repr and switch to the next slot */
891 *callresult++ = repr;
892 break;
893 }
894 case 'A':
895 {
896 PyObject *obj = va_arg(count, PyObject *);
897 PyObject *ascii;
898 assert(obj);
899 ascii = PyObject_ASCII(obj);
900 if (!ascii)
901 goto fail;
902 n += PyUnicode_GET_SIZE(ascii);
903 /* Remember the repr and switch to the next slot */
904 *callresult++ = ascii;
905 break;
906 }
907 case 'p':
908 (void) va_arg(count, int);
909 /* maximum 64-bit pointer representation:
910 * 0xffffffffffffffff
911 * so 19 characters is enough.
912 * XXX I count 18 -- what's the extra for?
913 */
914 n += 19;
915 break;
916 default:
917 /* if we stumble upon an unknown
918 formatting code, copy the rest of
919 the format string to the output
920 string. (we cannot just skip the
921 code, since there's no way to know
922 what's in the argument list) */
923 n += strlen(p);
924 goto expand;
925 }
926 } else
927 n++;
928 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000929 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000930 if (abuffersize > ITEM_BUFFER_LEN) {
931 /* add 1 for sprintf's trailing null byte */
932 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 if (!abuffer) {
934 PyErr_NoMemory();
935 goto fail;
936 }
937 realbuffer = abuffer;
938 }
939 else
940 realbuffer = buffer;
941 /* step 4: fill the buffer */
942 /* Since we've analyzed how much space we need for the worst case,
943 we don't have to resize the string.
944 There can be no errors beyond this point. */
945 string = PyUnicode_FromUnicode(NULL, n);
946 if (!string)
947 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000948
Benjamin Peterson14339b62009-01-31 16:36:08 +0000949 s = PyUnicode_AS_UNICODE(string);
950 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000951
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 for (f = format; *f; f++) {
953 if (*f == '%') {
954 const char* p = f++;
955 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000956 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000957 int size_tflag = 0;
958 zeropad = (*f == '0');
959 /* parse the width.precision part */
960 width = 0;
961 while (ISDIGIT((unsigned)*f))
962 width = (width*10) + *f++ - '0';
963 precision = 0;
964 if (*f == '.') {
965 f++;
966 while (ISDIGIT((unsigned)*f))
967 precision = (precision*10) + *f++ - '0';
968 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000969 /* Handle %ld, %lu, %lld and %llu. */
970 if (*f == 'l') {
971 if (f[1] == 'd' || f[1] == 'u') {
972 longflag = 1;
973 ++f;
974 }
975#ifdef HAVE_LONG_LONG
976 else if (f[1] == 'l' &&
977 (f[2] == 'd' || f[2] == 'u')) {
978 longlongflag = 1;
979 f += 2;
980 }
981#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 }
983 /* handle the size_t flag. */
984 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
985 size_tflag = 1;
986 ++f;
987 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000988
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 switch (*f) {
990 case 'c':
991 *s++ = va_arg(vargs, int);
992 break;
993 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000994 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
995 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 if (longflag)
997 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000998#ifdef HAVE_LONG_LONG
999 else if (longlongflag)
1000 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1001#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 else if (size_tflag)
1003 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1004 else
1005 sprintf(realbuffer, fmt, va_arg(vargs, int));
1006 appendstring(realbuffer);
1007 break;
1008 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001009 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1010 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001011 if (longflag)
1012 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013#ifdef HAVE_LONG_LONG
1014 else if (longlongflag)
1015 sprintf(realbuffer, fmt, va_arg(vargs,
1016 unsigned PY_LONG_LONG));
1017#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 else if (size_tflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1020 else
1021 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1022 appendstring(realbuffer);
1023 break;
1024 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001025 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001026 sprintf(realbuffer, fmt, va_arg(vargs, int));
1027 appendstring(realbuffer);
1028 break;
1029 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001030 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001031 sprintf(realbuffer, fmt, va_arg(vargs, int));
1032 appendstring(realbuffer);
1033 break;
1034 case 's':
1035 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001036 /* unused, since we already have the result */
1037 (void) va_arg(vargs, char *);
1038 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1039 PyUnicode_GET_SIZE(*callresult));
1040 s += PyUnicode_GET_SIZE(*callresult);
1041 /* We're done with the unicode()/repr() => forget it */
1042 Py_DECREF(*callresult);
1043 /* switch to next unicode()/repr() result */
1044 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
1046 }
1047 case 'U':
1048 {
1049 PyObject *obj = va_arg(vargs, PyObject *);
1050 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1051 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1052 s += size;
1053 break;
1054 }
1055 case 'V':
1056 {
1057 PyObject *obj = va_arg(vargs, PyObject *);
1058 const char *str = va_arg(vargs, const char *);
1059 if (obj) {
1060 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1061 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1062 s += size;
1063 } else {
1064 appendstring(str);
1065 }
1066 break;
1067 }
1068 case 'S':
1069 case 'R':
1070 {
1071 Py_UNICODE *ucopy;
1072 Py_ssize_t usize;
1073 Py_ssize_t upos;
1074 /* unused, since we already have the result */
1075 (void) va_arg(vargs, PyObject *);
1076 ucopy = PyUnicode_AS_UNICODE(*callresult);
1077 usize = PyUnicode_GET_SIZE(*callresult);
1078 for (upos = 0; upos<usize;)
1079 *s++ = ucopy[upos++];
1080 /* We're done with the unicode()/repr() => forget it */
1081 Py_DECREF(*callresult);
1082 /* switch to next unicode()/repr() result */
1083 ++callresult;
1084 break;
1085 }
1086 case 'p':
1087 sprintf(buffer, "%p", va_arg(vargs, void*));
1088 /* %p is ill-defined: ensure leading 0x. */
1089 if (buffer[1] == 'X')
1090 buffer[1] = 'x';
1091 else if (buffer[1] != 'x') {
1092 memmove(buffer+2, buffer, strlen(buffer)+1);
1093 buffer[0] = '0';
1094 buffer[1] = 'x';
1095 }
1096 appendstring(buffer);
1097 break;
1098 case '%':
1099 *s++ = '%';
1100 break;
1101 default:
1102 appendstring(p);
1103 goto end;
1104 }
1105 } else
1106 *s++ = *f;
1107 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001108
Benjamin Peterson29060642009-01-31 22:14:21 +00001109 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001110 if (callresults)
1111 PyObject_Free(callresults);
1112 if (abuffer)
1113 PyObject_Free(abuffer);
1114 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1115 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001116 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001117 if (callresults) {
1118 PyObject **callresult2 = callresults;
1119 while (callresult2 < callresult) {
1120 Py_DECREF(*callresult2);
1121 ++callresult2;
1122 }
1123 PyObject_Free(callresults);
1124 }
1125 if (abuffer)
1126 PyObject_Free(abuffer);
1127 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001128}
1129
1130#undef appendstring
1131
1132PyObject *
1133PyUnicode_FromFormat(const char *format, ...)
1134{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001135 PyObject* ret;
1136 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001137
1138#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001139 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001140#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001141 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001142#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 ret = PyUnicode_FromFormatV(format, vargs);
1144 va_end(vargs);
1145 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146}
1147
Martin v. Löwis18e16552006-02-15 17:27:45 +00001148Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 wchar_t *w,
1150 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151{
1152 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001153 PyErr_BadInternalCall();
1154 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001155 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001156
1157 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001159 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001160
Daniel Stutzbach8515eae2010-08-24 21:57:33 +00001161#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 memcpy(w, unicode->str, size * sizeof(wchar_t));
1163#else
1164 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001165 register Py_UNICODE *u;
1166 register Py_ssize_t i;
1167 u = PyUnicode_AS_UNICODE(unicode);
1168 for (i = size; i > 0; i--)
1169 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 }
1171#endif
1172
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001173 if (size > PyUnicode_GET_SIZE(unicode))
1174 return PyUnicode_GET_SIZE(unicode);
1175 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001176 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177}
1178
1179#endif
1180
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001181PyObject *PyUnicode_FromOrdinal(int ordinal)
1182{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001183 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001184
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001185 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001186 PyErr_SetString(PyExc_ValueError,
1187 "chr() arg not in range(0x110000)");
1188 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001189 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001190
1191#ifndef Py_UNICODE_WIDE
1192 if (ordinal > 0xffff) {
1193 ordinal -= 0x10000;
1194 s[0] = 0xD800 | (ordinal >> 10);
1195 s[1] = 0xDC00 | (ordinal & 0x3FF);
1196 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001197 }
1198#endif
1199
Hye-Shik Chang40574832004-04-06 07:24:51 +00001200 s[0] = (Py_UNICODE)ordinal;
1201 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001202}
1203
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204PyObject *PyUnicode_FromObject(register PyObject *obj)
1205{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001206 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001207 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001208 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001209 Py_INCREF(obj);
1210 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001211 }
1212 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001213 /* For a Unicode subtype that's not a Unicode object,
1214 return a true Unicode object with the same data. */
1215 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1216 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001217 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001218 PyErr_Format(PyExc_TypeError,
1219 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001220 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001221 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001222}
1223
1224PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001225 const char *encoding,
1226 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001227{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001228 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001229 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001230
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 PyErr_BadInternalCall();
1233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001235
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001236 /* Decoding bytes objects is the most common case and should be fast */
1237 if (PyBytes_Check(obj)) {
1238 if (PyBytes_GET_SIZE(obj) == 0) {
1239 Py_INCREF(unicode_empty);
1240 v = (PyObject *) unicode_empty;
1241 }
1242 else {
1243 v = PyUnicode_Decode(
1244 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1245 encoding, errors);
1246 }
1247 return v;
1248 }
1249
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001250 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001251 PyErr_SetString(PyExc_TypeError,
1252 "decoding str is not supported");
1253 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001254 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001255
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001256 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1257 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1258 PyErr_Format(PyExc_TypeError,
1259 "coercing to str: need bytes, bytearray "
1260 "or buffer-like object, %.80s found",
1261 Py_TYPE(obj)->tp_name);
1262 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001263 }
Tim Petersced69f82003-09-16 20:30:58 +00001264
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001265 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001266 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001267 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 }
Tim Petersced69f82003-09-16 20:30:58 +00001269 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001270 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001271
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001272 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001273 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274}
1275
/* Copy `encoding` into `lower`, converting upper-case letters to lower
   case and every '_' to '-' (so that e.g. UTF_8 is recognised).  `lower`
   has room for `lower_len` bytes including the terminating NUL.

   Return 1 on success, 0 if `encoding` does not fit (it is longer than
   lower_len-1 characters); in the latter case `lower` is left without a
   terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* last slot, reserved for the terminating NUL */
    char *limit = lower + lower_len - 1;

    for (in = encoding; *in != '\0'; in++) {
        if (out == limit)
            return 0;

        if (ISUPPER(*in))
            *out = TOLOWER(*in);
        else if (*in == '_')
            *out = '-';
        else
            *out = *in;
        out++;
    }
    *out = '\0';
    return 1;
}
1308
1309PyObject *PyUnicode_Decode(const char *s,
1310 Py_ssize_t size,
1311 const char *encoding,
1312 const char *errors)
1313{
1314 PyObject *buffer = NULL, *unicode;
1315 Py_buffer info;
1316 char lower[11]; /* Enough for any encoding shortcut */
1317
1318 if (encoding == NULL)
1319 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001320
1321 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001322 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1323 if (strcmp(lower, "utf-8") == 0)
1324 return PyUnicode_DecodeUTF8(s, size, errors);
1325 else if ((strcmp(lower, "latin-1") == 0) ||
1326 (strcmp(lower, "iso-8859-1") == 0))
1327 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001328#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001329 else if (strcmp(lower, "mbcs") == 0)
1330 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001331#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001332 else if (strcmp(lower, "ascii") == 0)
1333 return PyUnicode_DecodeASCII(s, size, errors);
1334 else if (strcmp(lower, "utf-16") == 0)
1335 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1336 else if (strcmp(lower, "utf-32") == 0)
1337 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339
1340 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001341 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001342 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001343 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001344 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345 if (buffer == NULL)
1346 goto onError;
1347 unicode = PyCodec_Decode(buffer, encoding, errors);
1348 if (unicode == NULL)
1349 goto onError;
1350 if (!PyUnicode_Check(unicode)) {
1351 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001352 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001353 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 Py_DECREF(unicode);
1355 goto onError;
1356 }
1357 Py_DECREF(buffer);
1358 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001359
Benjamin Peterson29060642009-01-31 22:14:21 +00001360 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 Py_XDECREF(buffer);
1362 return NULL;
1363}
1364
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001365PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1366 const char *encoding,
1367 const char *errors)
1368{
1369 PyObject *v;
1370
1371 if (!PyUnicode_Check(unicode)) {
1372 PyErr_BadArgument();
1373 goto onError;
1374 }
1375
1376 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001377 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001378
1379 /* Decode via the codec registry */
1380 v = PyCodec_Decode(unicode, encoding, errors);
1381 if (v == NULL)
1382 goto onError;
1383 return v;
1384
Benjamin Peterson29060642009-01-31 22:14:21 +00001385 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001386 return NULL;
1387}
1388
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001389PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1390 const char *encoding,
1391 const char *errors)
1392{
1393 PyObject *v;
1394
1395 if (!PyUnicode_Check(unicode)) {
1396 PyErr_BadArgument();
1397 goto onError;
1398 }
1399
1400 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001401 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001402
1403 /* Decode via the codec registry */
1404 v = PyCodec_Decode(unicode, encoding, errors);
1405 if (v == NULL)
1406 goto onError;
1407 if (!PyUnicode_Check(v)) {
1408 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001409 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001410 Py_TYPE(v)->tp_name);
1411 Py_DECREF(v);
1412 goto onError;
1413 }
1414 return v;
1415
Benjamin Peterson29060642009-01-31 22:14:21 +00001416 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001417 return NULL;
1418}
1419
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 Py_ssize_t size,
1422 const char *encoding,
1423 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424{
1425 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001426
Guido van Rossumd57fd912000-03-10 22:53:23 +00001427 unicode = PyUnicode_FromUnicode(s, size);
1428 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1431 Py_DECREF(unicode);
1432 return v;
1433}
1434
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001435PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1436 const char *encoding,
1437 const char *errors)
1438{
1439 PyObject *v;
1440
1441 if (!PyUnicode_Check(unicode)) {
1442 PyErr_BadArgument();
1443 goto onError;
1444 }
1445
1446 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001447 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001448
1449 /* Encode via the codec registry */
1450 v = PyCodec_Encode(unicode, encoding, errors);
1451 if (v == NULL)
1452 goto onError;
1453 return v;
1454
Benjamin Peterson29060642009-01-31 22:14:21 +00001455 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001456 return NULL;
1457}
1458
Victor Stinnerae6265f2010-05-15 16:27:27 +00001459PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1460{
Victor Stinner313a1202010-06-11 23:56:51 +00001461 if (Py_FileSystemDefaultEncoding) {
1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1463 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1464 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1465 PyUnicode_GET_SIZE(unicode),
1466 NULL);
1467#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001468 return PyUnicode_AsEncodedString(unicode,
1469 Py_FileSystemDefaultEncoding,
1470 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001471 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001472 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Victor Stinner3119ed72010-08-18 22:26:50 +00001473 PyUnicode_GET_SIZE(unicode),
1474 "surrogateescape");
Victor Stinnerae6265f2010-05-15 16:27:27 +00001475}
1476
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1478 const char *encoding,
1479 const char *errors)
1480{
1481 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001482 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001483
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484 if (!PyUnicode_Check(unicode)) {
1485 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487 }
Fred Drakee4315f52000-05-09 19:53:39 +00001488
Tim Petersced69f82003-09-16 20:30:58 +00001489 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001490 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001491
1492 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001493 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1494 if (strcmp(lower, "utf-8") == 0)
1495 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1496 PyUnicode_GET_SIZE(unicode),
1497 errors);
1498 else if ((strcmp(lower, "latin-1") == 0) ||
1499 (strcmp(lower, "iso-8859-1") == 0))
1500 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1501 PyUnicode_GET_SIZE(unicode),
1502 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001503#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001504 else if (strcmp(lower, "mbcs") == 0)
1505 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1506 PyUnicode_GET_SIZE(unicode),
1507 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001508#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001509 else if (strcmp(lower, "ascii") == 0)
1510 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1511 PyUnicode_GET_SIZE(unicode),
1512 errors);
1513 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001514 /* During bootstrap, we may need to find the encodings
1515 package, to load the file system encoding, and require the
1516 file system encoding in order to load the encodings
1517 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001518
Victor Stinner59e62db2010-05-15 13:14:32 +00001519 Break out of this dependency by assuming that the path to
1520 the encodings module is ASCII-only. XXX could try wcstombs
1521 instead, if the file system encoding is the locale's
1522 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001523 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001524 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1525 !PyThreadState_GET()->interp->codecs_initialized)
1526 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1527 PyUnicode_GET_SIZE(unicode),
1528 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529
1530 /* Encode via the codec registry */
1531 v = PyCodec_Encode(unicode, encoding, errors);
1532 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001533 return NULL;
1534
1535 /* The normal path */
1536 if (PyBytes_Check(v))
1537 return v;
1538
1539 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001540 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001541 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001542 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001543
1544 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1545 "encoder %s returned bytearray instead of bytes",
1546 encoding);
1547 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001548 Py_DECREF(v);
1549 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001550 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001551
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001552 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1553 Py_DECREF(v);
1554 return b;
1555 }
1556
1557 PyErr_Format(PyExc_TypeError,
1558 "encoder did not return a bytes object (type=%.400s)",
1559 Py_TYPE(v)->tp_name);
1560 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001561 return NULL;
1562}
1563
1564PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1565 const char *encoding,
1566 const char *errors)
1567{
1568 PyObject *v;
1569
1570 if (!PyUnicode_Check(unicode)) {
1571 PyErr_BadArgument();
1572 goto onError;
1573 }
1574
1575 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001576 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001577
1578 /* Encode via the codec registry */
1579 v = PyCodec_Encode(unicode, encoding, errors);
1580 if (v == NULL)
1581 goto onError;
1582 if (!PyUnicode_Check(v)) {
1583 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001584 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001585 Py_TYPE(v)->tp_name);
1586 Py_DECREF(v);
1587 goto onError;
1588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001590
Benjamin Peterson29060642009-01-31 22:14:21 +00001591 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 return NULL;
1593}
1594
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001595PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001596 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001597{
1598 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001599 if (v)
1600 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001601 if (errors != NULL)
1602 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001603 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001604 PyUnicode_GET_SIZE(unicode),
1605 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001606 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001607 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001608 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001609 return v;
1610}
1611
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001612PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001613PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001614 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001615 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1616}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001617
Christian Heimes5894ba72007-11-04 11:43:14 +00001618PyObject*
1619PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1620{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001621 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1622 can be undefined. If it is case, decode using UTF-8. The following assumes
1623 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1624 bootstrapping process where the codecs aren't ready yet.
1625 */
1626 if (Py_FileSystemDefaultEncoding) {
1627#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001628 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001629 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001630 }
1631#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001632 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001633 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001634 }
1635#endif
1636 return PyUnicode_Decode(s, size,
1637 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001638 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001639 }
1640 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001641 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001642 }
1643}
1644
Martin v. Löwis011e8422009-05-05 04:43:17 +00001645
1646int
1647PyUnicode_FSConverter(PyObject* arg, void* addr)
1648{
1649 PyObject *output = NULL;
1650 Py_ssize_t size;
1651 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001652 if (arg == NULL) {
1653 Py_DECREF(*(PyObject**)addr);
1654 return 1;
1655 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001656 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001657 output = arg;
1658 Py_INCREF(output);
1659 }
1660 else {
1661 arg = PyUnicode_FromObject(arg);
1662 if (!arg)
1663 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001664 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001665 Py_DECREF(arg);
1666 if (!output)
1667 return 0;
1668 if (!PyBytes_Check(output)) {
1669 Py_DECREF(output);
1670 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1671 return 0;
1672 }
1673 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001674 size = PyBytes_GET_SIZE(output);
1675 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001676 if (size != strlen(data)) {
1677 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1678 Py_DECREF(output);
1679 return 0;
1680 }
1681 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001682 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001683}
1684
1685
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001686int
1687PyUnicode_FSDecoder(PyObject* arg, void* addr)
1688{
1689 PyObject *output = NULL;
1690 Py_ssize_t size;
1691 void *data;
1692 if (arg == NULL) {
1693 Py_DECREF(*(PyObject**)addr);
1694 return 1;
1695 }
1696 if (PyUnicode_Check(arg)) {
1697 output = arg;
1698 Py_INCREF(output);
1699 }
1700 else {
1701 arg = PyBytes_FromObject(arg);
1702 if (!arg)
1703 return 0;
1704 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1705 PyBytes_GET_SIZE(arg));
1706 Py_DECREF(arg);
1707 if (!output)
1708 return 0;
1709 if (!PyUnicode_Check(output)) {
1710 Py_DECREF(output);
1711 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1712 return 0;
1713 }
1714 }
1715 size = PyUnicode_GET_SIZE(output);
1716 data = PyUnicode_AS_UNICODE(output);
1717 if (size != Py_UNICODE_strlen(data)) {
1718 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1719 Py_DECREF(output);
1720 return 0;
1721 }
1722 *(PyObject**)addr = output;
1723 return Py_CLEANUP_SUPPORTED;
1724}
1725
1726
Martin v. Löwis5b222132007-06-10 09:51:05 +00001727char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001728_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001729{
Christian Heimesf3863112007-11-22 07:46:41 +00001730 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001731 if (!PyUnicode_Check(unicode)) {
1732 PyErr_BadArgument();
1733 return NULL;
1734 }
Christian Heimesf3863112007-11-22 07:46:41 +00001735 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1736 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001737 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001738 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001739 *psize = PyBytes_GET_SIZE(bytes);
1740 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001741}
1742
1743char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001744_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001745{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001746 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001747}
1748
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1750{
1751 if (!PyUnicode_Check(unicode)) {
1752 PyErr_BadArgument();
1753 goto onError;
1754 }
1755 return PyUnicode_AS_UNICODE(unicode);
1756
Benjamin Peterson29060642009-01-31 22:14:21 +00001757 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758 return NULL;
1759}
1760
Martin v. Löwis18e16552006-02-15 17:27:45 +00001761Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762{
1763 if (!PyUnicode_Check(unicode)) {
1764 PyErr_BadArgument();
1765 goto onError;
1766 }
1767 return PyUnicode_GET_SIZE(unicode);
1768
Benjamin Peterson29060642009-01-31 22:14:21 +00001769 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 return -1;
1771}
1772
/* Return the name of the default encoding; it is always "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1777
Victor Stinner554f3f02010-06-16 23:33:54 +00001778/* create or adjust a UnicodeDecodeError */
1779static void
1780make_decode_exception(PyObject **exceptionObject,
1781 const char *encoding,
1782 const char *input, Py_ssize_t length,
1783 Py_ssize_t startpos, Py_ssize_t endpos,
1784 const char *reason)
1785{
1786 if (*exceptionObject == NULL) {
1787 *exceptionObject = PyUnicodeDecodeError_Create(
1788 encoding, input, length, startpos, endpos, reason);
1789 }
1790 else {
1791 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1792 goto onError;
1793 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1794 goto onError;
1795 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1796 goto onError;
1797 }
1798 return;
1799
1800onError:
1801 Py_DECREF(*exceptionObject);
1802 *exceptionObject = NULL;
1803}
1804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001805/* error handling callback helper:
1806 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001807 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001808 and adjust various state variables.
1809 return 0 on success, -1 on error
1810*/
1811
1812static
1813int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001814 const char *encoding, const char *reason,
1815 const char **input, const char **inend, Py_ssize_t *startinpos,
1816 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1817 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001818{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001819 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001820
1821 PyObject *restuple = NULL;
1822 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001823 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001824 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001825 Py_ssize_t requiredsize;
1826 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001828 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001829 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 int res = -1;
1831
1832 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001833 *errorHandler = PyCodec_LookupError(errors);
1834 if (*errorHandler == NULL)
1835 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001836 }
1837
Victor Stinner554f3f02010-06-16 23:33:54 +00001838 make_decode_exception(exceptionObject,
1839 encoding,
1840 *input, *inend - *input,
1841 *startinpos, *endinpos,
1842 reason);
1843 if (*exceptionObject == NULL)
1844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001845
1846 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1847 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001848 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001849 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001850 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001851 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001852 }
1853 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001854 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001855
1856 /* Copy back the bytes variables, which might have been modified by the
1857 callback */
1858 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1859 if (!inputobj)
1860 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001861 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001862 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001863 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001864 *input = PyBytes_AS_STRING(inputobj);
1865 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001866 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001867 /* we can DECREF safely, as the exception has another reference,
1868 so the object won't go away. */
1869 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001871 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001872 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001873 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001874 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1875 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001876 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877
1878 /* need more space? (at least enough for what we
1879 have+the replacement+the rest of the string (starting
1880 at the new input position), so we won't have to check space
1881 when there are no errors in the rest of the string) */
1882 repptr = PyUnicode_AS_UNICODE(repunicode);
1883 repsize = PyUnicode_GET_SIZE(repunicode);
1884 requiredsize = *outpos + repsize + insize-newpos;
1885 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001886 if (requiredsize<2*outsize)
1887 requiredsize = 2*outsize;
1888 if (_PyUnicode_Resize(output, requiredsize) < 0)
1889 goto onError;
1890 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 }
1892 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001893 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 Py_UNICODE_COPY(*outptr, repptr, repsize);
1895 *outptr += repsize;
1896 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001897
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001898 /* we made it! */
1899 res = 0;
1900
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 Py_XDECREF(restuple);
1903 return res;
1904}
1905
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001906/* --- UTF-7 Codec -------------------------------------------------------- */
1907
Antoine Pitrou244651a2009-05-04 18:56:13 +00001908/* See RFC2152 for details. We encode conservatively and decode liberally. */
1909
/* Three simple macros defining base-64.  IS_BASE64 and FROM_BASE64 expand
   their argument several times, so pass only side-effect-free expressions. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62; any other
   input, including '/', yields 63, so check IS_BASE64 first) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   (n is masked with 0x3f, so higher bits are ignored) */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself. We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  The argument is expanded twice. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table has one entry per ASCII code (0-127) and is indexed directly
 * by the character value; each row below covers 16 consecutive codes.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1974
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        -- character value; only 1..127 can ever be direct (NUL and
 *               anything >= 128 are rejected before the table lookup)
 *   directO  -- non-zero: Set O characters (category 1) are direct
 *   directWS -- non-zero: whitespace characters (category 2) are direct
 *
 * Set D characters (category 0) are always direct.  All three arguments
 * are parenthesized in the expansion so that compound expressions such
 * as `a || b` keep their meaning; c is expanded several times and must
 * be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001986
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001987PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001988 Py_ssize_t size,
1989 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001990{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001991 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1992}
1993
Antoine Pitrou244651a2009-05-04 18:56:13 +00001994/* The decoder. The only state we preserve is our read position,
1995 * i.e. how many characters we have consumed. So if we end in the
1996 * middle of a shift sequence we have to back off the read position
1997 * and the output to the beginning of the sequence, otherwise we lose
1998 * all the shift state (seen bits, number of bits seen, high
1999 * surrogate). */
2000
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002001PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002002 Py_ssize_t size,
2003 const char *errors,
2004 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002005{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002007 Py_ssize_t startinpos;
2008 Py_ssize_t endinpos;
2009 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002010 const char *e;
2011 PyUnicodeObject *unicode;
2012 Py_UNICODE *p;
2013 const char *errmsg = "";
2014 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002015 Py_UNICODE *shiftOutStart;
2016 unsigned int base64bits = 0;
2017 unsigned long base64buffer = 0;
2018 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 PyObject *errorHandler = NULL;
2020 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002021
2022 unicode = _PyUnicode_New(size);
2023 if (!unicode)
2024 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002025 if (size == 0) {
2026 if (consumed)
2027 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002028 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002029 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002030
2031 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002032 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002033 e = s + size;
2034
2035 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002036 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002037 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002038 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002039
Antoine Pitrou244651a2009-05-04 18:56:13 +00002040 if (inShift) { /* in a base-64 section */
2041 if (IS_BASE64(ch)) { /* consume a base-64 character */
2042 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2043 base64bits += 6;
2044 s++;
2045 if (base64bits >= 16) {
2046 /* we have enough bits for a UTF-16 value */
2047 Py_UNICODE outCh = (Py_UNICODE)
2048 (base64buffer >> (base64bits-16));
2049 base64bits -= 16;
2050 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2051 if (surrogate) {
2052 /* expecting a second surrogate */
2053 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2054#ifdef Py_UNICODE_WIDE
2055 *p++ = (((surrogate & 0x3FF)<<10)
2056 | (outCh & 0x3FF)) + 0x10000;
2057#else
2058 *p++ = surrogate;
2059 *p++ = outCh;
2060#endif
2061 surrogate = 0;
2062 }
2063 else {
2064 surrogate = 0;
2065 errmsg = "second surrogate missing";
2066 goto utf7Error;
2067 }
2068 }
2069 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2070 /* first surrogate */
2071 surrogate = outCh;
2072 }
2073 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2074 errmsg = "unexpected second surrogate";
2075 goto utf7Error;
2076 }
2077 else {
2078 *p++ = outCh;
2079 }
2080 }
2081 }
2082 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002083 inShift = 0;
2084 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002085 if (surrogate) {
2086 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002087 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002088 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089 if (base64bits > 0) { /* left-over bits */
2090 if (base64bits >= 6) {
2091 /* We've seen at least one base-64 character */
2092 errmsg = "partial character in shift sequence";
2093 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002094 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002095 else {
2096 /* Some bits remain; they should be zero */
2097 if (base64buffer != 0) {
2098 errmsg = "non-zero padding bits in shift sequence";
2099 goto utf7Error;
2100 }
2101 }
2102 }
2103 if (ch != '-') {
2104 /* '-' is absorbed; other terminating
2105 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002106 *p++ = ch;
2107 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002108 }
2109 }
2110 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002112 s++; /* consume '+' */
2113 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002114 s++;
2115 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002116 }
2117 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002118 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002119 shiftOutStart = p;
2120 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002121 }
2122 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002123 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002124 *p++ = ch;
2125 s++;
2126 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002127 else {
2128 startinpos = s-starts;
2129 s++;
2130 errmsg = "unexpected special character";
2131 goto utf7Error;
2132 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002134utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002135 outpos = p-PyUnicode_AS_UNICODE(unicode);
2136 endinpos = s-starts;
2137 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002138 errors, &errorHandler,
2139 "utf7", errmsg,
2140 &starts, &e, &startinpos, &endinpos, &exc, &s,
2141 &unicode, &outpos, &p))
2142 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002143 }
2144
Antoine Pitrou244651a2009-05-04 18:56:13 +00002145 /* end of string */
2146
2147 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2148 /* if we're in an inconsistent state, that's an error */
2149 if (surrogate ||
2150 (base64bits >= 6) ||
2151 (base64bits > 0 && base64buffer != 0)) {
2152 outpos = p-PyUnicode_AS_UNICODE(unicode);
2153 endinpos = size;
2154 if (unicode_decode_call_errorhandler(
2155 errors, &errorHandler,
2156 "utf7", "unterminated shift sequence",
2157 &starts, &e, &startinpos, &endinpos, &exc, &s,
2158 &unicode, &outpos, &p))
2159 goto onError;
2160 if (s < e)
2161 goto restart;
2162 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002164
2165 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002166 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002167 if (inShift) {
2168 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002169 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002170 }
2171 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002172 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002173 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002174 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002175
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002176 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002177 goto onError;
2178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002179 Py_XDECREF(errorHandler);
2180 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002181 return (PyObject *)unicode;
2182
Benjamin Peterson29060642009-01-31 22:14:21 +00002183 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002184 Py_XDECREF(errorHandler);
2185 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002186 Py_DECREF(unicode);
2187 return NULL;
2188}
2189
2190
2191PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002192 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002193 int base64SetO,
2194 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002195 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002196{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002197 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002198 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002199 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002200 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002201 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002202 unsigned int base64bits = 0;
2203 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002204 char * out;
2205 char * start;
2206
2207 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002208 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002209
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002210 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002211 return PyErr_NoMemory();
2212
Antoine Pitrou244651a2009-05-04 18:56:13 +00002213 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002214 if (v == NULL)
2215 return NULL;
2216
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002217 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002218 for (;i < size; ++i) {
2219 Py_UNICODE ch = s[i];
2220
Antoine Pitrou244651a2009-05-04 18:56:13 +00002221 if (inShift) {
2222 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2223 /* shifting out */
2224 if (base64bits) { /* output remaining bits */
2225 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2226 base64buffer = 0;
2227 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002228 }
2229 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002230 /* Characters not in the BASE64 set implicitly unshift the sequence
2231 so no '-' is required, except if the character is itself a '-' */
2232 if (IS_BASE64(ch) || ch == '-') {
2233 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002234 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002235 *out++ = (char) ch;
2236 }
2237 else {
2238 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002239 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002240 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002241 else { /* not in a shift sequence */
2242 if (ch == '+') {
2243 *out++ = '+';
2244 *out++ = '-';
2245 }
2246 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2247 *out++ = (char) ch;
2248 }
2249 else {
2250 *out++ = '+';
2251 inShift = 1;
2252 goto encode_char;
2253 }
2254 }
2255 continue;
2256encode_char:
2257#ifdef Py_UNICODE_WIDE
2258 if (ch >= 0x10000) {
2259 /* code first surrogate */
2260 base64bits += 16;
2261 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2262 while (base64bits >= 6) {
2263 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2264 base64bits -= 6;
2265 }
2266 /* prepare second surrogate */
2267 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2268 }
2269#endif
2270 base64bits += 16;
2271 base64buffer = (base64buffer << 16) | ch;
2272 while (base64bits >= 6) {
2273 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2274 base64bits -= 6;
2275 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002276 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002277 if (base64bits)
2278 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2279 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002280 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002281 if (_PyBytes_Resize(&v, out - start) < 0)
2282 return NULL;
2283 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002284}
2285
Antoine Pitrou244651a2009-05-04 18:56:13 +00002286#undef IS_BASE64
2287#undef FROM_BASE64
2288#undef TO_BASE64
2289#undef DECODE_DIRECT
2290#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002291
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292/* --- UTF-8 Codec -------------------------------------------------------- */
2293
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry
   is the total number of bytes in that sequence.  An entry of zero means
   the byte is an illegal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 70-7F */

    /* 80-BF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* B0-BF */

    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* D0-DF */

    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,   /* E0-EF */

    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0    /* F0-FF */
};
2315
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002317 Py_ssize_t size,
2318 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319{
Walter Dörwald69652032004-09-07 20:24:22 +00002320 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2321}
2322
Antoine Pitrouab868312009-01-10 15:40:25 +00002323/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2324#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2325
2326/* Mask to quickly check whether a C 'long' contains a
2327 non-ASCII, UTF8-encoded char. */
2328#if (SIZEOF_LONG == 8)
2329# define ASCII_CHAR_MASK 0x8080808080808080L
2330#elif (SIZEOF_LONG == 4)
2331# define ASCII_CHAR_MASK 0x80808080L
2332#else
2333# error C 'long' size should be either 4 or 8!
2334#endif
2335
Walter Dörwald69652032004-09-07 20:24:22 +00002336PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002337 Py_ssize_t size,
2338 const char *errors,
2339 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002340{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002341 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002343 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002344 Py_ssize_t startinpos;
2345 Py_ssize_t endinpos;
2346 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002347 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002348 PyUnicodeObject *unicode;
2349 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002350 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002351 PyObject *errorHandler = NULL;
2352 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353
2354 /* Note: size will always be longer than the resulting Unicode
2355 character count */
2356 unicode = _PyUnicode_New(size);
2357 if (!unicode)
2358 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002359 if (size == 0) {
2360 if (consumed)
2361 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002363 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002364
2365 /* Unpack UTF-8 encoded data */
2366 p = unicode->str;
2367 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002368 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369
2370 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002371 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372
2373 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002374 /* Fast path for runs of ASCII characters. Given that common UTF-8
2375 input will consist of an overwhelming majority of ASCII
2376 characters, we try to optimize for this case by checking
2377 as many characters as a C 'long' can contain.
2378 First, check if we can do an aligned read, as most CPUs have
2379 a penalty for unaligned reads.
2380 */
2381 if (!((size_t) s & LONG_PTR_MASK)) {
2382 /* Help register allocation */
2383 register const char *_s = s;
2384 register Py_UNICODE *_p = p;
2385 while (_s < aligned_end) {
2386 /* Read a whole long at a time (either 4 or 8 bytes),
2387 and do a fast unrolled copy if it only contains ASCII
2388 characters. */
2389 unsigned long data = *(unsigned long *) _s;
2390 if (data & ASCII_CHAR_MASK)
2391 break;
2392 _p[0] = (unsigned char) _s[0];
2393 _p[1] = (unsigned char) _s[1];
2394 _p[2] = (unsigned char) _s[2];
2395 _p[3] = (unsigned char) _s[3];
2396#if (SIZEOF_LONG == 8)
2397 _p[4] = (unsigned char) _s[4];
2398 _p[5] = (unsigned char) _s[5];
2399 _p[6] = (unsigned char) _s[6];
2400 _p[7] = (unsigned char) _s[7];
2401#endif
2402 _s += SIZEOF_LONG;
2403 _p += SIZEOF_LONG;
2404 }
2405 s = _s;
2406 p = _p;
2407 if (s == e)
2408 break;
2409 ch = (unsigned char)*s;
2410 }
2411 }
2412
2413 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002414 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 s++;
2416 continue;
2417 }
2418
2419 n = utf8_code_length[ch];
2420
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002421 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002422 if (consumed)
2423 break;
2424 else {
2425 errmsg = "unexpected end of data";
2426 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002427 endinpos = startinpos+1;
2428 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2429 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002430 goto utf8Error;
2431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002433
2434 switch (n) {
2435
2436 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002437 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002438 startinpos = s-starts;
2439 endinpos = startinpos+1;
2440 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441
2442 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002443 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002444 startinpos = s-starts;
2445 endinpos = startinpos+1;
2446 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447
2448 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002449 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002450 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002451 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002452 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002453 goto utf8Error;
2454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002456 assert ((ch > 0x007F) && (ch <= 0x07FF));
2457 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458 break;
2459
2460 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002461 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2462 will result in surrogates in range d800-dfff. Surrogates are
2463 not valid UTF-8 so they are rejected.
2464 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2465 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002466 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002467 (s[2] & 0xc0) != 0x80 ||
2468 ((unsigned char)s[0] == 0xE0 &&
2469 (unsigned char)s[1] < 0xA0) ||
2470 ((unsigned char)s[0] == 0xED &&
2471 (unsigned char)s[1] > 0x9F)) {
2472 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002473 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002474 endinpos = startinpos + 1;
2475
2476 /* if s[1] first two bits are 1 and 0, then the invalid
2477 continuation byte is s[2], so increment endinpos by 1,
2478 if not, s[1] is invalid and endinpos doesn't need to
2479 be incremented. */
2480 if ((s[1] & 0xC0) == 0x80)
2481 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002482 goto utf8Error;
2483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002485 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2486 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002487 break;
2488
2489 case 4:
2490 if ((s[1] & 0xc0) != 0x80 ||
2491 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002492 (s[3] & 0xc0) != 0x80 ||
2493 ((unsigned char)s[0] == 0xF0 &&
2494 (unsigned char)s[1] < 0x90) ||
2495 ((unsigned char)s[0] == 0xF4 &&
2496 (unsigned char)s[1] > 0x8F)) {
2497 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002498 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002499 endinpos = startinpos + 1;
2500 if ((s[1] & 0xC0) == 0x80) {
2501 endinpos++;
2502 if ((s[2] & 0xC0) == 0x80)
2503 endinpos++;
2504 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002505 goto utf8Error;
2506 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002507 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002508 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2509 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2510
Fredrik Lundh8f455852001-06-27 18:59:43 +00002511#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002512 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002513#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002514 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002515
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002516 /* translate from 10000..10FFFF to 0..FFFF */
2517 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002518
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002519 /* high surrogate = top 10 bits added to D800 */
2520 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002521
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002522 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002523 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002524#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 }
2527 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002528 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002529
Benjamin Peterson29060642009-01-31 22:14:21 +00002530 utf8Error:
2531 outpos = p-PyUnicode_AS_UNICODE(unicode);
2532 if (unicode_decode_call_errorhandler(
2533 errors, &errorHandler,
2534 "utf8", errmsg,
2535 &starts, &e, &startinpos, &endinpos, &exc, &s,
2536 &unicode, &outpos, &p))
2537 goto onError;
2538 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 }
Walter Dörwald69652032004-09-07 20:24:22 +00002540 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002541 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542
2543 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002544 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 goto onError;
2546
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002547 Py_XDECREF(errorHandler);
2548 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 return (PyObject *)unicode;
2550
Benjamin Peterson29060642009-01-31 22:14:21 +00002551 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002552 Py_XDECREF(errorHandler);
2553 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002554 Py_DECREF(unicode);
2555 return NULL;
2556}
2557
Antoine Pitrouab868312009-01-10 15:40:25 +00002558#undef ASCII_CHAR_MASK
2559
2560
Tim Peters602f7402002-04-27 18:03:26 +00002561/* Allocation strategy: if the string is short, convert into a stack buffer
2562 and allocate exactly as much space needed at the end. Else allocate the
2563 maximum possible needed (4 result bytes per Unicode character), and return
2564 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002565*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002566PyObject *
2567PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002568 Py_ssize_t size,
2569 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570{
Tim Peters602f7402002-04-27 18:03:26 +00002571#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002572
Guido van Rossum98297ee2007-11-06 21:34:58 +00002573 Py_ssize_t i; /* index into s of next input byte */
2574 PyObject *result; /* result string object */
2575 char *p; /* next free byte in output buffer */
2576 Py_ssize_t nallocated; /* number of result bytes allocated */
2577 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002578 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002579 PyObject *errorHandler = NULL;
2580 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002581
Tim Peters602f7402002-04-27 18:03:26 +00002582 assert(s != NULL);
2583 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584
Tim Peters602f7402002-04-27 18:03:26 +00002585 if (size <= MAX_SHORT_UNICHARS) {
2586 /* Write into the stack buffer; nallocated can't overflow.
2587 * At the end, we'll allocate exactly as much heap space as it
2588 * turns out we need.
2589 */
2590 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002591 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002592 p = stackbuf;
2593 }
2594 else {
2595 /* Overallocate on the heap, and give the excess back at the end. */
2596 nallocated = size * 4;
2597 if (nallocated / 4 != size) /* overflow! */
2598 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002599 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002600 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002601 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002602 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002603 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002604
Tim Peters602f7402002-04-27 18:03:26 +00002605 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002606 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002607
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002608 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002609 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002611
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002613 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002614 *p++ = (char)(0xc0 | (ch >> 6));
2615 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002616 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002617#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002618 /* Special case: check for high and low surrogate */
2619 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2620 Py_UCS4 ch2 = s[i];
2621 /* Combine the two surrogates to form a UCS4 value */
2622 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2623 i++;
2624
2625 /* Encode UCS4 Unicode ordinals */
2626 *p++ = (char)(0xf0 | (ch >> 18));
2627 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002628 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2629 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002630 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002631#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002632 Py_ssize_t newpos;
2633 PyObject *rep;
2634 Py_ssize_t repsize, k;
2635 rep = unicode_encode_call_errorhandler
2636 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2637 s, size, &exc, i-1, i, &newpos);
2638 if (!rep)
2639 goto error;
2640
2641 if (PyBytes_Check(rep))
2642 repsize = PyBytes_GET_SIZE(rep);
2643 else
2644 repsize = PyUnicode_GET_SIZE(rep);
2645
2646 if (repsize > 4) {
2647 Py_ssize_t offset;
2648
2649 if (result == NULL)
2650 offset = p - stackbuf;
2651 else
2652 offset = p - PyBytes_AS_STRING(result);
2653
2654 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2655 /* integer overflow */
2656 PyErr_NoMemory();
2657 goto error;
2658 }
2659 nallocated += repsize - 4;
2660 if (result != NULL) {
2661 if (_PyBytes_Resize(&result, nallocated) < 0)
2662 goto error;
2663 } else {
2664 result = PyBytes_FromStringAndSize(NULL, nallocated);
2665 if (result == NULL)
2666 goto error;
2667 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2668 }
2669 p = PyBytes_AS_STRING(result) + offset;
2670 }
2671
2672 if (PyBytes_Check(rep)) {
2673 char *prep = PyBytes_AS_STRING(rep);
2674 for(k = repsize; k > 0; k--)
2675 *p++ = *prep++;
2676 } else /* rep is unicode */ {
2677 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2678 Py_UNICODE c;
2679
2680 for(k=0; k<repsize; k++) {
2681 c = prep[k];
2682 if (0x80 <= c) {
2683 raise_encode_exception(&exc, "utf-8", s, size,
2684 i-1, i, "surrogates not allowed");
2685 goto error;
2686 }
2687 *p++ = (char)prep[k];
2688 }
2689 }
2690 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002691#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002692 }
Victor Stinner445a6232010-04-22 20:01:57 +00002693#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002694 } else if (ch < 0x10000) {
2695 *p++ = (char)(0xe0 | (ch >> 12));
2696 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2697 *p++ = (char)(0x80 | (ch & 0x3f));
2698 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002699 /* Encode UCS4 Unicode ordinals */
2700 *p++ = (char)(0xf0 | (ch >> 18));
2701 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2702 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2703 *p++ = (char)(0x80 | (ch & 0x3f));
2704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002706
Guido van Rossum98297ee2007-11-06 21:34:58 +00002707 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002708 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002709 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002710 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002711 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002712 }
2713 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002714 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002715 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002716 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002717 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002718 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002719 Py_XDECREF(errorHandler);
2720 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002721 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002722 error:
2723 Py_XDECREF(errorHandler);
2724 Py_XDECREF(exc);
2725 Py_XDECREF(result);
2726 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002727
Tim Peters602f7402002-04-27 18:03:26 +00002728#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729}
2730
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2732{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 if (!PyUnicode_Check(unicode)) {
2734 PyErr_BadArgument();
2735 return NULL;
2736 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002737 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 PyUnicode_GET_SIZE(unicode),
2739 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740}
2741
Walter Dörwald41980ca2007-08-16 21:55:45 +00002742/* --- UTF-32 Codec ------------------------------------------------------- */
2743
2744PyObject *
2745PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002746 Py_ssize_t size,
2747 const char *errors,
2748 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002749{
2750 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2751}
2752
2753PyObject *
2754PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002755 Py_ssize_t size,
2756 const char *errors,
2757 int *byteorder,
2758 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002759{
2760 const char *starts = s;
2761 Py_ssize_t startinpos;
2762 Py_ssize_t endinpos;
2763 Py_ssize_t outpos;
2764 PyUnicodeObject *unicode;
2765 Py_UNICODE *p;
2766#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002767 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002768 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002769#else
2770 const int pairs = 0;
2771#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002772 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002773 int bo = 0; /* assume native ordering by default */
2774 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002775 /* Offsets from q for retrieving bytes in the right order. */
2776#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2777 int iorder[] = {0, 1, 2, 3};
2778#else
2779 int iorder[] = {3, 2, 1, 0};
2780#endif
2781 PyObject *errorHandler = NULL;
2782 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002783
Walter Dörwald41980ca2007-08-16 21:55:45 +00002784 q = (unsigned char *)s;
2785 e = q + size;
2786
2787 if (byteorder)
2788 bo = *byteorder;
2789
2790 /* Check for BOM marks (U+FEFF) in the input and adjust current
2791 byte order setting accordingly. In native mode, the leading BOM
2792 mark is skipped, in all other modes, it is copied to the output
2793 stream as-is (giving a ZWNBSP character). */
2794 if (bo == 0) {
2795 if (size >= 4) {
2796 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002798#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 if (bom == 0x0000FEFF) {
2800 q += 4;
2801 bo = -1;
2802 }
2803 else if (bom == 0xFFFE0000) {
2804 q += 4;
2805 bo = 1;
2806 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002807#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002808 if (bom == 0x0000FEFF) {
2809 q += 4;
2810 bo = 1;
2811 }
2812 else if (bom == 0xFFFE0000) {
2813 q += 4;
2814 bo = -1;
2815 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002816#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002817 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002818 }
2819
2820 if (bo == -1) {
2821 /* force LE */
2822 iorder[0] = 0;
2823 iorder[1] = 1;
2824 iorder[2] = 2;
2825 iorder[3] = 3;
2826 }
2827 else if (bo == 1) {
2828 /* force BE */
2829 iorder[0] = 3;
2830 iorder[1] = 2;
2831 iorder[2] = 1;
2832 iorder[3] = 0;
2833 }
2834
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002835 /* On narrow builds we split characters outside the BMP into two
2836 codepoints => count how much extra space we need. */
2837#ifndef Py_UNICODE_WIDE
2838 for (qq = q; qq < e; qq += 4)
2839 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2840 pairs++;
2841#endif
2842
2843 /* This might be one to much, because of a BOM */
2844 unicode = _PyUnicode_New((size+3)/4+pairs);
2845 if (!unicode)
2846 return NULL;
2847 if (size == 0)
2848 return (PyObject *)unicode;
2849
2850 /* Unpack UTF-32 encoded data */
2851 p = unicode->str;
2852
Walter Dörwald41980ca2007-08-16 21:55:45 +00002853 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 Py_UCS4 ch;
2855 /* remaining bytes at the end? (size should be divisible by 4) */
2856 if (e-q<4) {
2857 if (consumed)
2858 break;
2859 errmsg = "truncated data";
2860 startinpos = ((const char *)q)-starts;
2861 endinpos = ((const char *)e)-starts;
2862 goto utf32Error;
2863 /* The remaining input chars are ignored if the callback
2864 chooses to skip the input */
2865 }
2866 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2867 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002868
Benjamin Peterson29060642009-01-31 22:14:21 +00002869 if (ch >= 0x110000)
2870 {
2871 errmsg = "codepoint not in range(0x110000)";
2872 startinpos = ((const char *)q)-starts;
2873 endinpos = startinpos+4;
2874 goto utf32Error;
2875 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002876#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002877 if (ch >= 0x10000)
2878 {
2879 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2880 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2881 }
2882 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002883#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 *p++ = ch;
2885 q += 4;
2886 continue;
2887 utf32Error:
2888 outpos = p-PyUnicode_AS_UNICODE(unicode);
2889 if (unicode_decode_call_errorhandler(
2890 errors, &errorHandler,
2891 "utf32", errmsg,
2892 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2893 &unicode, &outpos, &p))
2894 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002895 }
2896
2897 if (byteorder)
2898 *byteorder = bo;
2899
2900 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002902
2903 /* Adjust length */
2904 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2905 goto onError;
2906
2907 Py_XDECREF(errorHandler);
2908 Py_XDECREF(exc);
2909 return (PyObject *)unicode;
2910
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002912 Py_DECREF(unicode);
2913 Py_XDECREF(errorHandler);
2914 Py_XDECREF(exc);
2915 return NULL;
2916}
2917
2918PyObject *
2919PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 Py_ssize_t size,
2921 const char *errors,
2922 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002923{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002924 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002925 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002926 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002927#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002928 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002929#else
2930 const int pairs = 0;
2931#endif
2932 /* Offsets from p for storing byte pairs in the right order. */
2933#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2934 int iorder[] = {0, 1, 2, 3};
2935#else
2936 int iorder[] = {3, 2, 1, 0};
2937#endif
2938
Benjamin Peterson29060642009-01-31 22:14:21 +00002939#define STORECHAR(CH) \
2940 do { \
2941 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2942 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2943 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2944 p[iorder[0]] = (CH) & 0xff; \
2945 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002946 } while(0)
2947
2948 /* In narrow builds we can output surrogate pairs as one codepoint,
2949 so we need less space. */
2950#ifndef Py_UNICODE_WIDE
2951 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002952 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2953 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2954 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002955#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002956 nsize = (size - pairs + (byteorder == 0));
2957 bytesize = nsize * 4;
2958 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002959 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002960 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002961 if (v == NULL)
2962 return NULL;
2963
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002964 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002965 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002966 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002967 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002968 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002969
2970 if (byteorder == -1) {
2971 /* force LE */
2972 iorder[0] = 0;
2973 iorder[1] = 1;
2974 iorder[2] = 2;
2975 iorder[3] = 3;
2976 }
2977 else if (byteorder == 1) {
2978 /* force BE */
2979 iorder[0] = 3;
2980 iorder[1] = 2;
2981 iorder[2] = 1;
2982 iorder[3] = 0;
2983 }
2984
2985 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002986 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002987#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002988 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2989 Py_UCS4 ch2 = *s;
2990 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2991 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2992 s++;
2993 size--;
2994 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002995 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002996#endif
2997 STORECHAR(ch);
2998 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002999
3000 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003001 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003002#undef STORECHAR
3003}
3004
3005PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3006{
3007 if (!PyUnicode_Check(unicode)) {
3008 PyErr_BadArgument();
3009 return NULL;
3010 }
3011 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003012 PyUnicode_GET_SIZE(unicode),
3013 NULL,
3014 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003015}
3016
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017/* --- UTF-16 Codec ------------------------------------------------------- */
3018
Tim Peters772747b2001-08-09 22:21:55 +00003019PyObject *
3020PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 Py_ssize_t size,
3022 const char *errors,
3023 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024{
Walter Dörwald69652032004-09-07 20:24:22 +00003025 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3026}
3027
Antoine Pitrouab868312009-01-10 15:40:25 +00003028/* Two masks for fast checking of whether a C 'long' may contain
3029 UTF16-encoded surrogate characters. This is an efficient heuristic,
3030 assuming that non-surrogate characters with a code point >= 0x8000 are
3031 rare in most input.
3032 FAST_CHAR_MASK is used when the input is in native byte ordering,
3033 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003034*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003035#if (SIZEOF_LONG == 8)
3036# define FAST_CHAR_MASK 0x8000800080008000L
3037# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3038#elif (SIZEOF_LONG == 4)
3039# define FAST_CHAR_MASK 0x80008000L
3040# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3041#else
3042# error C 'long' size should be either 4 or 8!
3043#endif
3044
Walter Dörwald69652032004-09-07 20:24:22 +00003045PyObject *
3046PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 Py_ssize_t size,
3048 const char *errors,
3049 int *byteorder,
3050 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003051{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003052 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003053 Py_ssize_t startinpos;
3054 Py_ssize_t endinpos;
3055 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 PyUnicodeObject *unicode;
3057 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003058 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003059 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003060 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003061 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003062 /* Offsets from q for retrieving byte pairs in the right order. */
3063#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3064 int ihi = 1, ilo = 0;
3065#else
3066 int ihi = 0, ilo = 1;
3067#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003068 PyObject *errorHandler = NULL;
3069 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003070
3071 /* Note: size will always be longer than the resulting Unicode
3072 character count */
3073 unicode = _PyUnicode_New(size);
3074 if (!unicode)
3075 return NULL;
3076 if (size == 0)
3077 return (PyObject *)unicode;
3078
3079 /* Unpack UTF-16 encoded data */
3080 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003081 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003082 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083
3084 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003085 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003087 /* Check for BOM marks (U+FEFF) in the input and adjust current
3088 byte order setting accordingly. In native mode, the leading BOM
3089 mark is skipped, in all other modes, it is copied to the output
3090 stream as-is (giving a ZWNBSP character). */
3091 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003092 if (size >= 2) {
3093 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003094#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003095 if (bom == 0xFEFF) {
3096 q += 2;
3097 bo = -1;
3098 }
3099 else if (bom == 0xFFFE) {
3100 q += 2;
3101 bo = 1;
3102 }
Tim Petersced69f82003-09-16 20:30:58 +00003103#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003104 if (bom == 0xFEFF) {
3105 q += 2;
3106 bo = 1;
3107 }
3108 else if (bom == 0xFFFE) {
3109 q += 2;
3110 bo = -1;
3111 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003112#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003113 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003114 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115
Tim Peters772747b2001-08-09 22:21:55 +00003116 if (bo == -1) {
3117 /* force LE */
3118 ihi = 1;
3119 ilo = 0;
3120 }
3121 else if (bo == 1) {
3122 /* force BE */
3123 ihi = 0;
3124 ilo = 1;
3125 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003126#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3127 native_ordering = ilo < ihi;
3128#else
3129 native_ordering = ilo > ihi;
3130#endif
Tim Peters772747b2001-08-09 22:21:55 +00003131
Antoine Pitrouab868312009-01-10 15:40:25 +00003132 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003133 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003134 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003135 /* First check for possible aligned read of a C 'long'. Unaligned
3136 reads are more expensive, better to defer to another iteration. */
3137 if (!((size_t) q & LONG_PTR_MASK)) {
3138 /* Fast path for runs of non-surrogate chars. */
3139 register const unsigned char *_q = q;
3140 Py_UNICODE *_p = p;
3141 if (native_ordering) {
3142 /* Native ordering is simple: as long as the input cannot
3143 possibly contain a surrogate char, do an unrolled copy
3144 of several 16-bit code points to the target object.
3145 The non-surrogate check is done on several input bytes
3146 at a time (as many as a C 'long' can contain). */
3147 while (_q < aligned_end) {
3148 unsigned long data = * (unsigned long *) _q;
3149 if (data & FAST_CHAR_MASK)
3150 break;
3151 _p[0] = ((unsigned short *) _q)[0];
3152 _p[1] = ((unsigned short *) _q)[1];
3153#if (SIZEOF_LONG == 8)
3154 _p[2] = ((unsigned short *) _q)[2];
3155 _p[3] = ((unsigned short *) _q)[3];
3156#endif
3157 _q += SIZEOF_LONG;
3158 _p += SIZEOF_LONG / 2;
3159 }
3160 }
3161 else {
3162 /* Byteswapped ordering is similar, but we must decompose
3163 the copy bytewise, and take care of zero'ing out the
3164 upper bytes if the target object is in 32-bit units
3165 (that is, in UCS-4 builds). */
3166 while (_q < aligned_end) {
3167 unsigned long data = * (unsigned long *) _q;
3168 if (data & SWAPPED_FAST_CHAR_MASK)
3169 break;
3170 /* Zero upper bytes in UCS-4 builds */
3171#if (Py_UNICODE_SIZE > 2)
3172 _p[0] = 0;
3173 _p[1] = 0;
3174#if (SIZEOF_LONG == 8)
3175 _p[2] = 0;
3176 _p[3] = 0;
3177#endif
3178#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003179 /* Issue #4916; UCS-4 builds on big endian machines must
3180 fill the two last bytes of each 4-byte unit. */
3181#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3182# define OFF 2
3183#else
3184# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003185#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003186 ((unsigned char *) _p)[OFF + 1] = _q[0];
3187 ((unsigned char *) _p)[OFF + 0] = _q[1];
3188 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3189 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3190#if (SIZEOF_LONG == 8)
3191 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3192 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3193 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3194 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3195#endif
3196#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003197 _q += SIZEOF_LONG;
3198 _p += SIZEOF_LONG / 2;
3199 }
3200 }
3201 p = _p;
3202 q = _q;
3203 if (q >= e)
3204 break;
3205 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003207
Benjamin Peterson14339b62009-01-31 16:36:08 +00003208 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003209
3210 if (ch < 0xD800 || ch > 0xDFFF) {
3211 *p++ = ch;
3212 continue;
3213 }
3214
3215 /* UTF-16 code pair: */
3216 if (q > e) {
3217 errmsg = "unexpected end of data";
3218 startinpos = (((const char *)q) - 2) - starts;
3219 endinpos = ((const char *)e) + 1 - starts;
3220 goto utf16Error;
3221 }
3222 if (0xD800 <= ch && ch <= 0xDBFF) {
3223 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3224 q += 2;
3225 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003226#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 *p++ = ch;
3228 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003229#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003231#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003232 continue;
3233 }
3234 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003235 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 startinpos = (((const char *)q)-4)-starts;
3237 endinpos = startinpos+2;
3238 goto utf16Error;
3239 }
3240
Benjamin Peterson14339b62009-01-31 16:36:08 +00003241 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 errmsg = "illegal encoding";
3243 startinpos = (((const char *)q)-2)-starts;
3244 endinpos = startinpos+2;
3245 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003246
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 utf16Error:
3248 outpos = p - PyUnicode_AS_UNICODE(unicode);
3249 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003250 errors,
3251 &errorHandler,
3252 "utf16", errmsg,
3253 &starts,
3254 (const char **)&e,
3255 &startinpos,
3256 &endinpos,
3257 &exc,
3258 (const char **)&q,
3259 &unicode,
3260 &outpos,
3261 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003264 /* remaining byte at the end? (size should be even) */
3265 if (e == q) {
3266 if (!consumed) {
3267 errmsg = "truncated data";
3268 startinpos = ((const char *)q) - starts;
3269 endinpos = ((const char *)e) + 1 - starts;
3270 outpos = p - PyUnicode_AS_UNICODE(unicode);
3271 if (unicode_decode_call_errorhandler(
3272 errors,
3273 &errorHandler,
3274 "utf16", errmsg,
3275 &starts,
3276 (const char **)&e,
3277 &startinpos,
3278 &endinpos,
3279 &exc,
3280 (const char **)&q,
3281 &unicode,
3282 &outpos,
3283 &p))
3284 goto onError;
3285 /* The remaining input chars are ignored if the callback
3286 chooses to skip the input */
3287 }
3288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289
3290 if (byteorder)
3291 *byteorder = bo;
3292
Walter Dörwald69652032004-09-07 20:24:22 +00003293 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003295
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003297 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 goto onError;
3299
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 Py_XDECREF(errorHandler);
3301 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 return (PyObject *)unicode;
3303
Benjamin Peterson29060642009-01-31 22:14:21 +00003304 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306 Py_XDECREF(errorHandler);
3307 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 return NULL;
3309}
3310
Antoine Pitrouab868312009-01-10 15:40:25 +00003311#undef FAST_CHAR_MASK
3312#undef SWAPPED_FAST_CHAR_MASK
3313
Tim Peters772747b2001-08-09 22:21:55 +00003314PyObject *
3315PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 Py_ssize_t size,
3317 const char *errors,
3318 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003320 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003321 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003322 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003323#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003324 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003325#else
3326 const int pairs = 0;
3327#endif
Tim Peters772747b2001-08-09 22:21:55 +00003328 /* Offsets from p for storing byte pairs in the right order. */
3329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3330 int ihi = 1, ilo = 0;
3331#else
3332 int ihi = 0, ilo = 1;
3333#endif
3334
Benjamin Peterson29060642009-01-31 22:14:21 +00003335#define STORECHAR(CH) \
3336 do { \
3337 p[ihi] = ((CH) >> 8) & 0xff; \
3338 p[ilo] = (CH) & 0xff; \
3339 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003340 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003342#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003343 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 if (s[i] >= 0x10000)
3345 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003346#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003347 /* 2 * (size + pairs + (byteorder == 0)) */
3348 if (size > PY_SSIZE_T_MAX ||
3349 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003350 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003351 nsize = size + pairs + (byteorder == 0);
3352 bytesize = nsize * 2;
3353 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003355 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 if (v == NULL)
3357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003359 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003361 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003362 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003363 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003364
3365 if (byteorder == -1) {
3366 /* force LE */
3367 ihi = 1;
3368 ilo = 0;
3369 }
3370 else if (byteorder == 1) {
3371 /* force BE */
3372 ihi = 0;
3373 ilo = 1;
3374 }
3375
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003376 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 Py_UNICODE ch = *s++;
3378 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003379#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 if (ch >= 0x10000) {
3381 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3382 ch = 0xD800 | ((ch-0x10000) >> 10);
3383 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003384#endif
Tim Peters772747b2001-08-09 22:21:55 +00003385 STORECHAR(ch);
3386 if (ch2)
3387 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003388 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003389
3390 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003391 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003392#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393}
3394
3395PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3396{
3397 if (!PyUnicode_Check(unicode)) {
3398 PyErr_BadArgument();
3399 return NULL;
3400 }
3401 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003402 PyUnicode_GET_SIZE(unicode),
3403 NULL,
3404 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405}
3406
3407/* --- Unicode Escape Codec ----------------------------------------------- */
3408
Fredrik Lundh06d12682001-01-24 07:59:11 +00003409static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003410
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003412 Py_ssize_t size,
3413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003416 Py_ssize_t startinpos;
3417 Py_ssize_t endinpos;
3418 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003423 char* message;
3424 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003425 PyObject *errorHandler = NULL;
3426 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003427
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 /* Escaped strings will always be longer than the resulting
3429 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 length after conversion to the true value.
3431 (but if the error callback returns a long replacement string
3432 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 v = _PyUnicode_New(size);
3434 if (v == NULL)
3435 goto onError;
3436 if (size == 0)
3437 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003441
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442 while (s < end) {
3443 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003444 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003445 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446
3447 /* Non-escape characters are interpreted as Unicode ordinals */
3448 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003449 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 continue;
3451 }
3452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 /* \ - Escapes */
3455 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003456 c = *s++;
3457 if (s > end)
3458 c = '\0'; /* Invalid after \ */
3459 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460
Benjamin Peterson29060642009-01-31 22:14:21 +00003461 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 case '\n': break;
3463 case '\\': *p++ = '\\'; break;
3464 case '\'': *p++ = '\''; break;
3465 case '\"': *p++ = '\"'; break;
3466 case 'b': *p++ = '\b'; break;
3467 case 'f': *p++ = '\014'; break; /* FF */
3468 case 't': *p++ = '\t'; break;
3469 case 'n': *p++ = '\n'; break;
3470 case 'r': *p++ = '\r'; break;
3471 case 'v': *p++ = '\013'; break; /* VT */
3472 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3473
Benjamin Peterson29060642009-01-31 22:14:21 +00003474 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 case '0': case '1': case '2': case '3':
3476 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003477 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003478 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003479 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003480 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003481 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003483 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 break;
3485
Benjamin Peterson29060642009-01-31 22:14:21 +00003486 /* hex escapes */
3487 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003489 digits = 2;
3490 message = "truncated \\xXX escape";
3491 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492
Benjamin Peterson29060642009-01-31 22:14:21 +00003493 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003495 digits = 4;
3496 message = "truncated \\uXXXX escape";
3497 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498
Benjamin Peterson29060642009-01-31 22:14:21 +00003499 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003500 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003501 digits = 8;
3502 message = "truncated \\UXXXXXXXX escape";
3503 hexescape:
3504 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 outpos = p-PyUnicode_AS_UNICODE(v);
3506 if (s+digits>end) {
3507 endinpos = size;
3508 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003509 errors, &errorHandler,
3510 "unicodeescape", "end of string in escape sequence",
3511 &starts, &end, &startinpos, &endinpos, &exc, &s,
3512 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 goto onError;
3514 goto nextByte;
3515 }
3516 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003517 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003518 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 endinpos = (s+i+1)-starts;
3520 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 errors, &errorHandler,
3522 "unicodeescape", message,
3523 &starts, &end, &startinpos, &endinpos, &exc, &s,
3524 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003525 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003527 }
3528 chr = (chr<<4) & ~0xF;
3529 if (c >= '0' && c <= '9')
3530 chr += c - '0';
3531 else if (c >= 'a' && c <= 'f')
3532 chr += 10 + c - 'a';
3533 else
3534 chr += 10 + c - 'A';
3535 }
3536 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003537 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 /* _decoding_error will have already written into the
3539 target buffer. */
3540 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003541 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003542 /* when we get here, chr is a 32-bit unicode character */
3543 if (chr <= 0xffff)
3544 /* UCS-2 character */
3545 *p++ = (Py_UNICODE) chr;
3546 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003547 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003548 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003549#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003550 *p++ = chr;
3551#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003552 chr -= 0x10000L;
3553 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003554 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003555#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003556 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 endinpos = s-starts;
3558 outpos = p-PyUnicode_AS_UNICODE(v);
3559 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 errors, &errorHandler,
3561 "unicodeescape", "illegal Unicode character",
3562 &starts, &end, &startinpos, &endinpos, &exc, &s,
3563 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003564 goto onError;
3565 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003566 break;
3567
Benjamin Peterson29060642009-01-31 22:14:21 +00003568 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003569 case 'N':
3570 message = "malformed \\N character escape";
3571 if (ucnhash_CAPI == NULL) {
3572 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003573 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003574 if (ucnhash_CAPI == NULL)
3575 goto ucnhashError;
3576 }
3577 if (*s == '{') {
3578 const char *start = s+1;
3579 /* look for the closing brace */
3580 while (*s != '}' && s < end)
3581 s++;
3582 if (s > start && s < end && *s == '}') {
3583 /* found a name. look it up in the unicode database */
3584 message = "unknown Unicode character name";
3585 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003586 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003587 goto store;
3588 }
3589 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 endinpos = s-starts;
3591 outpos = p-PyUnicode_AS_UNICODE(v);
3592 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003593 errors, &errorHandler,
3594 "unicodeescape", message,
3595 &starts, &end, &startinpos, &endinpos, &exc, &s,
3596 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003597 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003598 break;
3599
3600 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003601 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 message = "\\ at end of string";
3603 s--;
3604 endinpos = s-starts;
3605 outpos = p-PyUnicode_AS_UNICODE(v);
3606 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003607 errors, &errorHandler,
3608 "unicodeescape", message,
3609 &starts, &end, &startinpos, &endinpos, &exc, &s,
3610 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003611 goto onError;
3612 }
3613 else {
3614 *p++ = '\\';
3615 *p++ = (unsigned char)s[-1];
3616 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003617 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003619 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003622 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003624 Py_XDECREF(errorHandler);
3625 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003627
Benjamin Peterson29060642009-01-31 22:14:21 +00003628 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003629 PyErr_SetString(
3630 PyExc_UnicodeError,
3631 "\\N escapes not supported (can't load unicodedata module)"
3632 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003633 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 Py_XDECREF(errorHandler);
3635 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003636 return NULL;
3637
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 Py_XDECREF(errorHandler);
3641 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 return NULL;
3643}
3644
3645/* Return a Unicode-Escape string version of the Unicode object.
3646
3647 If quotes is true, the string is enclosed in u"" or u'' quotes as
3648 appropriate.
3649
3650*/
3651
Thomas Wouters477c8d52006-05-27 19:21:47 +00003652Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003653 Py_ssize_t size,
3654 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003655{
3656 /* like wcschr, but doesn't stop at NULL characters */
3657
3658 while (size-- > 0) {
3659 if (*s == ch)
3660 return s;
3661 s++;
3662 }
3663
3664 return NULL;
3665}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003666
Walter Dörwald79e913e2007-05-12 11:08:06 +00003667static const char *hexdigits = "0123456789abcdef";
3668
3669PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003672 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003675#ifdef Py_UNICODE_WIDE
3676 const Py_ssize_t expandsize = 10;
3677#else
3678 const Py_ssize_t expandsize = 6;
3679#endif
3680
Thomas Wouters89f507f2006-12-13 04:49:30 +00003681 /* XXX(nnorwitz): rather than over-allocating, it would be
3682 better to choose a different scheme. Perhaps scan the
3683 first N-chars of the string and allocate based on that size.
3684 */
3685 /* Initial allocation is based on the longest-possible unichr
3686 escape.
3687
3688 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3689 unichr, so in this case it's the longest unichr escape. In
3690 narrow (UTF-16) builds this is five chars per source unichr
3691 since there are two unichrs in the surrogate pair, so in narrow
3692 (UTF-16) builds it's not the longest unichr escape.
3693
3694 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3695 so in the narrow (UTF-16) build case it's the longest unichr
3696 escape.
3697 */
3698
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003699 if (size == 0)
3700 return PyBytes_FromStringAndSize(NULL, 0);
3701
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003702 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003704
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003705 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003706 2
3707 + expandsize*size
3708 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 if (repr == NULL)
3710 return NULL;
3711
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003712 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 while (size-- > 0) {
3715 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003716
Walter Dörwald79e913e2007-05-12 11:08:06 +00003717 /* Escape backslashes */
3718 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 *p++ = '\\';
3720 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003721 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003722 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003723
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003724#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003725 /* Map 21-bit characters to '\U00xxxxxx' */
3726 else if (ch >= 0x10000) {
3727 *p++ = '\\';
3728 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003729 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3730 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3731 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3732 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3733 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3734 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3735 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3736 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003738 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003739#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003740 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3741 else if (ch >= 0xD800 && ch < 0xDC00) {
3742 Py_UNICODE ch2;
3743 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003744
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 ch2 = *s++;
3746 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003747 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003748 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3749 *p++ = '\\';
3750 *p++ = 'U';
3751 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3752 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3753 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3754 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3755 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3756 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3757 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3758 *p++ = hexdigits[ucs & 0x0000000F];
3759 continue;
3760 }
3761 /* Fall through: isolated surrogates are copied as-is */
3762 s--;
3763 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003764 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003765#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003766
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003768 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 *p++ = '\\';
3770 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003771 *p++ = hexdigits[(ch >> 12) & 0x000F];
3772 *p++ = hexdigits[(ch >> 8) & 0x000F];
3773 *p++ = hexdigits[(ch >> 4) & 0x000F];
3774 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003776
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003777 /* Map special whitespace to '\t', \n', '\r' */
3778 else if (ch == '\t') {
3779 *p++ = '\\';
3780 *p++ = 't';
3781 }
3782 else if (ch == '\n') {
3783 *p++ = '\\';
3784 *p++ = 'n';
3785 }
3786 else if (ch == '\r') {
3787 *p++ = '\\';
3788 *p++ = 'r';
3789 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003790
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003791 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003792 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003794 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003795 *p++ = hexdigits[(ch >> 4) & 0x000F];
3796 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003797 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003798
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799 /* Copy everything else as-is */
3800 else
3801 *p++ = (char) ch;
3802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003804 assert(p - PyBytes_AS_STRING(repr) > 0);
3805 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3806 return NULL;
3807 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808}
3809
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003810PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003812 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 if (!PyUnicode_Check(unicode)) {
3814 PyErr_BadArgument();
3815 return NULL;
3816 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003817 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3818 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003819 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820}
3821
3822/* --- Raw Unicode Escape Codec ------------------------------------------- */
3823
3824PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003825 Py_ssize_t size,
3826 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003828 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003829 Py_ssize_t startinpos;
3830 Py_ssize_t endinpos;
3831 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 const char *end;
3835 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 PyObject *errorHandler = NULL;
3837 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003838
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 /* Escaped strings will always be longer than the resulting
3840 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841 length after conversion to the true value. (But decoding error
3842 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 v = _PyUnicode_New(size);
3844 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003845 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003847 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 end = s + size;
3850 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 unsigned char c;
3852 Py_UCS4 x;
3853 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003854 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855
Benjamin Peterson29060642009-01-31 22:14:21 +00003856 /* Non-escape characters are interpreted as Unicode ordinals */
3857 if (*s != '\\') {
3858 *p++ = (unsigned char)*s++;
3859 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003860 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003861 startinpos = s-starts;
3862
3863 /* \u-escapes are only interpreted iff the number of leading
3864 backslashes if odd */
3865 bs = s;
3866 for (;s < end;) {
3867 if (*s != '\\')
3868 break;
3869 *p++ = (unsigned char)*s++;
3870 }
3871 if (((s - bs) & 1) == 0 ||
3872 s >= end ||
3873 (*s != 'u' && *s != 'U')) {
3874 continue;
3875 }
3876 p--;
3877 count = *s=='u' ? 4 : 8;
3878 s++;
3879
3880 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3881 outpos = p-PyUnicode_AS_UNICODE(v);
3882 for (x = 0, i = 0; i < count; ++i, ++s) {
3883 c = (unsigned char)*s;
3884 if (!ISXDIGIT(c)) {
3885 endinpos = s-starts;
3886 if (unicode_decode_call_errorhandler(
3887 errors, &errorHandler,
3888 "rawunicodeescape", "truncated \\uXXXX",
3889 &starts, &end, &startinpos, &endinpos, &exc, &s,
3890 &v, &outpos, &p))
3891 goto onError;
3892 goto nextByte;
3893 }
3894 x = (x<<4) & ~0xF;
3895 if (c >= '0' && c <= '9')
3896 x += c - '0';
3897 else if (c >= 'a' && c <= 'f')
3898 x += 10 + c - 'a';
3899 else
3900 x += 10 + c - 'A';
3901 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003902 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003903 /* UCS-2 character */
3904 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003905 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 /* UCS-4 character. Either store directly, or as
3907 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003908#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003909 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003910#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 x -= 0x10000L;
3912 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3913 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003914#endif
3915 } else {
3916 endinpos = s-starts;
3917 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003918 if (unicode_decode_call_errorhandler(
3919 errors, &errorHandler,
3920 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003921 &starts, &end, &startinpos, &endinpos, &exc, &s,
3922 &v, &outpos, &p))
3923 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003924 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003925 nextByte:
3926 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003928 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 Py_XDECREF(errorHandler);
3931 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003933
Benjamin Peterson29060642009-01-31 22:14:21 +00003934 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003936 Py_XDECREF(errorHandler);
3937 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 return NULL;
3939}
3940
3941PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003942 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003944 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 char *p;
3946 char *q;
3947
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003948#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003949 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003950#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003951 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003952#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003953
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003954 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003955 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003956
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003957 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 if (repr == NULL)
3959 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003960 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003961 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003963 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 while (size-- > 0) {
3965 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003966#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003967 /* Map 32-bit characters to '\Uxxxxxxxx' */
3968 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003969 *p++ = '\\';
3970 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003971 *p++ = hexdigits[(ch >> 28) & 0xf];
3972 *p++ = hexdigits[(ch >> 24) & 0xf];
3973 *p++ = hexdigits[(ch >> 20) & 0xf];
3974 *p++ = hexdigits[(ch >> 16) & 0xf];
3975 *p++ = hexdigits[(ch >> 12) & 0xf];
3976 *p++ = hexdigits[(ch >> 8) & 0xf];
3977 *p++ = hexdigits[(ch >> 4) & 0xf];
3978 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003979 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003980 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003981#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3983 if (ch >= 0xD800 && ch < 0xDC00) {
3984 Py_UNICODE ch2;
3985 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003986
Benjamin Peterson29060642009-01-31 22:14:21 +00003987 ch2 = *s++;
3988 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003989 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3991 *p++ = '\\';
3992 *p++ = 'U';
3993 *p++ = hexdigits[(ucs >> 28) & 0xf];
3994 *p++ = hexdigits[(ucs >> 24) & 0xf];
3995 *p++ = hexdigits[(ucs >> 20) & 0xf];
3996 *p++ = hexdigits[(ucs >> 16) & 0xf];
3997 *p++ = hexdigits[(ucs >> 12) & 0xf];
3998 *p++ = hexdigits[(ucs >> 8) & 0xf];
3999 *p++ = hexdigits[(ucs >> 4) & 0xf];
4000 *p++ = hexdigits[ucs & 0xf];
4001 continue;
4002 }
4003 /* Fall through: isolated surrogates are copied as-is */
4004 s--;
4005 size++;
4006 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004007#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 /* Map 16-bit characters to '\uxxxx' */
4009 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 *p++ = '\\';
4011 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004012 *p++ = hexdigits[(ch >> 12) & 0xf];
4013 *p++ = hexdigits[(ch >> 8) & 0xf];
4014 *p++ = hexdigits[(ch >> 4) & 0xf];
4015 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004017 /* Copy everything else as-is */
4018 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 *p++ = (char) ch;
4020 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004021 size = p - q;
4022
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004023 assert(size > 0);
4024 if (_PyBytes_Resize(&repr, size) < 0)
4025 return NULL;
4026 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027}
4028
4029PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4030{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004031 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004033 PyErr_BadArgument();
4034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004036 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4037 PyUnicode_GET_SIZE(unicode));
4038
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004039 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040}
4041
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004042/* --- Unicode Internal Codec ------------------------------------------- */
4043
4044PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004045 Py_ssize_t size,
4046 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004047{
4048 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004049 Py_ssize_t startinpos;
4050 Py_ssize_t endinpos;
4051 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004052 PyUnicodeObject *v;
4053 Py_UNICODE *p;
4054 const char *end;
4055 const char *reason;
4056 PyObject *errorHandler = NULL;
4057 PyObject *exc = NULL;
4058
Neal Norwitzd43069c2006-01-08 01:12:10 +00004059#ifdef Py_UNICODE_WIDE
4060 Py_UNICODE unimax = PyUnicode_GetMax();
4061#endif
4062
Thomas Wouters89f507f2006-12-13 04:49:30 +00004063 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004064 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4065 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004067 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004068 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004069 p = PyUnicode_AS_UNICODE(v);
4070 end = s + size;
4071
4072 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004073 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004074 /* We have to sanity check the raw data, otherwise doom looms for
4075 some malformed UCS-4 data. */
4076 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004077#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004078 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004079#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004080 end-s < Py_UNICODE_SIZE
4081 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004083 startinpos = s - starts;
4084 if (end-s < Py_UNICODE_SIZE) {
4085 endinpos = end-starts;
4086 reason = "truncated input";
4087 }
4088 else {
4089 endinpos = s - starts + Py_UNICODE_SIZE;
4090 reason = "illegal code point (> 0x10FFFF)";
4091 }
4092 outpos = p - PyUnicode_AS_UNICODE(v);
4093 if (unicode_decode_call_errorhandler(
4094 errors, &errorHandler,
4095 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004096 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004097 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004098 goto onError;
4099 }
4100 }
4101 else {
4102 p++;
4103 s += Py_UNICODE_SIZE;
4104 }
4105 }
4106
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004107 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004108 goto onError;
4109 Py_XDECREF(errorHandler);
4110 Py_XDECREF(exc);
4111 return (PyObject *)v;
4112
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004114 Py_XDECREF(v);
4115 Py_XDECREF(errorHandler);
4116 Py_XDECREF(exc);
4117 return NULL;
4118}
4119
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120/* --- Latin-1 Codec ------------------------------------------------------ */
4121
4122PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 Py_ssize_t size,
4124 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125{
4126 PyUnicodeObject *v;
4127 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004128 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004131 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 Py_UNICODE r = *(unsigned char*)s;
4133 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004134 }
4135
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 v = _PyUnicode_New(size);
4137 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004142 e = s + size;
4143 /* Unrolling the copy makes it much faster by reducing the looping
4144 overhead. This is similar to what many memcpy() implementations do. */
4145 unrolled_end = e - 4;
4146 while (s < unrolled_end) {
4147 p[0] = (unsigned char) s[0];
4148 p[1] = (unsigned char) s[1];
4149 p[2] = (unsigned char) s[2];
4150 p[3] = (unsigned char) s[3];
4151 s += 4;
4152 p += 4;
4153 }
4154 while (s < e)
4155 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004157
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 Py_XDECREF(v);
4160 return NULL;
4161}
4162
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163/* create or adjust a UnicodeEncodeError */
4164static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004165 const char *encoding,
4166 const Py_UNICODE *unicode, Py_ssize_t size,
4167 Py_ssize_t startpos, Py_ssize_t endpos,
4168 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004171 *exceptionObject = PyUnicodeEncodeError_Create(
4172 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 }
4174 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4176 goto onError;
4177 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4178 goto onError;
4179 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4180 goto onError;
4181 return;
4182 onError:
4183 Py_DECREF(*exceptionObject);
4184 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 }
4186}
4187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188/* raises a UnicodeEncodeError */
4189static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 const char *encoding,
4191 const Py_UNICODE *unicode, Py_ssize_t size,
4192 Py_ssize_t startpos, Py_ssize_t endpos,
4193 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194{
4195 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199}
4200
4201/* error handling callback helper:
4202 build arguments, call the callback and check the arguments,
4203 put the result into newpos and return the replacement string, which
4204 has to be freed by the caller */
4205static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 PyObject **errorHandler,
4207 const char *encoding, const char *reason,
4208 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4209 Py_ssize_t startpos, Py_ssize_t endpos,
4210 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004212 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213
4214 PyObject *restuple;
4215 PyObject *resunicode;
4216
4217 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004218 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 }
4222
4223 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227
4228 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004233 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 Py_DECREF(restuple);
4235 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004237 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 &resunicode, newpos)) {
4239 Py_DECREF(restuple);
4240 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004242 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4243 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4244 Py_DECREF(restuple);
4245 return NULL;
4246 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004248 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004249 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4251 Py_DECREF(restuple);
4252 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004253 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 Py_INCREF(resunicode);
4255 Py_DECREF(restuple);
4256 return resunicode;
4257}
4258
4259static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 Py_ssize_t size,
4261 const char *errors,
4262 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263{
4264 /* output object */
4265 PyObject *res;
4266 /* pointers to the beginning and end+1 of input */
4267 const Py_UNICODE *startp = p;
4268 const Py_UNICODE *endp = p + size;
4269 /* pointer to the beginning of the unencodable characters */
4270 /* const Py_UNICODE *badp = NULL; */
4271 /* pointer into the output */
4272 char *str;
4273 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004274 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004275 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4276 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 PyObject *errorHandler = NULL;
4278 PyObject *exc = NULL;
4279 /* the following variable is used for caching string comparisons
4280 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4281 int known_errorHandler = -1;
4282
4283 /* allocate enough for a simple encoding without
4284 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004285 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004286 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004287 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004288 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004289 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004290 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 ressize = size;
4292
4293 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 /* can we encode this? */
4297 if (c<limit) {
4298 /* no overflow check, because we know that the space is enough */
4299 *str++ = (char)c;
4300 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004301 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 else {
4303 Py_ssize_t unicodepos = p-startp;
4304 Py_ssize_t requiredsize;
4305 PyObject *repunicode;
4306 Py_ssize_t repsize;
4307 Py_ssize_t newpos;
4308 Py_ssize_t respos;
4309 Py_UNICODE *uni2;
4310 /* startpos for collecting unencodable chars */
4311 const Py_UNICODE *collstart = p;
4312 const Py_UNICODE *collend = p;
4313 /* find all unecodable characters */
4314 while ((collend < endp) && ((*collend)>=limit))
4315 ++collend;
4316 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4317 if (known_errorHandler==-1) {
4318 if ((errors==NULL) || (!strcmp(errors, "strict")))
4319 known_errorHandler = 1;
4320 else if (!strcmp(errors, "replace"))
4321 known_errorHandler = 2;
4322 else if (!strcmp(errors, "ignore"))
4323 known_errorHandler = 3;
4324 else if (!strcmp(errors, "xmlcharrefreplace"))
4325 known_errorHandler = 4;
4326 else
4327 known_errorHandler = 0;
4328 }
4329 switch (known_errorHandler) {
4330 case 1: /* strict */
4331 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4332 goto onError;
4333 case 2: /* replace */
4334 while (collstart++<collend)
4335 *str++ = '?'; /* fall through */
4336 case 3: /* ignore */
4337 p = collend;
4338 break;
4339 case 4: /* xmlcharrefreplace */
4340 respos = str - PyBytes_AS_STRING(res);
4341 /* determine replacement size (temporarily (mis)uses p) */
4342 for (p = collstart, repsize = 0; p < collend; ++p) {
4343 if (*p<10)
4344 repsize += 2+1+1;
4345 else if (*p<100)
4346 repsize += 2+2+1;
4347 else if (*p<1000)
4348 repsize += 2+3+1;
4349 else if (*p<10000)
4350 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004351#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 else
4353 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004354#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004355 else if (*p<100000)
4356 repsize += 2+5+1;
4357 else if (*p<1000000)
4358 repsize += 2+6+1;
4359 else
4360 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004361#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004362 }
4363 requiredsize = respos+repsize+(endp-collend);
4364 if (requiredsize > ressize) {
4365 if (requiredsize<2*ressize)
4366 requiredsize = 2*ressize;
4367 if (_PyBytes_Resize(&res, requiredsize))
4368 goto onError;
4369 str = PyBytes_AS_STRING(res) + respos;
4370 ressize = requiredsize;
4371 }
4372 /* generate replacement (temporarily (mis)uses p) */
4373 for (p = collstart; p < collend; ++p) {
4374 str += sprintf(str, "&#%d;", (int)*p);
4375 }
4376 p = collend;
4377 break;
4378 default:
4379 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4380 encoding, reason, startp, size, &exc,
4381 collstart-startp, collend-startp, &newpos);
4382 if (repunicode == NULL)
4383 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004384 if (PyBytes_Check(repunicode)) {
4385 /* Directly copy bytes result to output. */
4386 repsize = PyBytes_Size(repunicode);
4387 if (repsize > 1) {
4388 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004389 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004390 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4391 Py_DECREF(repunicode);
4392 goto onError;
4393 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004394 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004395 ressize += repsize-1;
4396 }
4397 memcpy(str, PyBytes_AsString(repunicode), repsize);
4398 str += repsize;
4399 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004400 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004401 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004402 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 /* need more space? (at least enough for what we
4404 have+the replacement+the rest of the string, so
4405 we won't have to check space for encodable characters) */
4406 respos = str - PyBytes_AS_STRING(res);
4407 repsize = PyUnicode_GET_SIZE(repunicode);
4408 requiredsize = respos+repsize+(endp-collend);
4409 if (requiredsize > ressize) {
4410 if (requiredsize<2*ressize)
4411 requiredsize = 2*ressize;
4412 if (_PyBytes_Resize(&res, requiredsize)) {
4413 Py_DECREF(repunicode);
4414 goto onError;
4415 }
4416 str = PyBytes_AS_STRING(res) + respos;
4417 ressize = requiredsize;
4418 }
4419 /* check if there is anything unencodable in the replacement
4420 and copy it to the output */
4421 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4422 c = *uni2;
4423 if (c >= limit) {
4424 raise_encode_exception(&exc, encoding, startp, size,
4425 unicodepos, unicodepos+1, reason);
4426 Py_DECREF(repunicode);
4427 goto onError;
4428 }
4429 *str = (char)c;
4430 }
4431 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004432 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004434 }
4435 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004436 /* Resize if we allocated to much */
4437 size = str - PyBytes_AS_STRING(res);
4438 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004439 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004440 if (_PyBytes_Resize(&res, size) < 0)
4441 goto onError;
4442 }
4443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 Py_XDECREF(errorHandler);
4445 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004446 return res;
4447
4448 onError:
4449 Py_XDECREF(res);
4450 Py_XDECREF(errorHandler);
4451 Py_XDECREF(exc);
4452 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453}
4454
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 Py_ssize_t size,
4457 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460}
4461
4462PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4463{
4464 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 PyErr_BadArgument();
4466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 }
4468 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 PyUnicode_GET_SIZE(unicode),
4470 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471}
4472
4473/* --- 7-bit ASCII Codec -------------------------------------------------- */
4474
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 Py_ssize_t size,
4477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 PyUnicodeObject *v;
4481 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004482 Py_ssize_t startinpos;
4483 Py_ssize_t endinpos;
4484 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 const char *e;
4486 PyObject *errorHandler = NULL;
4487 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004488
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004490 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 Py_UNICODE r = *(unsigned char*)s;
4492 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004493 }
Tim Petersced69f82003-09-16 20:30:58 +00004494
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 v = _PyUnicode_New(size);
4496 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501 e = s + size;
4502 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 register unsigned char c = (unsigned char)*s;
4504 if (c < 128) {
4505 *p++ = c;
4506 ++s;
4507 }
4508 else {
4509 startinpos = s-starts;
4510 endinpos = startinpos + 1;
4511 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4512 if (unicode_decode_call_errorhandler(
4513 errors, &errorHandler,
4514 "ascii", "ordinal not in range(128)",
4515 &starts, &e, &startinpos, &endinpos, &exc, &s,
4516 &v, &outpos, &p))
4517 goto onError;
4518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004520 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4522 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 Py_XDECREF(errorHandler);
4524 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004526
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 Py_XDECREF(errorHandler);
4530 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 return NULL;
4532}
4533
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 Py_ssize_t size,
4536 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539}
4540
4541PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4542{
4543 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 PyErr_BadArgument();
4545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 }
4547 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 PyUnicode_GET_SIZE(unicode),
4549 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550}
4551
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004552#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004553
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004554/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004555
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004556#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004557#define NEED_RETRY
4558#endif
4559
4560/* XXX This code is limited to "true" double-byte encodings, as
4561 a) it assumes an incomplete character consists of a single byte, and
4562 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004564
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  The character preceding it (as
   located by CharPrev()) is then inspected: the answer is "yes" when there
   is no preceding character, when that character does not start with a lead
   byte, or when it is exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4575
4576/*
4577 * Decode MBCS string into unicode object. If 'final' is set, converts
4578 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4579 */
4580static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 const char *s, /* MBCS string */
4582 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004583 int final,
4584 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004585{
4586 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004587 Py_ssize_t n;
4588 DWORD usize;
4589 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004590
4591 assert(size >= 0);
4592
Victor Stinner554f3f02010-06-16 23:33:54 +00004593 /* check and handle 'errors' arg */
4594 if (errors==NULL || strcmp(errors, "strict")==0)
4595 flags = MB_ERR_INVALID_CHARS;
4596 else if (strcmp(errors, "ignore")==0)
4597 flags = 0;
4598 else {
4599 PyErr_Format(PyExc_ValueError,
4600 "mbcs encoding does not support errors='%s'",
4601 errors);
4602 return -1;
4603 }
4604
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004605 /* Skip trailing lead-byte unless 'final' is set */
4606 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004608
4609 /* First get the size of the result */
4610 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004611 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4612 if (usize==0)
4613 goto mbcs_decode_error;
4614 } else
4615 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004616
4617 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 /* Create unicode object */
4619 *v = _PyUnicode_New(usize);
4620 if (*v == NULL)
4621 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004622 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004623 }
4624 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 /* Extend unicode object */
4626 n = PyUnicode_GET_SIZE(*v);
4627 if (_PyUnicode_Resize(v, n + usize) < 0)
4628 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004629 }
4630
4631 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004632 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004634 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4635 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004637 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004638 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004639
4640mbcs_decode_error:
4641 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4642 we raise a UnicodeDecodeError - else it is a 'generic'
4643 windows error
4644 */
4645 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4646 /* Ideally, we should get reason from FormatMessage - this
4647 is the Windows 2000 English version of the message
4648 */
4649 PyObject *exc = NULL;
4650 const char *reason = "No mapping for the Unicode character exists "
4651 "in the target multi-byte code page.";
4652 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4653 if (exc != NULL) {
4654 PyCodec_StrictErrors(exc);
4655 Py_DECREF(exc);
4656 }
4657 } else {
4658 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4659 }
4660 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004661}
4662
4663PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 Py_ssize_t size,
4665 const char *errors,
4666 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004667{
4668 PyUnicodeObject *v = NULL;
4669 int done;
4670
4671 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004673
4674#ifdef NEED_RETRY
4675 retry:
4676 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004677 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004678 else
4679#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004680 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004681
4682 if (done < 0) {
4683 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004685 }
4686
4687 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004689
4690#ifdef NEED_RETRY
4691 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 s += done;
4693 size -= done;
4694 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004695 }
4696#endif
4697
4698 return (PyObject *)v;
4699}
4700
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004701PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 Py_ssize_t size,
4703 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004704{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004705 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4706}
4707
4708/*
4709 * Convert unicode into string object (MBCS).
4710 * Returns 0 if succeed, -1 otherwise.
4711 */
4712static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004713 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004714 int size, /* size of unicode */
4715 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004716{
Victor Stinner554f3f02010-06-16 23:33:54 +00004717 BOOL usedDefaultChar = FALSE;
4718 BOOL *pusedDefaultChar;
4719 int mbcssize;
4720 Py_ssize_t n;
4721 PyObject *exc = NULL;
4722 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004723
4724 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004725
Victor Stinner554f3f02010-06-16 23:33:54 +00004726 /* check and handle 'errors' arg */
4727 if (errors==NULL || strcmp(errors, "strict")==0) {
4728 flags = WC_NO_BEST_FIT_CHARS;
4729 pusedDefaultChar = &usedDefaultChar;
4730 } else if (strcmp(errors, "replace")==0) {
4731 flags = 0;
4732 pusedDefaultChar = NULL;
4733 } else {
4734 PyErr_Format(PyExc_ValueError,
4735 "mbcs encoding does not support errors='%s'",
4736 errors);
4737 return -1;
4738 }
4739
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004740 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004741 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004742 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4743 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004744 if (mbcssize == 0) {
4745 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4746 return -1;
4747 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004748 /* If we used a default char, then we failed! */
4749 if (pusedDefaultChar && *pusedDefaultChar)
4750 goto mbcs_encode_error;
4751 } else {
4752 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004753 }
4754
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004755 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 /* Create string object */
4757 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4758 if (*repr == NULL)
4759 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004760 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004761 }
4762 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004763 /* Extend string object */
4764 n = PyBytes_Size(*repr);
4765 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4766 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004767 }
4768
4769 /* Do the conversion */
4770 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004772 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4773 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4775 return -1;
4776 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004777 if (pusedDefaultChar && *pusedDefaultChar)
4778 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004779 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004780 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004781
4782mbcs_encode_error:
4783 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4784 Py_XDECREF(exc);
4785 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004786}
4787
4788PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 Py_ssize_t size,
4790 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004791{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004792 PyObject *repr = NULL;
4793 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004794
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004795#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004796 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004797 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004798 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004799 else
4800#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004801 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004802
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004803 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 Py_XDECREF(repr);
4805 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004806 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004807
4808#ifdef NEED_RETRY
4809 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 p += INT_MAX;
4811 size -= INT_MAX;
4812 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004813 }
4814#endif
4815
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004816 return repr;
4817}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004818
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004819PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4820{
4821 if (!PyUnicode_Check(unicode)) {
4822 PyErr_BadArgument();
4823 return NULL;
4824 }
4825 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004826 PyUnicode_GET_SIZE(unicode),
4827 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004828}
4829
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004830#undef NEED_RETRY
4831
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004832#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004833
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834/* --- Character Mapping Codec -------------------------------------------- */
4835
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004837 Py_ssize_t size,
4838 PyObject *mapping,
4839 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004842 Py_ssize_t startinpos;
4843 Py_ssize_t endinpos;
4844 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 PyUnicodeObject *v;
4847 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004848 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 PyObject *errorHandler = NULL;
4850 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004851 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004853
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 /* Default to Latin-1 */
4855 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004856 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
4858 v = _PyUnicode_New(size);
4859 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004865 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 mapstring = PyUnicode_AS_UNICODE(mapping);
4867 maplen = PyUnicode_GET_SIZE(mapping);
4868 while (s < e) {
4869 unsigned char ch = *s;
4870 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 if (ch < maplen)
4873 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 if (x == 0xfffe) {
4876 /* undefined mapping */
4877 outpos = p-PyUnicode_AS_UNICODE(v);
4878 startinpos = s-starts;
4879 endinpos = startinpos+1;
4880 if (unicode_decode_call_errorhandler(
4881 errors, &errorHandler,
4882 "charmap", "character maps to <undefined>",
4883 &starts, &e, &startinpos, &endinpos, &exc, &s,
4884 &v, &outpos, &p)) {
4885 goto onError;
4886 }
4887 continue;
4888 }
4889 *p++ = x;
4890 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004891 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004892 }
4893 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 while (s < e) {
4895 unsigned char ch = *s;
4896 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004897
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4899 w = PyLong_FromLong((long)ch);
4900 if (w == NULL)
4901 goto onError;
4902 x = PyObject_GetItem(mapping, w);
4903 Py_DECREF(w);
4904 if (x == NULL) {
4905 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4906 /* No mapping found means: mapping is undefined. */
4907 PyErr_Clear();
4908 x = Py_None;
4909 Py_INCREF(x);
4910 } else
4911 goto onError;
4912 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004913
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 /* Apply mapping */
4915 if (PyLong_Check(x)) {
4916 long value = PyLong_AS_LONG(x);
4917 if (value < 0 || value > 65535) {
4918 PyErr_SetString(PyExc_TypeError,
4919 "character mapping must be in range(65536)");
4920 Py_DECREF(x);
4921 goto onError;
4922 }
4923 *p++ = (Py_UNICODE)value;
4924 }
4925 else if (x == Py_None) {
4926 /* undefined mapping */
4927 outpos = p-PyUnicode_AS_UNICODE(v);
4928 startinpos = s-starts;
4929 endinpos = startinpos+1;
4930 if (unicode_decode_call_errorhandler(
4931 errors, &errorHandler,
4932 "charmap", "character maps to <undefined>",
4933 &starts, &e, &startinpos, &endinpos, &exc, &s,
4934 &v, &outpos, &p)) {
4935 Py_DECREF(x);
4936 goto onError;
4937 }
4938 Py_DECREF(x);
4939 continue;
4940 }
4941 else if (PyUnicode_Check(x)) {
4942 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004943
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 if (targetsize == 1)
4945 /* 1-1 mapping */
4946 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004947
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 else if (targetsize > 1) {
4949 /* 1-n mapping */
4950 if (targetsize > extrachars) {
4951 /* resize first */
4952 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4953 Py_ssize_t needed = (targetsize - extrachars) + \
4954 (targetsize << 2);
4955 extrachars += needed;
4956 /* XXX overflow detection missing */
4957 if (_PyUnicode_Resize(&v,
4958 PyUnicode_GET_SIZE(v) + needed) < 0) {
4959 Py_DECREF(x);
4960 goto onError;
4961 }
4962 p = PyUnicode_AS_UNICODE(v) + oldpos;
4963 }
4964 Py_UNICODE_COPY(p,
4965 PyUnicode_AS_UNICODE(x),
4966 targetsize);
4967 p += targetsize;
4968 extrachars -= targetsize;
4969 }
4970 /* 1-0 mapping: skip the character */
4971 }
4972 else {
4973 /* wrong return value */
4974 PyErr_SetString(PyExc_TypeError,
4975 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004976 Py_DECREF(x);
4977 goto onError;
4978 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 Py_DECREF(x);
4980 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004982 }
4983 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4985 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 Py_XDECREF(errorHandler);
4987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004989
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004991 Py_XDECREF(errorHandler);
4992 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 Py_XDECREF(v);
4994 return NULL;
4995}
4996
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00004997/* Charmap encoding: the lookup table */
4998
4999struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005000 PyObject_HEAD
5001 unsigned char level1[32];
5002 int count2, count3;
5003 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005004};
5005
5006static PyObject*
5007encoding_map_size(PyObject *obj, PyObject* args)
5008{
5009 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005010 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005012}
5013
5014static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005015 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 PyDoc_STR("Return the size (in bytes) of this object") },
5017 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005018};
5019
5020static void
5021encoding_map_dealloc(PyObject* o)
5022{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005023 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005024}
5025
5026static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005027 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 "EncodingMap", /*tp_name*/
5029 sizeof(struct encoding_map), /*tp_basicsize*/
5030 0, /*tp_itemsize*/
5031 /* methods */
5032 encoding_map_dealloc, /*tp_dealloc*/
5033 0, /*tp_print*/
5034 0, /*tp_getattr*/
5035 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005036 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 0, /*tp_repr*/
5038 0, /*tp_as_number*/
5039 0, /*tp_as_sequence*/
5040 0, /*tp_as_mapping*/
5041 0, /*tp_hash*/
5042 0, /*tp_call*/
5043 0, /*tp_str*/
5044 0, /*tp_getattro*/
5045 0, /*tp_setattro*/
5046 0, /*tp_as_buffer*/
5047 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5048 0, /*tp_doc*/
5049 0, /*tp_traverse*/
5050 0, /*tp_clear*/
5051 0, /*tp_richcompare*/
5052 0, /*tp_weaklistoffset*/
5053 0, /*tp_iter*/
5054 0, /*tp_iternext*/
5055 encoding_map_methods, /*tp_methods*/
5056 0, /*tp_members*/
5057 0, /*tp_getset*/
5058 0, /*tp_base*/
5059 0, /*tp_dict*/
5060 0, /*tp_descr_get*/
5061 0, /*tp_descr_set*/
5062 0, /*tp_dictoffset*/
5063 0, /*tp_init*/
5064 0, /*tp_alloc*/
5065 0, /*tp_new*/
5066 0, /*tp_free*/
5067 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005068};
5069
5070PyObject*
5071PyUnicode_BuildEncodingMap(PyObject* string)
5072{
5073 Py_UNICODE *decode;
5074 PyObject *result;
5075 struct encoding_map *mresult;
5076 int i;
5077 int need_dict = 0;
5078 unsigned char level1[32];
5079 unsigned char level2[512];
5080 unsigned char *mlevel1, *mlevel2, *mlevel3;
5081 int count2 = 0, count3 = 0;
5082
5083 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5084 PyErr_BadArgument();
5085 return NULL;
5086 }
5087 decode = PyUnicode_AS_UNICODE(string);
5088 memset(level1, 0xFF, sizeof level1);
5089 memset(level2, 0xFF, sizeof level2);
5090
5091 /* If there isn't a one-to-one mapping of NULL to \0,
5092 or if there are non-BMP characters, we need to use
5093 a mapping dictionary. */
5094 if (decode[0] != 0)
5095 need_dict = 1;
5096 for (i = 1; i < 256; i++) {
5097 int l1, l2;
5098 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005099#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005100 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005101#endif
5102 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005103 need_dict = 1;
5104 break;
5105 }
5106 if (decode[i] == 0xFFFE)
5107 /* unmapped character */
5108 continue;
5109 l1 = decode[i] >> 11;
5110 l2 = decode[i] >> 7;
5111 if (level1[l1] == 0xFF)
5112 level1[l1] = count2++;
5113 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005114 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005115 }
5116
5117 if (count2 >= 0xFF || count3 >= 0xFF)
5118 need_dict = 1;
5119
5120 if (need_dict) {
5121 PyObject *result = PyDict_New();
5122 PyObject *key, *value;
5123 if (!result)
5124 return NULL;
5125 for (i = 0; i < 256; i++) {
5126 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005127 key = PyLong_FromLong(decode[i]);
5128 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005129 if (!key || !value)
5130 goto failed1;
5131 if (PyDict_SetItem(result, key, value) == -1)
5132 goto failed1;
5133 Py_DECREF(key);
5134 Py_DECREF(value);
5135 }
5136 return result;
5137 failed1:
5138 Py_XDECREF(key);
5139 Py_XDECREF(value);
5140 Py_DECREF(result);
5141 return NULL;
5142 }
5143
5144 /* Create a three-level trie */
5145 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5146 16*count2 + 128*count3 - 1);
5147 if (!result)
5148 return PyErr_NoMemory();
5149 PyObject_Init(result, &EncodingMapType);
5150 mresult = (struct encoding_map*)result;
5151 mresult->count2 = count2;
5152 mresult->count3 = count3;
5153 mlevel1 = mresult->level1;
5154 mlevel2 = mresult->level23;
5155 mlevel3 = mresult->level23 + 16*count2;
5156 memcpy(mlevel1, level1, 32);
5157 memset(mlevel2, 0xFF, 16*count2);
5158 memset(mlevel3, 0, 128*count3);
5159 count3 = 0;
5160 for (i = 1; i < 256; i++) {
5161 int o1, o2, o3, i2, i3;
5162 if (decode[i] == 0xFFFE)
5163 /* unmapped character */
5164 continue;
5165 o1 = decode[i]>>11;
5166 o2 = (decode[i]>>7) & 0xF;
5167 i2 = 16*mlevel1[o1] + o2;
5168 if (mlevel2[i2] == 0xFF)
5169 mlevel2[i2] = count3++;
5170 o3 = decode[i] & 0x7F;
5171 i3 = 128*mlevel2[i2] + o3;
5172 mlevel3[i3] = i;
5173 }
5174 return result;
5175}
5176
5177static int
5178encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5179{
5180 struct encoding_map *map = (struct encoding_map*)mapping;
5181 int l1 = c>>11;
5182 int l2 = (c>>7) & 0xF;
5183 int l3 = c & 0x7F;
5184 int i;
5185
5186#ifdef Py_UNICODE_WIDE
5187 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005189 }
5190#endif
5191 if (c == 0)
5192 return 0;
5193 /* level 1*/
5194 i = map->level1[l1];
5195 if (i == 0xFF) {
5196 return -1;
5197 }
5198 /* level 2*/
5199 i = map->level23[16*i+l2];
5200 if (i == 0xFF) {
5201 return -1;
5202 }
5203 /* level 3 */
5204 i = map->level23[16*map->count2 + 128*i + l3];
5205 if (i == 0) {
5206 return -1;
5207 }
5208 return i;
5209}
5210
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211/* Lookup the character ch in the mapping. If the character
5212 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005213 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215{
Christian Heimes217cfd12007-12-02 14:31:20 +00005216 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 PyObject *x;
5218
5219 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005221 x = PyObject_GetItem(mapping, w);
5222 Py_DECREF(w);
5223 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5225 /* No mapping found means: mapping is undefined. */
5226 PyErr_Clear();
5227 x = Py_None;
5228 Py_INCREF(x);
5229 return x;
5230 } else
5231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005233 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005235 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 long value = PyLong_AS_LONG(x);
5237 if (value < 0 || value > 255) {
5238 PyErr_SetString(PyExc_TypeError,
5239 "character mapping must be in range(256)");
5240 Py_DECREF(x);
5241 return NULL;
5242 }
5243 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005245 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 /* wrong return value */
5249 PyErr_Format(PyExc_TypeError,
5250 "character mapping must return integer, bytes or None, not %.400s",
5251 x->ob_type->tp_name);
5252 Py_DECREF(x);
5253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 }
5255}
5256
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005257static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005258charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005259{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005260 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5261 /* exponentially overallocate to minimize reallocations */
5262 if (requiredsize < 2*outsize)
5263 requiredsize = 2*outsize;
5264 if (_PyBytes_Resize(outobj, requiredsize))
5265 return -1;
5266 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005267}
5268
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005272/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005273 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005274 space is available. Return a new reference to the object that
5275 was put in the output buffer, or Py_None, if the mapping was undefined
5276 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005277 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005278static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005279charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005281{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005282 PyObject *rep;
5283 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005284 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005285
Christian Heimes90aa7642007-12-19 02:45:37 +00005286 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005287 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005289 if (res == -1)
5290 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 if (outsize<requiredsize)
5292 if (charmapencode_resize(outobj, outpos, requiredsize))
5293 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005294 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 outstart[(*outpos)++] = (char)res;
5296 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005297 }
5298
5299 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005300 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005302 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 Py_DECREF(rep);
5304 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005305 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 if (PyLong_Check(rep)) {
5307 Py_ssize_t requiredsize = *outpos+1;
5308 if (outsize<requiredsize)
5309 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5310 Py_DECREF(rep);
5311 return enc_EXCEPTION;
5312 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005313 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005315 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 else {
5317 const char *repchars = PyBytes_AS_STRING(rep);
5318 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5319 Py_ssize_t requiredsize = *outpos+repsize;
5320 if (outsize<requiredsize)
5321 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5322 Py_DECREF(rep);
5323 return enc_EXCEPTION;
5324 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005325 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 memcpy(outstart + *outpos, repchars, repsize);
5327 *outpos += repsize;
5328 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005329 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005330 Py_DECREF(rep);
5331 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332}
5333
5334/* handle an error in PyUnicode_EncodeCharmap
5335 Return 0 on success, -1 on error */
5336static
5337int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005338 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005339 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005340 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005341 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342{
5343 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005344 Py_ssize_t repsize;
5345 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 Py_UNICODE *uni2;
5347 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005348 Py_ssize_t collstartpos = *inpos;
5349 Py_ssize_t collendpos = *inpos+1;
5350 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351 char *encoding = "charmap";
5352 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005353 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 /* find all unencodable characters */
5356 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005357 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005358 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 int res = encoding_map_lookup(p[collendpos], mapping);
5360 if (res != -1)
5361 break;
5362 ++collendpos;
5363 continue;
5364 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005365
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 rep = charmapencode_lookup(p[collendpos], mapping);
5367 if (rep==NULL)
5368 return -1;
5369 else if (rep!=Py_None) {
5370 Py_DECREF(rep);
5371 break;
5372 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005373 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005375 }
5376 /* cache callback name lookup
5377 * (if not done yet, i.e. it's the first error) */
5378 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 if ((errors==NULL) || (!strcmp(errors, "strict")))
5380 *known_errorHandler = 1;
5381 else if (!strcmp(errors, "replace"))
5382 *known_errorHandler = 2;
5383 else if (!strcmp(errors, "ignore"))
5384 *known_errorHandler = 3;
5385 else if (!strcmp(errors, "xmlcharrefreplace"))
5386 *known_errorHandler = 4;
5387 else
5388 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 }
5390 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005391 case 1: /* strict */
5392 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5393 return -1;
5394 case 2: /* replace */
5395 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 x = charmapencode_output('?', mapping, res, respos);
5397 if (x==enc_EXCEPTION) {
5398 return -1;
5399 }
5400 else if (x==enc_FAILED) {
5401 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5402 return -1;
5403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005404 }
5405 /* fall through */
5406 case 3: /* ignore */
5407 *inpos = collendpos;
5408 break;
5409 case 4: /* xmlcharrefreplace */
5410 /* generate replacement (temporarily (mis)uses p) */
5411 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 char buffer[2+29+1+1];
5413 char *cp;
5414 sprintf(buffer, "&#%d;", (int)p[collpos]);
5415 for (cp = buffer; *cp; ++cp) {
5416 x = charmapencode_output(*cp, mapping, res, respos);
5417 if (x==enc_EXCEPTION)
5418 return -1;
5419 else if (x==enc_FAILED) {
5420 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5421 return -1;
5422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005423 }
5424 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005425 *inpos = collendpos;
5426 break;
5427 default:
5428 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 encoding, reason, p, size, exceptionObject,
5430 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005431 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005433 if (PyBytes_Check(repunicode)) {
5434 /* Directly copy bytes result to output. */
5435 Py_ssize_t outsize = PyBytes_Size(*res);
5436 Py_ssize_t requiredsize;
5437 repsize = PyBytes_Size(repunicode);
5438 requiredsize = *respos + repsize;
5439 if (requiredsize > outsize)
5440 /* Make room for all additional bytes. */
5441 if (charmapencode_resize(res, respos, requiredsize)) {
5442 Py_DECREF(repunicode);
5443 return -1;
5444 }
5445 memcpy(PyBytes_AsString(*res) + *respos,
5446 PyBytes_AsString(repunicode), repsize);
5447 *respos += repsize;
5448 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005449 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005450 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005451 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005452 /* generate replacement */
5453 repsize = PyUnicode_GET_SIZE(repunicode);
5454 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 x = charmapencode_output(*uni2, mapping, res, respos);
5456 if (x==enc_EXCEPTION) {
5457 return -1;
5458 }
5459 else if (x==enc_FAILED) {
5460 Py_DECREF(repunicode);
5461 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5462 return -1;
5463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005464 }
5465 *inpos = newpos;
5466 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005467 }
5468 return 0;
5469}
5470
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 Py_ssize_t size,
5473 PyObject *mapping,
5474 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 /* output object */
5477 PyObject *res = NULL;
5478 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005479 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 PyObject *errorHandler = NULL;
5483 PyObject *exc = NULL;
5484 /* the following variable is used for caching string comparisons
5485 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5486 * 3=ignore, 4=xmlcharrefreplace */
5487 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488
5489 /* Default to Latin-1 */
5490 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 /* allocate enough for a simple encoding without
5494 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005495 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005496 if (res == NULL)
5497 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005498 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 /* try to encode it */
5503 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5504 if (x==enc_EXCEPTION) /* error */
5505 goto onError;
5506 if (x==enc_FAILED) { /* unencodable character */
5507 if (charmap_encoding_error(p, size, &inpos, mapping,
5508 &exc,
5509 &known_errorHandler, &errorHandler, errors,
5510 &res, &respos)) {
5511 goto onError;
5512 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005513 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 else
5515 /* done with this character => adjust input position */
5516 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005520 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005521 if (_PyBytes_Resize(&res, respos) < 0)
5522 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 Py_XDECREF(exc);
5525 Py_XDECREF(errorHandler);
5526 return res;
5527
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005529 Py_XDECREF(res);
5530 Py_XDECREF(exc);
5531 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 return NULL;
5533}
5534
5535PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537{
5538 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 PyErr_BadArgument();
5540 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 }
5542 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 PyUnicode_GET_SIZE(unicode),
5544 mapping,
5545 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546}
5547
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005548/* create or adjust a UnicodeTranslateError */
5549static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 const Py_UNICODE *unicode, Py_ssize_t size,
5551 Py_ssize_t startpos, Py_ssize_t endpos,
5552 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005555 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 }
5558 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5560 goto onError;
5561 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5562 goto onError;
5563 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5564 goto onError;
5565 return;
5566 onError:
5567 Py_DECREF(*exceptionObject);
5568 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 }
5570}
5571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572/* raises a UnicodeTranslateError */
5573static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 const Py_UNICODE *unicode, Py_ssize_t size,
5575 Py_ssize_t startpos, Py_ssize_t endpos,
5576 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577{
5578 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005582}
5583
5584/* error handling callback helper:
5585 build arguments, call the callback and check the arguments,
5586 put the result into newpos and return the replacement string, which
5587 has to be freed by the caller */
5588static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 PyObject **errorHandler,
5590 const char *reason,
5591 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5592 Py_ssize_t startpos, Py_ssize_t endpos,
5593 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005595 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005597 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 PyObject *restuple;
5599 PyObject *resunicode;
5600
5601 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005602 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605 }
5606
5607 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005609 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611
5612 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005616 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005617 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 Py_DECREF(restuple);
5619 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 }
5621 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 &resunicode, &i_newpos)) {
5623 Py_DECREF(restuple);
5624 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005626 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005628 else
5629 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005630 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5632 Py_DECREF(restuple);
5633 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005634 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005635 Py_INCREF(resunicode);
5636 Py_DECREF(restuple);
5637 return resunicode;
5638}
5639
5640/* Lookup the character ch in the mapping and put the result in result,
5641 which must be decrefed by the caller.
5642 Return 0 on success, -1 on error */
5643static
5644int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5645{
Christian Heimes217cfd12007-12-02 14:31:20 +00005646 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 PyObject *x;
5648
5649 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 x = PyObject_GetItem(mapping, w);
5652 Py_DECREF(w);
5653 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5655 /* No mapping found means: use 1:1 mapping. */
5656 PyErr_Clear();
5657 *result = NULL;
5658 return 0;
5659 } else
5660 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 }
5662 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 *result = x;
5664 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005666 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 long value = PyLong_AS_LONG(x);
5668 long max = PyUnicode_GetMax();
5669 if (value < 0 || value > max) {
5670 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005671 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 Py_DECREF(x);
5673 return -1;
5674 }
5675 *result = x;
5676 return 0;
5677 }
5678 else if (PyUnicode_Check(x)) {
5679 *result = x;
5680 return 0;
5681 }
5682 else {
5683 /* wrong return value */
5684 PyErr_SetString(PyExc_TypeError,
5685 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005686 Py_DECREF(x);
5687 return -1;
5688 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689}
5690/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 if not reallocate and adjust various state variables.
5692 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693static
Walter Dörwald4894c302003-10-24 14:25:28 +00005694int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005697 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005698 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 /* remember old output position */
5700 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5701 /* exponentially overallocate to minimize reallocations */
5702 if (requiredsize < 2 * oldsize)
5703 requiredsize = 2 * oldsize;
5704 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5705 return -1;
5706 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 }
5708 return 0;
5709}
5710/* lookup the character, put the result in the output string and adjust
5711 various state variables. Return a new reference to the object that
5712 was put in the output buffer in *result, or Py_None, if the mapping was
5713 undefined (in which case no character was written).
5714 The called must decref result.
5715 Return 0 on success, -1 on error. */
5716static
Walter Dörwald4894c302003-10-24 14:25:28 +00005717int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5719 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720{
Walter Dörwald4894c302003-10-24 14:25:28 +00005721 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005723 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 /* not found => default to 1:1 mapping */
5725 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 }
5727 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005729 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 /* no overflow check, because we know that the space is enough */
5731 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 }
5733 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5735 if (repsize==1) {
5736 /* no overflow check, because we know that the space is enough */
5737 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5738 }
5739 else if (repsize!=0) {
5740 /* more than one character */
5741 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5742 (insize - (curinp-startinp)) +
5743 repsize - 1;
5744 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5745 return -1;
5746 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5747 *outp += repsize;
5748 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 }
5750 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 return 0;
5753}
5754
5755PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 Py_ssize_t size,
5757 PyObject *mapping,
5758 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005760 /* output object */
5761 PyObject *res = NULL;
5762 /* pointers to the beginning and end+1 of input */
5763 const Py_UNICODE *startp = p;
5764 const Py_UNICODE *endp = p + size;
5765 /* pointer into the output */
5766 Py_UNICODE *str;
5767 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005768 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 char *reason = "character maps to <undefined>";
5770 PyObject *errorHandler = NULL;
5771 PyObject *exc = NULL;
5772 /* the following variable is used for caching string comparisons
5773 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5774 * 3=ignore, 4=xmlcharrefreplace */
5775 int known_errorHandler = -1;
5776
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 PyErr_BadArgument();
5779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781
5782 /* allocate enough for a simple 1:1 translation without
5783 replacements, if we need more, we'll resize */
5784 res = PyUnicode_FromUnicode(NULL, size);
5785 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 /* try to encode it */
5793 PyObject *x = NULL;
5794 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5795 Py_XDECREF(x);
5796 goto onError;
5797 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005798 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 if (x!=Py_None) /* it worked => adjust input pointer */
5800 ++p;
5801 else { /* untranslatable character */
5802 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5803 Py_ssize_t repsize;
5804 Py_ssize_t newpos;
5805 Py_UNICODE *uni2;
5806 /* startpos for collecting untranslatable chars */
5807 const Py_UNICODE *collstart = p;
5808 const Py_UNICODE *collend = p+1;
5809 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 /* find all untranslatable characters */
5812 while (collend < endp) {
5813 if (charmaptranslate_lookup(*collend, mapping, &x))
5814 goto onError;
5815 Py_XDECREF(x);
5816 if (x!=Py_None)
5817 break;
5818 ++collend;
5819 }
5820 /* cache callback name lookup
5821 * (if not done yet, i.e. it's the first error) */
5822 if (known_errorHandler==-1) {
5823 if ((errors==NULL) || (!strcmp(errors, "strict")))
5824 known_errorHandler = 1;
5825 else if (!strcmp(errors, "replace"))
5826 known_errorHandler = 2;
5827 else if (!strcmp(errors, "ignore"))
5828 known_errorHandler = 3;
5829 else if (!strcmp(errors, "xmlcharrefreplace"))
5830 known_errorHandler = 4;
5831 else
5832 known_errorHandler = 0;
5833 }
5834 switch (known_errorHandler) {
5835 case 1: /* strict */
5836 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005837 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 case 2: /* replace */
5839 /* No need to check for space, this is a 1:1 replacement */
5840 for (coll = collstart; coll<collend; ++coll)
5841 *str++ = '?';
5842 /* fall through */
5843 case 3: /* ignore */
5844 p = collend;
5845 break;
5846 case 4: /* xmlcharrefreplace */
5847 /* generate replacement (temporarily (mis)uses p) */
5848 for (p = collstart; p < collend; ++p) {
5849 char buffer[2+29+1+1];
5850 char *cp;
5851 sprintf(buffer, "&#%d;", (int)*p);
5852 if (charmaptranslate_makespace(&res, &str,
5853 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5854 goto onError;
5855 for (cp = buffer; *cp; ++cp)
5856 *str++ = *cp;
5857 }
5858 p = collend;
5859 break;
5860 default:
5861 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5862 reason, startp, size, &exc,
5863 collstart-startp, collend-startp, &newpos);
5864 if (repunicode == NULL)
5865 goto onError;
5866 /* generate replacement */
5867 repsize = PyUnicode_GET_SIZE(repunicode);
5868 if (charmaptranslate_makespace(&res, &str,
5869 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5870 Py_DECREF(repunicode);
5871 goto onError;
5872 }
5873 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5874 *str++ = *uni2;
5875 p = startp + newpos;
5876 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005877 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005878 }
5879 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 /* Resize if we allocated too much */
5881 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005882 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 if (PyUnicode_Resize(&res, respos) < 0)
5884 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885 }
5886 Py_XDECREF(exc);
5887 Py_XDECREF(errorHandler);
5888 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005891 Py_XDECREF(res);
5892 Py_XDECREF(exc);
5893 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 return NULL;
5895}
5896
5897PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 PyObject *mapping,
5899 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900{
5901 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005902
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 str = PyUnicode_FromObject(str);
5904 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 PyUnicode_GET_SIZE(str),
5908 mapping,
5909 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 Py_DECREF(str);
5911 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005912
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 Py_XDECREF(str);
5915 return NULL;
5916}
Tim Petersced69f82003-09-16 20:30:58 +00005917
Guido van Rossum9e896b32000-04-05 20:11:21 +00005918/* --- Decimal Encoder ---------------------------------------------------- */
5919
5920int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 Py_ssize_t length,
5922 char *output,
5923 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005924{
5925 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 PyObject *errorHandler = NULL;
5927 PyObject *exc = NULL;
5928 const char *encoding = "decimal";
5929 const char *reason = "invalid decimal Unicode string";
5930 /* the following variable is used for caching string comparisons
5931 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5932 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005933
5934 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 PyErr_BadArgument();
5936 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005937 }
5938
5939 p = s;
5940 end = s + length;
5941 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 register Py_UNICODE ch = *p;
5943 int decimal;
5944 PyObject *repunicode;
5945 Py_ssize_t repsize;
5946 Py_ssize_t newpos;
5947 Py_UNICODE *uni2;
5948 Py_UNICODE *collstart;
5949 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005950
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005952 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 ++p;
5954 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005955 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 decimal = Py_UNICODE_TODECIMAL(ch);
5957 if (decimal >= 0) {
5958 *output++ = '0' + decimal;
5959 ++p;
5960 continue;
5961 }
5962 if (0 < ch && ch < 256) {
5963 *output++ = (char)ch;
5964 ++p;
5965 continue;
5966 }
5967 /* All other characters are considered unencodable */
5968 collstart = p;
5969 collend = p+1;
5970 while (collend < end) {
5971 if ((0 < *collend && *collend < 256) ||
5972 !Py_UNICODE_ISSPACE(*collend) ||
5973 Py_UNICODE_TODECIMAL(*collend))
5974 break;
5975 }
5976 /* cache callback name lookup
5977 * (if not done yet, i.e. it's the first error) */
5978 if (known_errorHandler==-1) {
5979 if ((errors==NULL) || (!strcmp(errors, "strict")))
5980 known_errorHandler = 1;
5981 else if (!strcmp(errors, "replace"))
5982 known_errorHandler = 2;
5983 else if (!strcmp(errors, "ignore"))
5984 known_errorHandler = 3;
5985 else if (!strcmp(errors, "xmlcharrefreplace"))
5986 known_errorHandler = 4;
5987 else
5988 known_errorHandler = 0;
5989 }
5990 switch (known_errorHandler) {
5991 case 1: /* strict */
5992 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5993 goto onError;
5994 case 2: /* replace */
5995 for (p = collstart; p < collend; ++p)
5996 *output++ = '?';
5997 /* fall through */
5998 case 3: /* ignore */
5999 p = collend;
6000 break;
6001 case 4: /* xmlcharrefreplace */
6002 /* generate replacement (temporarily (mis)uses p) */
6003 for (p = collstart; p < collend; ++p)
6004 output += sprintf(output, "&#%d;", (int)*p);
6005 p = collend;
6006 break;
6007 default:
6008 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6009 encoding, reason, s, length, &exc,
6010 collstart-s, collend-s, &newpos);
6011 if (repunicode == NULL)
6012 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006013 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006014 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006015 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6016 Py_DECREF(repunicode);
6017 goto onError;
6018 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 /* generate replacement */
6020 repsize = PyUnicode_GET_SIZE(repunicode);
6021 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6022 Py_UNICODE ch = *uni2;
6023 if (Py_UNICODE_ISSPACE(ch))
6024 *output++ = ' ';
6025 else {
6026 decimal = Py_UNICODE_TODECIMAL(ch);
6027 if (decimal >= 0)
6028 *output++ = '0' + decimal;
6029 else if (0 < ch && ch < 256)
6030 *output++ = (char)ch;
6031 else {
6032 Py_DECREF(repunicode);
6033 raise_encode_exception(&exc, encoding,
6034 s, length, collstart-s, collend-s, reason);
6035 goto onError;
6036 }
6037 }
6038 }
6039 p = s + newpos;
6040 Py_DECREF(repunicode);
6041 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006042 }
6043 /* 0-terminate the output string */
6044 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 Py_XDECREF(exc);
6046 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006047 return 0;
6048
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 Py_XDECREF(exc);
6051 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006052 return -1;
6053}
6054
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055/* --- Helpers ------------------------------------------------------------ */
6056
Eric Smith8c663262007-08-25 02:26:07 +00006057#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006058#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006059
Thomas Wouters477c8d52006-05-27 19:21:47 +00006060#include "stringlib/count.h"
6061#include "stringlib/find.h"
6062#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006063#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006064
Eric Smith5807c412008-05-11 21:00:57 +00006065#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006066#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006067#include "stringlib/localeutil.h"
6068
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the length of
   the sequence being sliced.  `end` is clamped to `len` from above;
   negative values of either index are taken relative to `len` and
   clamped to 0 if still negative.  `start` is not clamped from above.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to exactly one statement and can safely be used as the body of an
   unbraced if/else.  Invoke it as a statement followed by ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006083
Martin v. Löwis18e16552006-02-15 17:27:45 +00006084Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006085 PyObject *substr,
6086 Py_ssize_t start,
6087 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006089 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006090 PyUnicodeObject* str_obj;
6091 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006092
Thomas Wouters477c8d52006-05-27 19:21:47 +00006093 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6094 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006096 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6097 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 Py_DECREF(str_obj);
6099 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 }
Tim Petersced69f82003-09-16 20:30:58 +00006101
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006102 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006103 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006104 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6105 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006106 );
6107
6108 Py_DECREF(sub_obj);
6109 Py_DECREF(str_obj);
6110
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 return result;
6112}
6113
Martin v. Löwis18e16552006-02-15 17:27:45 +00006114Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006115 PyObject *sub,
6116 Py_ssize_t start,
6117 Py_ssize_t end,
6118 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006120 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006121
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006123 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006125 sub = PyUnicode_FromObject(sub);
6126 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 Py_DECREF(str);
6128 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 }
Tim Petersced69f82003-09-16 20:30:58 +00006130
Thomas Wouters477c8d52006-05-27 19:21:47 +00006131 if (direction > 0)
6132 result = stringlib_find_slice(
6133 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6134 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6135 start, end
6136 );
6137 else
6138 result = stringlib_rfind_slice(
6139 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6140 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6141 start, end
6142 );
6143
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006145 Py_DECREF(sub);
6146
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 return result;
6148}
6149
Tim Petersced69f82003-09-16 20:30:58 +00006150static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 PyUnicodeObject *substring,
6153 Py_ssize_t start,
6154 Py_ssize_t end,
6155 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 if (substring->length == 0)
6158 return 1;
6159
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006160 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 end -= substring->length;
6162 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
6165 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 if (Py_UNICODE_MATCH(self, end, substring))
6167 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 } else {
6169 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 }
6172
6173 return 0;
6174}
6175
Martin v. Löwis18e16552006-02-15 17:27:45 +00006176Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 PyObject *substr,
6178 Py_ssize_t start,
6179 Py_ssize_t end,
6180 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006182 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006183
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 str = PyUnicode_FromObject(str);
6185 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 substr = PyUnicode_FromObject(substr);
6188 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 Py_DECREF(str);
6190 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 }
Tim Petersced69f82003-09-16 20:30:58 +00006192
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 (PyUnicodeObject *)substr,
6195 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 Py_DECREF(str);
6197 Py_DECREF(substr);
6198 return result;
6199}
6200
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201/* Apply fixfct filter to the Unicode object self and return a
6202 reference to the modified object */
6203
Tim Petersced69f82003-09-16 20:30:58 +00006204static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207{
6208
6209 PyUnicodeObject *u;
6210
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006211 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006214
6215 Py_UNICODE_COPY(u->str, self->str, self->length);
6216
Tim Peters7a29bd52001-09-12 03:03:31 +00006217 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 /* fixfct should return TRUE if it modified the buffer. If
6219 FALSE, return a reference to the original buffer instead
6220 (to save space, not time) */
6221 Py_INCREF(self);
6222 Py_DECREF(u);
6223 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 }
6225 return (PyObject*) u;
6226}
6227
Tim Petersced69f82003-09-16 20:30:58 +00006228static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229int fixupper(PyUnicodeObject *self)
6230{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006231 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 Py_UNICODE *s = self->str;
6233 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006234
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006237
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 ch = Py_UNICODE_TOUPPER(*s);
6239 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 *s = ch;
6242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 s++;
6244 }
6245
6246 return status;
6247}
6248
Tim Petersced69f82003-09-16 20:30:58 +00006249static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250int fixlower(PyUnicodeObject *self)
6251{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006252 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 Py_UNICODE *s = self->str;
6254 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006255
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006258
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 ch = Py_UNICODE_TOLOWER(*s);
6260 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 *s = ch;
6263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 s++;
6265 }
6266
6267 return status;
6268}
6269
Tim Petersced69f82003-09-16 20:30:58 +00006270static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271int fixswapcase(PyUnicodeObject *self)
6272{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006273 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 Py_UNICODE *s = self->str;
6275 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006276
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 while (len-- > 0) {
6278 if (Py_UNICODE_ISUPPER(*s)) {
6279 *s = Py_UNICODE_TOLOWER(*s);
6280 status = 1;
6281 } else if (Py_UNICODE_ISLOWER(*s)) {
6282 *s = Py_UNICODE_TOUPPER(*s);
6283 status = 1;
6284 }
6285 s++;
6286 }
6287
6288 return status;
6289}
6290
Tim Petersced69f82003-09-16 20:30:58 +00006291static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292int fixcapitalize(PyUnicodeObject *self)
6293{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006294 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006295 Py_UNICODE *s = self->str;
6296 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006297
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006298 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006300 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 *s = Py_UNICODE_TOUPPER(*s);
6302 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006304 s++;
6305 while (--len > 0) {
6306 if (Py_UNICODE_ISUPPER(*s)) {
6307 *s = Py_UNICODE_TOLOWER(*s);
6308 status = 1;
6309 }
6310 s++;
6311 }
6312 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313}
6314
6315static
6316int fixtitle(PyUnicodeObject *self)
6317{
6318 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6319 register Py_UNICODE *e;
6320 int previous_is_cased;
6321
6322 /* Shortcut for single character strings */
6323 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6325 if (*p != ch) {
6326 *p = ch;
6327 return 1;
6328 }
6329 else
6330 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 }
Tim Petersced69f82003-09-16 20:30:58 +00006332
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 e = p + PyUnicode_GET_SIZE(self);
6334 previous_is_cased = 0;
6335 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006337
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 if (previous_is_cased)
6339 *p = Py_UNICODE_TOLOWER(ch);
6340 else
6341 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006342
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 if (Py_UNICODE_ISLOWER(ch) ||
6344 Py_UNICODE_ISUPPER(ch) ||
6345 Py_UNICODE_ISTITLE(ch))
6346 previous_is_cased = 1;
6347 else
6348 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 }
6350 return 1;
6351}
6352
Tim Peters8ce9f162004-08-27 01:49:32 +00006353PyObject *
6354PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355{
Skip Montanaro6543b452004-09-16 03:28:13 +00006356 const Py_UNICODE blank = ' ';
6357 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006358 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006359 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006360 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6361 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006362 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6363 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006364 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006365 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366
Tim Peters05eba1f2004-08-27 21:32:02 +00006367 fseq = PySequence_Fast(seq, "");
6368 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006369 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006370 }
6371
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006372 /* NOTE: the following code can't call back into Python code,
6373 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006374 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006375
Tim Peters05eba1f2004-08-27 21:32:02 +00006376 seqlen = PySequence_Fast_GET_SIZE(fseq);
6377 /* If empty sequence, return u"". */
6378 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006379 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6380 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006381 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006382 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006383 /* If singleton sequence with an exact Unicode, return that. */
6384 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 item = items[0];
6386 if (PyUnicode_CheckExact(item)) {
6387 Py_INCREF(item);
6388 res = (PyUnicodeObject *)item;
6389 goto Done;
6390 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006391 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006392 else {
6393 /* Set up sep and seplen */
6394 if (separator == NULL) {
6395 sep = &blank;
6396 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006397 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006398 else {
6399 if (!PyUnicode_Check(separator)) {
6400 PyErr_Format(PyExc_TypeError,
6401 "separator: expected str instance,"
6402 " %.80s found",
6403 Py_TYPE(separator)->tp_name);
6404 goto onError;
6405 }
6406 sep = PyUnicode_AS_UNICODE(separator);
6407 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006408 }
6409 }
6410
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006411 /* There are at least two things to join, or else we have a subclass
6412 * of str in the sequence.
6413 * Do a pre-pass to figure out the total amount of space we'll
6414 * need (sz), and see whether all argument are strings.
6415 */
6416 sz = 0;
6417 for (i = 0; i < seqlen; i++) {
6418 const Py_ssize_t old_sz = sz;
6419 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 if (!PyUnicode_Check(item)) {
6421 PyErr_Format(PyExc_TypeError,
6422 "sequence item %zd: expected str instance,"
6423 " %.80s found",
6424 i, Py_TYPE(item)->tp_name);
6425 goto onError;
6426 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006427 sz += PyUnicode_GET_SIZE(item);
6428 if (i != 0)
6429 sz += seplen;
6430 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6431 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006433 goto onError;
6434 }
6435 }
Tim Petersced69f82003-09-16 20:30:58 +00006436
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006437 res = _PyUnicode_New(sz);
6438 if (res == NULL)
6439 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006440
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006441 /* Catenate everything. */
6442 res_p = PyUnicode_AS_UNICODE(res);
6443 for (i = 0; i < seqlen; ++i) {
6444 Py_ssize_t itemlen;
6445 item = items[i];
6446 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 /* Copy item, and maybe the separator. */
6448 if (i) {
6449 Py_UNICODE_COPY(res_p, sep, seplen);
6450 res_p += seplen;
6451 }
6452 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6453 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006454 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006455
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006457 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 return (PyObject *)res;
6459
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006461 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006462 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 return NULL;
6464}
6465
Tim Petersced69f82003-09-16 20:30:58 +00006466static
6467PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 Py_ssize_t left,
6469 Py_ssize_t right,
6470 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471{
6472 PyUnicodeObject *u;
6473
6474 if (left < 0)
6475 left = 0;
6476 if (right < 0)
6477 right = 0;
6478
Tim Peters7a29bd52001-09-12 03:03:31 +00006479 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 Py_INCREF(self);
6481 return self;
6482 }
6483
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006484 if (left > PY_SSIZE_T_MAX - self->length ||
6485 right > PY_SSIZE_T_MAX - (left + self->length)) {
6486 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6487 return NULL;
6488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 u = _PyUnicode_New(left + self->length + right);
6490 if (u) {
6491 if (left)
6492 Py_UNICODE_FILL(u->str, fill, left);
6493 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6494 if (right)
6495 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6496 }
6497
6498 return u;
6499}
6500
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006501PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504
6505 string = PyUnicode_FromObject(string);
6506 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006509 list = stringlib_splitlines(
6510 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6511 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512
6513 Py_DECREF(string);
6514 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515}
6516
Tim Petersced69f82003-09-16 20:30:58 +00006517static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 PyUnicodeObject *substring,
6520 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006523 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006526 return stringlib_split_whitespace(
6527 (PyObject*) self, self->str, self->length, maxcount
6528 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006530 return stringlib_split(
6531 (PyObject*) self, self->str, self->length,
6532 substring->str, substring->length,
6533 maxcount
6534 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535}
6536
Tim Petersced69f82003-09-16 20:30:58 +00006537static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006538PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 PyUnicodeObject *substring,
6540 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006541{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006542 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006543 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006544
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006545 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006546 return stringlib_rsplit_whitespace(
6547 (PyObject*) self, self->str, self->length, maxcount
6548 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006549
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006550 return stringlib_rsplit(
6551 (PyObject*) self, self->str, self->length,
6552 substring->str, substring->length,
6553 maxcount
6554 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006555}
6556
6557static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 PyUnicodeObject *str1,
6560 PyUnicodeObject *str2,
6561 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562{
6563 PyUnicodeObject *u;
6564
6565 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006567 else if (maxcount == 0 || self->length == 0)
6568 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569
Thomas Wouters477c8d52006-05-27 19:21:47 +00006570 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006571 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006572 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006573 if (str1->length == 0)
6574 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006575 if (str1->length == 1) {
6576 /* replace characters */
6577 Py_UNICODE u1, u2;
6578 if (!findchar(self->str, self->length, str1->str[0]))
6579 goto nothing;
6580 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6581 if (!u)
6582 return NULL;
6583 Py_UNICODE_COPY(u->str, self->str, self->length);
6584 u1 = str1->str[0];
6585 u2 = str2->str[0];
6586 for (i = 0; i < u->length; i++)
6587 if (u->str[i] == u1) {
6588 if (--maxcount < 0)
6589 break;
6590 u->str[i] = u2;
6591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006593 i = stringlib_find(
6594 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006596 if (i < 0)
6597 goto nothing;
6598 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6599 if (!u)
6600 return NULL;
6601 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006602
6603 /* change everything in-place, starting with this one */
6604 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6605 i += str1->length;
6606
6607 while ( --maxcount > 0) {
6608 i = stringlib_find(self->str+i, self->length-i,
6609 str1->str, str1->length,
6610 i);
6611 if (i == -1)
6612 break;
6613 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6614 i += str1->length;
6615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006618
6619 Py_ssize_t n, i, j, e;
6620 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 Py_UNICODE *p;
6622
6623 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006624 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6625 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006626 if (n == 0)
6627 goto nothing;
6628 /* new_size = self->length + n * (str2->length - str1->length)); */
6629 delta = (str2->length - str1->length);
6630 if (delta == 0) {
6631 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006633 product = n * (str2->length - str1->length);
6634 if ((product / (str2->length - str1->length)) != n) {
6635 PyErr_SetString(PyExc_OverflowError,
6636 "replace string is too long");
6637 return NULL;
6638 }
6639 new_size = self->length + product;
6640 if (new_size < 0) {
6641 PyErr_SetString(PyExc_OverflowError,
6642 "replace string is too long");
6643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 }
6645 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006646 u = _PyUnicode_New(new_size);
6647 if (!u)
6648 return NULL;
6649 i = 0;
6650 p = u->str;
6651 e = self->length - str1->length;
6652 if (str1->length > 0) {
6653 while (n-- > 0) {
6654 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006655 j = stringlib_find(self->str+i, self->length-i,
6656 str1->str, str1->length,
6657 i);
6658 if (j == -1)
6659 break;
6660 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006661 /* copy unchanged part [i:j] */
6662 Py_UNICODE_COPY(p, self->str+i, j-i);
6663 p += j - i;
6664 }
6665 /* copy substitution string */
6666 if (str2->length > 0) {
6667 Py_UNICODE_COPY(p, str2->str, str2->length);
6668 p += str2->length;
6669 }
6670 i = j + str1->length;
6671 }
6672 if (i < self->length)
6673 /* copy tail [i:] */
6674 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6675 } else {
6676 /* interleave */
6677 while (n > 0) {
6678 Py_UNICODE_COPY(p, str2->str, str2->length);
6679 p += str2->length;
6680 if (--n <= 0)
6681 break;
6682 *p++ = self->str[i++];
6683 }
6684 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006688
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006690 /* nothing to replace; return original string (when possible) */
6691 if (PyUnicode_CheckExact(self)) {
6692 Py_INCREF(self);
6693 return (PyObject *) self;
6694 }
6695 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696}
6697
6698/* --- Unicode Object Methods --------------------------------------------- */
6699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006700PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702\n\
6703Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006704characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705
6706static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006707unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709 return fixup(self, fixtitle);
6710}
6711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714\n\
6715Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006716have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717
6718static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006719unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 return fixup(self, fixcapitalize);
6722}
6723
#if 0
/* NOTE: everything up to the matching #endif is compiled out.  It is a
   str.capwords() method: split on whitespace, capitalize every word,
   and join the words again. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* item is NULL here, so the shared exit below returns NULL */
        if (item == NULL)
            goto onError;
        /* drop the old word before overwriting its slot; the slot takes
           over the reference to the capitalized replacement */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* shared exit for success and failure: the list is always released */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6761
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006762/* Argument converter. Coerces to a single unicode character */
6763
6764static int
6765convert_uc(PyObject *obj, void *addr)
6766{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006767 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6768 PyObject *uniobj;
6769 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006770
Benjamin Peterson14339b62009-01-31 16:36:08 +00006771 uniobj = PyUnicode_FromObject(obj);
6772 if (uniobj == NULL) {
6773 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006775 return 0;
6776 }
6777 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6778 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006780 Py_DECREF(uniobj);
6781 return 0;
6782 }
6783 unistr = PyUnicode_AS_UNICODE(uniobj);
6784 *fillcharloc = unistr[0];
6785 Py_DECREF(uniobj);
6786 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006787}
6788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006789PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006792Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006793done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794
6795static PyObject *
6796unicode_center(PyUnicodeObject *self, PyObject *args)
6797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006798 Py_ssize_t marg, left;
6799 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006800 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801
Thomas Woutersde017742006-02-16 19:34:37 +00006802 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 return NULL;
6804
Tim Peters7a29bd52001-09-12 03:03:31 +00006805 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 Py_INCREF(self);
6807 return (PyObject*) self;
6808 }
6809
6810 marg = width - self->length;
6811 left = marg / 2 + (marg & width & 1);
6812
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006813 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814}
6815
#if 0

/* NOTE: this branch is compiled out; the #else branch below provides
   the unicode_compare() that is actually built. */

/* This code should go into some future Unicode collation support
   module. The basic comparison should compare ordinals on a naive
   basis (this is what Java does and thus Jython too). */

/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* Adjustment added to a code unit, indexed by (unit >> 11).
   NOTE(review): the 32-entry table only covers 16-bit code units;
   presumably this code assumes a 16-bit Py_UNICODE -- confirm before
   re-enabling. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Three-way comparison of two unicode objects after remapping each code
   unit through utf16Fixup.  Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* only units above (1<<11) * 26 == 0xD000 are looked up in the
           fixup table */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix is equal: the shorter string sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}

#else
6867
6868static int
6869unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6870{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006871 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006872
6873 Py_UNICODE *s1 = str1->str;
6874 Py_UNICODE *s2 = str2->str;
6875
6876 len1 = str1->length;
6877 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006878
Marc-André Lemburge5034372000-08-08 08:04:29 +00006879 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006880 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006881
Fredrik Lundh45714e92001-06-26 16:39:36 +00006882 c1 = *s1++;
6883 c2 = *s2++;
6884
6885 if (c1 != c2)
6886 return (c1 < c2) ? -1 : 1;
6887
Marc-André Lemburge5034372000-08-08 08:04:29 +00006888 len1--; len2--;
6889 }
6890
6891 return (len1 < len2) ? -1 : (len1 != len2);
6892}
6893
6894#endif
6895
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006899 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6900 return unicode_compare((PyUnicodeObject *)left,
6901 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006902 PyErr_Format(PyExc_TypeError,
6903 "Can't compare %.100s and %.100s",
6904 left->ob_type->tp_name,
6905 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 return -1;
6907}
6908
Martin v. Löwis5b222132007-06-10 09:51:05 +00006909int
6910PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6911{
6912 int i;
6913 Py_UNICODE *id;
6914 assert(PyUnicode_Check(uni));
6915 id = PyUnicode_AS_UNICODE(uni);
6916 /* Compare Unicode string and source character set string */
6917 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 if (id[i] != str[i])
6919 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006920 /* This check keeps Python strings that end in '\0' from comparing equal
6921 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006922 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006924 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006926 return 0;
6927}
6928
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006929
Benjamin Peterson29060642009-01-31 22:14:21 +00006930#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006931 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006932
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006933PyObject *PyUnicode_RichCompare(PyObject *left,
6934 PyObject *right,
6935 int op)
6936{
6937 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006938
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006939 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6940 PyObject *v;
6941 if (((PyUnicodeObject *) left)->length !=
6942 ((PyUnicodeObject *) right)->length) {
6943 if (op == Py_EQ) {
6944 Py_INCREF(Py_False);
6945 return Py_False;
6946 }
6947 if (op == Py_NE) {
6948 Py_INCREF(Py_True);
6949 return Py_True;
6950 }
6951 }
6952 if (left == right)
6953 result = 0;
6954 else
6955 result = unicode_compare((PyUnicodeObject *)left,
6956 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006957
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006958 /* Convert the return value to a Boolean */
6959 switch (op) {
6960 case Py_EQ:
6961 v = TEST_COND(result == 0);
6962 break;
6963 case Py_NE:
6964 v = TEST_COND(result != 0);
6965 break;
6966 case Py_LE:
6967 v = TEST_COND(result <= 0);
6968 break;
6969 case Py_GE:
6970 v = TEST_COND(result >= 0);
6971 break;
6972 case Py_LT:
6973 v = TEST_COND(result == -1);
6974 break;
6975 case Py_GT:
6976 v = TEST_COND(result == 1);
6977 break;
6978 default:
6979 PyErr_BadArgument();
6980 return NULL;
6981 }
6982 Py_INCREF(v);
6983 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006984 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006986 Py_INCREF(Py_NotImplemented);
6987 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006988}
6989
Guido van Rossum403d68b2000-03-13 15:55:09 +00006990int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006992{
Thomas Wouters477c8d52006-05-27 19:21:47 +00006993 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006994 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006995
6996 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00006997 sub = PyUnicode_FromObject(element);
6998 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 PyErr_Format(PyExc_TypeError,
7000 "'in <string>' requires string as left operand, not %s",
7001 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007002 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007003 }
7004
Thomas Wouters477c8d52006-05-27 19:21:47 +00007005 str = PyUnicode_FromObject(container);
7006 if (!str) {
7007 Py_DECREF(sub);
7008 return -1;
7009 }
7010
7011 result = stringlib_contains_obj(str, sub);
7012
7013 Py_DECREF(str);
7014 Py_DECREF(sub);
7015
Guido van Rossum403d68b2000-03-13 15:55:09 +00007016 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007017}
7018
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019/* Concat to string or Unicode object giving a new Unicode object. */
7020
7021PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023{
7024 PyUnicodeObject *u = NULL, *v = NULL, *w;
7025
7026 /* Coerce the two arguments */
7027 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7028 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7031 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033
7034 /* Shortcuts */
7035 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 Py_DECREF(v);
7037 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 }
7039 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 Py_DECREF(u);
7041 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 }
7043
7044 /* Concat the two Unicode strings */
7045 w = _PyUnicode_New(u->length + v->length);
7046 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 Py_UNICODE_COPY(w->str, u->str, u->length);
7049 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7050
7051 Py_DECREF(u);
7052 Py_DECREF(v);
7053 return (PyObject *)w;
7054
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 Py_XDECREF(u);
7057 Py_XDECREF(v);
7058 return NULL;
7059}
7060
Walter Dörwald1ab83302007-05-18 17:15:44 +00007061void
7062PyUnicode_Append(PyObject **pleft, PyObject *right)
7063{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007064 PyObject *new;
7065 if (*pleft == NULL)
7066 return;
7067 if (right == NULL || !PyUnicode_Check(*pleft)) {
7068 Py_DECREF(*pleft);
7069 *pleft = NULL;
7070 return;
7071 }
7072 new = PyUnicode_Concat(*pleft, right);
7073 Py_DECREF(*pleft);
7074 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007075}
7076
7077void
7078PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7079{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007080 PyUnicode_Append(pleft, right);
7081 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007082}
7083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007084PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007087Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007088string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007089interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090
7091static PyObject *
7092unicode_count(PyUnicodeObject *self, PyObject *args)
7093{
7094 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007095 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007096 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 PyObject *result;
7098
Guido van Rossumb8872e62000-05-09 14:14:27 +00007099 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007101 return NULL;
7102
7103 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007104 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007107
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007108 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007109 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007110 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007111 substring->str, substring->length,
7112 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007113 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114
7115 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007116
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117 return result;
7118}
7119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007120PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007123Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007124to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007125handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007126a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7127'xmlcharrefreplace' as well as any other name registered with\n\
7128codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129
7130static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007131unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007133 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 char *encoding = NULL;
7135 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007136 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007137
Benjamin Peterson308d6372009-09-18 21:42:35 +00007138 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7139 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007141 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007142 if (v == NULL)
7143 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007144 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007145 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007146 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007147 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007148 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007149 Py_DECREF(v);
7150 return NULL;
7151 }
7152 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007153
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007155 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007156}
7157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007158PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160\n\
7161Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007162If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163
7164static PyObject*
7165unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7166{
7167 Py_UNICODE *e;
7168 Py_UNICODE *p;
7169 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007170 Py_UNICODE *qe;
7171 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 PyUnicodeObject *u;
7173 int tabsize = 8;
7174
7175 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177
Thomas Wouters7e474022000-07-16 12:04:32 +00007178 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007179 i = 0; /* chars up to and including most recent \n or \r */
7180 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7181 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 for (p = self->str; p < e; p++)
7183 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 if (tabsize > 0) {
7185 incr = tabsize - (j % tabsize); /* cannot overflow */
7186 if (j > PY_SSIZE_T_MAX - incr)
7187 goto overflow1;
7188 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007189 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 if (j > PY_SSIZE_T_MAX - 1)
7193 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194 j++;
7195 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 if (i > PY_SSIZE_T_MAX - j)
7197 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007199 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 }
7201 }
7202
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007203 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007205
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 /* Second pass: create output string and fill it */
7207 u = _PyUnicode_New(i + j);
7208 if (!u)
7209 return NULL;
7210
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007211 j = 0; /* same as in first pass */
7212 q = u->str; /* next output char */
7213 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214
7215 for (p = self->str; p < e; p++)
7216 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 if (tabsize > 0) {
7218 i = tabsize - (j % tabsize);
7219 j += i;
7220 while (i--) {
7221 if (q >= qe)
7222 goto overflow2;
7223 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007224 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007226 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 else {
7228 if (q >= qe)
7229 goto overflow2;
7230 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007231 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 if (*p == '\n' || *p == '\r')
7233 j = 0;
7234 }
7235
7236 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007237
7238 overflow2:
7239 Py_DECREF(u);
7240 overflow1:
7241 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243}
7244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007245PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247\n\
7248Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007249such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250arguments start and end are interpreted as in slice notation.\n\
7251\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007252Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253
7254static PyObject *
7255unicode_find(PyUnicodeObject *self, PyObject *args)
7256{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007257 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007258 Py_ssize_t start;
7259 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007260 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261
Christian Heimes9cd17752007-11-18 19:35:23 +00007262 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264
Thomas Wouters477c8d52006-05-27 19:21:47 +00007265 result = stringlib_find_slice(
7266 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7267 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7268 start, end
7269 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270
7271 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007272
Christian Heimes217cfd12007-12-02 14:31:20 +00007273 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274}
7275
7276static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007277unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278{
7279 if (index < 0 || index >= self->length) {
7280 PyErr_SetString(PyExc_IndexError, "string index out of range");
7281 return NULL;
7282 }
7283
7284 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7285}
7286
Guido van Rossumc2504932007-09-18 19:42:40 +00007287/* Believe it or not, this produces the same value for ASCII strings
7288 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007290unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291{
Guido van Rossumc2504932007-09-18 19:42:40 +00007292 Py_ssize_t len;
7293 Py_UNICODE *p;
7294 long x;
7295
7296 if (self->hash != -1)
7297 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007298 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007299 p = self->str;
7300 x = *p << 7;
7301 while (--len >= 0)
7302 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007303 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007304 if (x == -1)
7305 x = -2;
7306 self->hash = x;
7307 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308}
7309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007310PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007313Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314
7315static PyObject *
7316unicode_index(PyUnicodeObject *self, PyObject *args)
7317{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007318 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007319 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007320 Py_ssize_t start;
7321 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322
Christian Heimes9cd17752007-11-18 19:35:23 +00007323 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325
Thomas Wouters477c8d52006-05-27 19:21:47 +00007326 result = stringlib_find_slice(
7327 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7328 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7329 start, end
7330 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331
7332 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007333
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 if (result < 0) {
7335 PyErr_SetString(PyExc_ValueError, "substring not found");
7336 return NULL;
7337 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007338
Christian Heimes217cfd12007-12-02 14:31:20 +00007339 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340}
7341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007342PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007345Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007346at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347
7348static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007349unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350{
7351 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7352 register const Py_UNICODE *e;
7353 int cased;
7354
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 /* Shortcut for single character strings */
7356 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007359 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007360 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007362
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 e = p + PyUnicode_GET_SIZE(self);
7364 cased = 0;
7365 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007367
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7369 return PyBool_FromLong(0);
7370 else if (!cased && Py_UNICODE_ISLOWER(ch))
7371 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007373 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374}
7375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007376PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007379Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007380at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381
7382static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007383unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384{
7385 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7386 register const Py_UNICODE *e;
7387 int cased;
7388
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 /* Shortcut for single character strings */
7390 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007393 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007394 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007396
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 e = p + PyUnicode_GET_SIZE(self);
7398 cased = 0;
7399 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007401
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7403 return PyBool_FromLong(0);
7404 else if (!cased && Py_UNICODE_ISUPPER(ch))
7405 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007407 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408}
7409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007410PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007413Return True if S is a titlecased string and there is at least one\n\
7414character in S, i.e. upper- and titlecase characters may only\n\
7415follow uncased characters and lowercase characters only cased ones.\n\
7416Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417
7418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007419unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420{
7421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7422 register const Py_UNICODE *e;
7423 int cased, previous_is_cased;
7424
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425 /* Shortcut for single character strings */
7426 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7428 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007430 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007431 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007433
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 e = p + PyUnicode_GET_SIZE(self);
7435 cased = 0;
7436 previous_is_cased = 0;
7437 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007439
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7441 if (previous_is_cased)
7442 return PyBool_FromLong(0);
7443 previous_is_cased = 1;
7444 cased = 1;
7445 }
7446 else if (Py_UNICODE_ISLOWER(ch)) {
7447 if (!previous_is_cased)
7448 return PyBool_FromLong(0);
7449 previous_is_cased = 1;
7450 cased = 1;
7451 }
7452 else
7453 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007455 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456}
7457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007458PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007461Return True if all characters in S are whitespace\n\
7462and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463
7464static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007465unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466{
7467 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7468 register const Py_UNICODE *e;
7469
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 /* Shortcut for single character strings */
7471 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 Py_UNICODE_ISSPACE(*p))
7473 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007475 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007476 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007478
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 e = p + PyUnicode_GET_SIZE(self);
7480 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 if (!Py_UNICODE_ISSPACE(*p))
7482 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007484 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485}
7486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007487PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007489\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007490Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007491and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007492
7493static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007494unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007495{
7496 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7497 register const Py_UNICODE *e;
7498
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007499 /* Shortcut for single character strings */
7500 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 Py_UNICODE_ISALPHA(*p))
7502 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007503
7504 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007505 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007507
7508 e = p + PyUnicode_GET_SIZE(self);
7509 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 if (!Py_UNICODE_ISALPHA(*p))
7511 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007512 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007513 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007514}
7515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007516PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007518\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007519Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007520and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007521
7522static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007523unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007524{
7525 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7526 register const Py_UNICODE *e;
7527
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007528 /* Shortcut for single character strings */
7529 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 Py_UNICODE_ISALNUM(*p))
7531 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007532
7533 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007534 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007536
7537 e = p + PyUnicode_GET_SIZE(self);
7538 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 if (!Py_UNICODE_ISALNUM(*p))
7540 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007541 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007542 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007543}
7544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007545PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007548Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007549False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550
7551static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007552unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553{
7554 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7555 register const Py_UNICODE *e;
7556
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557 /* Shortcut for single character strings */
7558 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 Py_UNICODE_ISDECIMAL(*p))
7560 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007562 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007563 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007565
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 e = p + PyUnicode_GET_SIZE(self);
7567 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 if (!Py_UNICODE_ISDECIMAL(*p))
7569 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007571 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572}
7573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007574PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007577Return True if all characters in S are digits\n\
7578and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579
7580static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007581unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582{
7583 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7584 register const Py_UNICODE *e;
7585
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 /* Shortcut for single character strings */
7587 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 Py_UNICODE_ISDIGIT(*p))
7589 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007591 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007592 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007594
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 e = p + PyUnicode_GET_SIZE(self);
7596 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 if (!Py_UNICODE_ISDIGIT(*p))
7598 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007600 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601}
7602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007603PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007606Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007607False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608
7609static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007610unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611{
7612 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7613 register const Py_UNICODE *e;
7614
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 /* Shortcut for single character strings */
7616 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 Py_UNICODE_ISNUMERIC(*p))
7618 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007620 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007621 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007623
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624 e = p + PyUnicode_GET_SIZE(self);
7625 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 if (!Py_UNICODE_ISNUMERIC(*p))
7627 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007629 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630}
7631
Martin v. Löwis47383402007-08-15 07:32:56 +00007632int
7633PyUnicode_IsIdentifier(PyObject *self)
7634{
7635 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7636 register const Py_UNICODE *e;
7637
7638 /* Special case for empty strings */
7639 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007641
7642 /* PEP 3131 says that the first character must be in
7643 XID_Start and subsequent characters in XID_Continue,
7644 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007645 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007646 letters, digits, underscore). However, given the current
7647 definition of XID_Start and XID_Continue, it is sufficient
7648 to check just for these, except that _ must be allowed
7649 as starting an identifier. */
7650 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7651 return 0;
7652
7653 e = p + PyUnicode_GET_SIZE(self);
7654 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 if (!_PyUnicode_IsXidContinue(*p))
7656 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007657 }
7658 return 1;
7659}
7660
7661PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007663\n\
7664Return True if S is a valid identifier according\n\
7665to the language definition.");
7666
7667static PyObject*
7668unicode_isidentifier(PyObject *self)
7669{
7670 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7671}
7672
Georg Brandl559e5d72008-06-11 18:37:52 +00007673PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007675\n\
7676Return True if all characters in S are considered\n\
7677printable in repr() or S is empty, False otherwise.");
7678
7679static PyObject*
7680unicode_isprintable(PyObject *self)
7681{
7682 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7683 register const Py_UNICODE *e;
7684
7685 /* Shortcut for single character strings */
7686 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7687 Py_RETURN_TRUE;
7688 }
7689
7690 e = p + PyUnicode_GET_SIZE(self);
7691 for (; p < e; p++) {
7692 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7693 Py_RETURN_FALSE;
7694 }
7695 }
7696 Py_RETURN_TRUE;
7697}
7698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007699PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007700 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701\n\
7702Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007703iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704
7705static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007706unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007708 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709}
7710
Martin v. Löwis18e16552006-02-15 17:27:45 +00007711static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712unicode_length(PyUnicodeObject *self)
7713{
7714 return self->length;
7715}
7716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007717PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007720Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007721done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722
7723static PyObject *
7724unicode_ljust(PyUnicodeObject *self, PyObject *args)
7725{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007726 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007727 Py_UNICODE fillchar = ' ';
7728
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007729 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 return NULL;
7731
Tim Peters7a29bd52001-09-12 03:03:31 +00007732 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 Py_INCREF(self);
7734 return (PyObject*) self;
7735 }
7736
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007737 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738}
7739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007740PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007743Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744
7745static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007746unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 return fixup(self, fixlower);
7749}
7750
/* Selectors for the strip helpers; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its three
   leading characters ("|O:"). */
#define STRIPNAME(i) (&stripformat[i][3])
7759
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007760/* externally visible for str.strip(unicode) */
7761PyObject *
7762_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7763{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007764 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7765 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7766 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7767 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7768 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007769
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007771
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 i = 0;
7773 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7775 i++;
7776 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007778
Benjamin Peterson14339b62009-01-31 16:36:08 +00007779 j = len;
7780 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 do {
7782 j--;
7783 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7784 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007786
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 Py_INCREF(self);
7789 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007790 }
7791 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007793}
7794
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795
7796static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007797do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007799 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7800 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007801
Benjamin Peterson14339b62009-01-31 16:36:08 +00007802 i = 0;
7803 if (striptype != RIGHTSTRIP) {
7804 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7805 i++;
7806 }
7807 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007808
Benjamin Peterson14339b62009-01-31 16:36:08 +00007809 j = len;
7810 if (striptype != LEFTSTRIP) {
7811 do {
7812 j--;
7813 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7814 j++;
7815 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007816
Benjamin Peterson14339b62009-01-31 16:36:08 +00007817 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7818 Py_INCREF(self);
7819 return (PyObject*)self;
7820 }
7821 else
7822 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823}
7824
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007825
7826static PyObject *
7827do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7828{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007829 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007830
Benjamin Peterson14339b62009-01-31 16:36:08 +00007831 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7832 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007833
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834 if (sep != NULL && sep != Py_None) {
7835 if (PyUnicode_Check(sep))
7836 return _PyUnicode_XStrip(self, striptype, sep);
7837 else {
7838 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 "%s arg must be None or str",
7840 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 return NULL;
7842 }
7843 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007844
Benjamin Peterson14339b62009-01-31 16:36:08 +00007845 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007846}
7847
7848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007849PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007851\n\
7852Return a copy of the string S with leading and trailing\n\
7853whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007854If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007855
7856static PyObject *
7857unicode_strip(PyUnicodeObject *self, PyObject *args)
7858{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 if (PyTuple_GET_SIZE(args) == 0)
7860 return do_strip(self, BOTHSTRIP); /* Common case */
7861 else
7862 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007863}
7864
7865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007866PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007868\n\
7869Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007870If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007871
7872static PyObject *
7873unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7874{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007875 if (PyTuple_GET_SIZE(args) == 0)
7876 return do_strip(self, LEFTSTRIP); /* Common case */
7877 else
7878 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007879}
7880
7881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007882PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007884\n\
7885Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007886If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007887
7888static PyObject *
7889unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7890{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007891 if (PyTuple_GET_SIZE(args) == 0)
7892 return do_strip(self, RIGHTSTRIP); /* Common case */
7893 else
7894 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007895}
7896
7897
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007899unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900{
7901 PyUnicodeObject *u;
7902 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007903 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007904 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905
Georg Brandl222de0f2009-04-12 12:01:50 +00007906 if (len < 1) {
7907 Py_INCREF(unicode_empty);
7908 return (PyObject *)unicode_empty;
7909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910
Tim Peters7a29bd52001-09-12 03:03:31 +00007911 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 /* no repeat, return original string */
7913 Py_INCREF(str);
7914 return (PyObject*) str;
7915 }
Tim Peters8f422462000-09-09 06:13:41 +00007916
7917 /* ensure # of chars needed doesn't overflow int and # of bytes
7918 * needed doesn't overflow size_t
7919 */
7920 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007921 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007922 PyErr_SetString(PyExc_OverflowError,
7923 "repeated string is too long");
7924 return NULL;
7925 }
7926 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7927 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7928 PyErr_SetString(PyExc_OverflowError,
7929 "repeated string is too long");
7930 return NULL;
7931 }
7932 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 if (!u)
7934 return NULL;
7935
7936 p = u->str;
7937
Georg Brandl222de0f2009-04-12 12:01:50 +00007938 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007939 Py_UNICODE_FILL(p, str->str[0], len);
7940 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007941 Py_ssize_t done = str->length; /* number of characters copied this far */
7942 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007944 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007945 Py_UNICODE_COPY(p+done, p, n);
7946 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948 }
7949
7950 return (PyObject*) u;
7951}
7952
7953PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 PyObject *subobj,
7955 PyObject *replobj,
7956 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957{
7958 PyObject *self;
7959 PyObject *str1;
7960 PyObject *str2;
7961 PyObject *result;
7962
7963 self = PyUnicode_FromObject(obj);
7964 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 str1 = PyUnicode_FromObject(subobj);
7967 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 Py_DECREF(self);
7969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 }
7971 str2 = PyUnicode_FromObject(replobj);
7972 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 Py_DECREF(self);
7974 Py_DECREF(str1);
7975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 }
Tim Petersced69f82003-09-16 20:30:58 +00007977 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 (PyUnicodeObject *)str1,
7979 (PyUnicodeObject *)str2,
7980 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 Py_DECREF(self);
7982 Py_DECREF(str1);
7983 Py_DECREF(str2);
7984 return result;
7985}
7986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007987PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00007988 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989\n\
7990Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007991old replaced by new. If the optional argument count is\n\
7992given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993
7994static PyObject*
7995unicode_replace(PyUnicodeObject *self, PyObject *args)
7996{
7997 PyUnicodeObject *str1;
7998 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007999 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 PyObject *result;
8001
Martin v. Löwis18e16552006-02-15 17:27:45 +00008002 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 return NULL;
8004 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8005 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008008 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 Py_DECREF(str1);
8010 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
8013 result = replace(self, str1, str2, maxcount);
8014
8015 Py_DECREF(str1);
8016 Py_DECREF(str2);
8017 return result;
8018}
8019
8020static
8021PyObject *unicode_repr(PyObject *unicode)
8022{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008023 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008024 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008025 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8026 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8027
8028 /* XXX(nnorwitz): rather than over-allocating, it would be
8029 better to choose a different scheme. Perhaps scan the
8030 first N-chars of the string and allocate based on that size.
8031 */
8032 /* Initial allocation is based on the longest-possible unichr
8033 escape.
8034
8035 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8036 unichr, so in this case it's the longest unichr escape. In
8037 narrow (UTF-16) builds this is five chars per source unichr
8038 since there are two unichrs in the surrogate pair, so in narrow
8039 (UTF-16) builds it's not the longest unichr escape.
8040
8041 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8042 so in the narrow (UTF-16) build case it's the longest unichr
8043 escape.
8044 */
8045
Walter Dörwald1ab83302007-05-18 17:15:44 +00008046 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008048#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008050#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008052#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008054 if (repr == NULL)
8055 return NULL;
8056
Walter Dörwald1ab83302007-05-18 17:15:44 +00008057 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008058
8059 /* Add quote */
8060 *p++ = (findchar(s, size, '\'') &&
8061 !findchar(s, size, '"')) ? '"' : '\'';
8062 while (size-- > 0) {
8063 Py_UNICODE ch = *s++;
8064
8065 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008066 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008067 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008068 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008069 continue;
8070 }
8071
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008073 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008074 *p++ = '\\';
8075 *p++ = 't';
8076 }
8077 else if (ch == '\n') {
8078 *p++ = '\\';
8079 *p++ = 'n';
8080 }
8081 else if (ch == '\r') {
8082 *p++ = '\\';
8083 *p++ = 'r';
8084 }
8085
8086 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008087 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008088 *p++ = '\\';
8089 *p++ = 'x';
8090 *p++ = hexdigits[(ch >> 4) & 0x000F];
8091 *p++ = hexdigits[ch & 0x000F];
8092 }
8093
Georg Brandl559e5d72008-06-11 18:37:52 +00008094 /* Copy ASCII characters as-is */
8095 else if (ch < 0x7F) {
8096 *p++ = ch;
8097 }
8098
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008100 else {
8101 Py_UCS4 ucs = ch;
8102
8103#ifndef Py_UNICODE_WIDE
8104 Py_UNICODE ch2 = 0;
8105 /* Get code point from surrogate pair */
8106 if (size > 0) {
8107 ch2 = *s;
8108 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008112 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008113 size--;
8114 }
8115 }
8116#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008118 (categories Z* and C* except ASCII space)
8119 */
8120 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8121 /* Map 8-bit characters to '\xhh' */
8122 if (ucs <= 0xff) {
8123 *p++ = '\\';
8124 *p++ = 'x';
8125 *p++ = hexdigits[(ch >> 4) & 0x000F];
8126 *p++ = hexdigits[ch & 0x000F];
8127 }
8128 /* Map 21-bit characters to '\U00xxxxxx' */
8129 else if (ucs >= 0x10000) {
8130 *p++ = '\\';
8131 *p++ = 'U';
8132 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8133 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8134 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8135 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8136 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8137 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8138 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8139 *p++ = hexdigits[ucs & 0x0000000F];
8140 }
8141 /* Map 16-bit characters to '\uxxxx' */
8142 else {
8143 *p++ = '\\';
8144 *p++ = 'u';
8145 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8146 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8147 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8148 *p++ = hexdigits[ucs & 0x000F];
8149 }
8150 }
8151 /* Copy characters as-is */
8152 else {
8153 *p++ = ch;
8154#ifndef Py_UNICODE_WIDE
8155 if (ucs >= 0x10000)
8156 *p++ = ch2;
8157#endif
8158 }
8159 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008160 }
8161 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008162 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008163
8164 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008165 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008166 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167}
8168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008169PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171\n\
8172Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008173such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174arguments start and end are interpreted as in slice notation.\n\
8175\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008176Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
8178static PyObject *
8179unicode_rfind(PyUnicodeObject *self, PyObject *args)
8180{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008181 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008182 Py_ssize_t start;
8183 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008184 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185
Christian Heimes9cd17752007-11-18 19:35:23 +00008186 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188
Thomas Wouters477c8d52006-05-27 19:21:47 +00008189 result = stringlib_rfind_slice(
8190 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8191 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8192 start, end
8193 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194
8195 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008196
Christian Heimes217cfd12007-12-02 14:31:20 +00008197 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198}
8199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008200PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008203Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204
8205static PyObject *
8206unicode_rindex(PyUnicodeObject *self, PyObject *args)
8207{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008208 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008209 Py_ssize_t start;
8210 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008211 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212
Christian Heimes9cd17752007-11-18 19:35:23 +00008213 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215
Thomas Wouters477c8d52006-05-27 19:21:47 +00008216 result = stringlib_rfind_slice(
8217 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8218 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8219 start, end
8220 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221
8222 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008223
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 if (result < 0) {
8225 PyErr_SetString(PyExc_ValueError, "substring not found");
8226 return NULL;
8227 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008228 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229}
8230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008231PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008234Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008235done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236
8237static PyObject *
8238unicode_rjust(PyUnicodeObject *self, PyObject *args)
8239{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008240 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008241 Py_UNICODE fillchar = ' ';
8242
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008243 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244 return NULL;
8245
Tim Peters7a29bd52001-09-12 03:03:31 +00008246 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 Py_INCREF(self);
8248 return (PyObject*) self;
8249 }
8250
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008251 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252}
8253
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 PyObject *sep,
8256 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257{
8258 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008259
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 s = PyUnicode_FromObject(s);
8261 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008262 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 if (sep != NULL) {
8264 sep = PyUnicode_FromObject(sep);
8265 if (sep == NULL) {
8266 Py_DECREF(s);
8267 return NULL;
8268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 }
8270
8271 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8272
8273 Py_DECREF(s);
8274 Py_XDECREF(sep);
8275 return result;
8276}
8277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008278PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280\n\
8281Return a list of the words in S, using sep as the\n\
8282delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008283splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008284whitespace string is a separator and empty strings are\n\
8285removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286
8287static PyObject*
8288unicode_split(PyUnicodeObject *self, PyObject *args)
8289{
8290 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008291 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292
Martin v. Löwis18e16552006-02-15 17:27:45 +00008293 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 return NULL;
8295
8296 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302}
8303
Thomas Wouters477c8d52006-05-27 19:21:47 +00008304PyObject *
8305PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8306{
8307 PyObject* str_obj;
8308 PyObject* sep_obj;
8309 PyObject* out;
8310
8311 str_obj = PyUnicode_FromObject(str_in);
8312 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008314 sep_obj = PyUnicode_FromObject(sep_in);
8315 if (!sep_obj) {
8316 Py_DECREF(str_obj);
8317 return NULL;
8318 }
8319
8320 out = stringlib_partition(
8321 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8322 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8323 );
8324
8325 Py_DECREF(sep_obj);
8326 Py_DECREF(str_obj);
8327
8328 return out;
8329}
8330
8331
8332PyObject *
8333PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8334{
8335 PyObject* str_obj;
8336 PyObject* sep_obj;
8337 PyObject* out;
8338
8339 str_obj = PyUnicode_FromObject(str_in);
8340 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008342 sep_obj = PyUnicode_FromObject(sep_in);
8343 if (!sep_obj) {
8344 Py_DECREF(str_obj);
8345 return NULL;
8346 }
8347
8348 out = stringlib_rpartition(
8349 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8350 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8351 );
8352
8353 Py_DECREF(sep_obj);
8354 Py_DECREF(str_obj);
8355
8356 return out;
8357}
8358
8359PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008361\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008362Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008363the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008364found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008365
8366static PyObject*
8367unicode_partition(PyUnicodeObject *self, PyObject *separator)
8368{
8369 return PyUnicode_Partition((PyObject *)self, separator);
8370}
8371
8372PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008373 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008374\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008375Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008376the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008377separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008378
8379static PyObject*
8380unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8381{
8382 return PyUnicode_RPartition((PyObject *)self, separator);
8383}
8384
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008385PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 PyObject *sep,
8387 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008388{
8389 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008391 s = PyUnicode_FromObject(s);
8392 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008393 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 if (sep != NULL) {
8395 sep = PyUnicode_FromObject(sep);
8396 if (sep == NULL) {
8397 Py_DECREF(s);
8398 return NULL;
8399 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008400 }
8401
8402 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8403
8404 Py_DECREF(s);
8405 Py_XDECREF(sep);
8406 return result;
8407}
8408
8409PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008411\n\
8412Return a list of the words in S, using sep as the\n\
8413delimiter string, starting at the end of the string and\n\
8414working to the front. If maxsplit is given, at most maxsplit\n\
8415splits are done. If sep is not specified, any whitespace string\n\
8416is a separator.");
8417
8418static PyObject*
8419unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8420{
8421 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008422 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008423
Martin v. Löwis18e16552006-02-15 17:27:45 +00008424 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008425 return NULL;
8426
8427 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008429 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008431 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008433}
8434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008435PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437\n\
8438Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008439Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008440is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441
8442static PyObject*
8443unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8444{
Guido van Rossum86662912000-04-11 15:38:46 +00008445 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446
Guido van Rossum86662912000-04-11 15:38:46 +00008447 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 return NULL;
8449
Guido van Rossum86662912000-04-11 15:38:46 +00008450 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451}
8452
8453static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008454PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455{
Walter Dörwald346737f2007-05-31 10:44:43 +00008456 if (PyUnicode_CheckExact(self)) {
8457 Py_INCREF(self);
8458 return self;
8459 } else
8460 /* Subtype -- return genuine unicode string with the same value. */
8461 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8462 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463}
8464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008465PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467\n\
8468Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008469and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470
8471static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008472unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474 return fixup(self, fixswapcase);
8475}
8476
Georg Brandlceee0772007-11-27 23:48:05 +00008477PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008479\n\
8480Return a translation table usable for str.translate().\n\
8481If there is only one argument, it must be a dictionary mapping Unicode\n\
8482ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008483Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008484If there are two arguments, they must be strings of equal length, and\n\
8485in the resulting dictionary, each character in x will be mapped to the\n\
8486character at the same position in y. If there is a third argument, it\n\
8487must be a string, whose characters will be mapped to None in the result.");
8488
8489static PyObject*
8490unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8491{
8492 PyObject *x, *y = NULL, *z = NULL;
8493 PyObject *new = NULL, *key, *value;
8494 Py_ssize_t i = 0;
8495 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008496
Georg Brandlceee0772007-11-27 23:48:05 +00008497 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8498 return NULL;
8499 new = PyDict_New();
8500 if (!new)
8501 return NULL;
8502 if (y != NULL) {
8503 /* x must be a string too, of equal length */
8504 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8505 if (!PyUnicode_Check(x)) {
8506 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8507 "be a string if there is a second argument");
8508 goto err;
8509 }
8510 if (PyUnicode_GET_SIZE(x) != ylen) {
8511 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8512 "arguments must have equal length");
8513 goto err;
8514 }
8515 /* create entries for translating chars in x to those in y */
8516 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008517 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8518 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008519 if (!key || !value)
8520 goto err;
8521 res = PyDict_SetItem(new, key, value);
8522 Py_DECREF(key);
8523 Py_DECREF(value);
8524 if (res < 0)
8525 goto err;
8526 }
8527 /* create entries for deleting chars in z */
8528 if (z != NULL) {
8529 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008530 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008531 if (!key)
8532 goto err;
8533 res = PyDict_SetItem(new, key, Py_None);
8534 Py_DECREF(key);
8535 if (res < 0)
8536 goto err;
8537 }
8538 }
8539 } else {
8540 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008541 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008542 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8543 "to maketrans it must be a dict");
8544 goto err;
8545 }
8546 /* copy entries into the new dict, converting string keys to int keys */
8547 while (PyDict_Next(x, &i, &key, &value)) {
8548 if (PyUnicode_Check(key)) {
8549 /* convert string keys to integer keys */
8550 PyObject *newkey;
8551 if (PyUnicode_GET_SIZE(key) != 1) {
8552 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8553 "table must be of length 1");
8554 goto err;
8555 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008556 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008557 if (!newkey)
8558 goto err;
8559 res = PyDict_SetItem(new, newkey, value);
8560 Py_DECREF(newkey);
8561 if (res < 0)
8562 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008563 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008564 /* just keep integer keys */
8565 if (PyDict_SetItem(new, key, value) < 0)
8566 goto err;
8567 } else {
8568 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8569 "be strings or integers");
8570 goto err;
8571 }
8572 }
8573 }
8574 return new;
8575 err:
8576 Py_DECREF(new);
8577 return NULL;
8578}
8579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008580PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582\n\
8583Return a copy of the string S, where all characters have been mapped\n\
8584through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008585Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008586Unmapped characters are left untouched. Characters mapped to None\n\
8587are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
8589static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008590unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591{
Georg Brandlceee0772007-11-27 23:48:05 +00008592 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593}
8594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008595PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008598Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599
8600static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008601unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 return fixup(self, fixupper);
8604}
8605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008606PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008609Pad a numeric string S with zeros on the left, to fill a field\n\
8610of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
8612static PyObject *
8613unicode_zfill(PyUnicodeObject *self, PyObject *args)
8614{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008615 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 PyUnicodeObject *u;
8617
Martin v. Löwis18e16552006-02-15 17:27:45 +00008618 Py_ssize_t width;
8619 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 return NULL;
8621
8622 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008623 if (PyUnicode_CheckExact(self)) {
8624 Py_INCREF(self);
8625 return (PyObject*) self;
8626 }
8627 else
8628 return PyUnicode_FromUnicode(
8629 PyUnicode_AS_UNICODE(self),
8630 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 }
8633
8634 fill = width - self->length;
8635
8636 u = pad(self, fill, 0, '0');
8637
Walter Dörwald068325e2002-04-15 13:36:47 +00008638 if (u == NULL)
8639 return NULL;
8640
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 if (u->str[fill] == '+' || u->str[fill] == '-') {
8642 /* move sign to beginning of string */
8643 u->str[0] = u->str[fill];
8644 u->str[fill] = '0';
8645 }
8646
8647 return (PyObject*) u;
8648}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649
#if 0
/* Compiled out.  Returns the value of `numfree` as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008658PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008661Return True if S starts with the specified prefix, False otherwise.\n\
8662With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008663With optional end, stop comparing S at that position.\n\
8664prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665
8666static PyObject *
8667unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008670 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008672 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008673 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008674 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008676 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8678 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008679 if (PyTuple_Check(subobj)) {
8680 Py_ssize_t i;
8681 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8682 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008684 if (substring == NULL)
8685 return NULL;
8686 result = tailmatch(self, substring, start, end, -1);
8687 Py_DECREF(substring);
8688 if (result) {
8689 Py_RETURN_TRUE;
8690 }
8691 }
8692 /* nothing matched */
8693 Py_RETURN_FALSE;
8694 }
8695 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008698 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008700 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701}
8702
8703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008704PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008707Return True if S ends with the specified suffix, False otherwise.\n\
8708With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008709With optional end, stop comparing S at that position.\n\
8710suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711
8712static PyObject *
8713unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008716 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008718 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008719 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008720 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008722 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8724 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008725 if (PyTuple_Check(subobj)) {
8726 Py_ssize_t i;
8727 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8728 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008730 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008732 result = tailmatch(self, substring, start, end, +1);
8733 Py_DECREF(substring);
8734 if (result) {
8735 Py_RETURN_TRUE;
8736 }
8737 }
8738 Py_RETURN_FALSE;
8739 }
8740 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008744 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008746 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747}
8748
Eric Smith8c663262007-08-25 02:26:07 +00008749#include "stringlib/string_format.h"
8750
8751PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008753\n\
8754");
8755
Eric Smith4a7d76d2008-05-30 18:10:19 +00008756static PyObject *
8757unicode__format__(PyObject* self, PyObject* args)
8758{
8759 PyObject *format_spec;
8760
8761 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8762 return NULL;
8763
8764 return _PyUnicode_FormatAdvanced(self,
8765 PyUnicode_AS_UNICODE(format_spec),
8766 PyUnicode_GET_SIZE(format_spec));
8767}
8768
Eric Smith8c663262007-08-25 02:26:07 +00008769PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008771\n\
8772");
8773
8774static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008775unicode__sizeof__(PyUnicodeObject *v)
8776{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008777 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8778 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008779}
8780
8781PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008783
8784static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008785unicode_getnewargs(PyUnicodeObject *v)
8786{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008787 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008788}
8789
8790
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791static PyMethodDef unicode_methods[] = {
8792
8793 /* Order is according to common usage: often used methods should
8794 appear first, since lookup is done sequentially. */
8795
Benjamin Peterson308d6372009-09-18 21:42:35 +00008796 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008797 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8798 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008799 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008800 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8801 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8802 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8803 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8804 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8805 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8806 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008807 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008808 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8809 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8810 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008811 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008812 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8813 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8814 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008815 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008816 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008817 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008818 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008819 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8820 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8821 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8822 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8823 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8824 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8825 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8826 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8827 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8828 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8829 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8830 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8831 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8832 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008833 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008834 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008835 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008836 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008837 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008838 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8839 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008840 {"maketrans", (PyCFunction) unicode_maketrans,
8841 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008842 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008843#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008844 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845#endif
8846
8847#if 0
8848 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008849 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850#endif
8851
Benjamin Peterson14339b62009-01-31 16:36:08 +00008852 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 {NULL, NULL}
8854};
8855
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008856static PyObject *
8857unicode_mod(PyObject *v, PyObject *w)
8858{
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 if (!PyUnicode_Check(v)) {
8860 Py_INCREF(Py_NotImplemented);
8861 return Py_NotImplemented;
8862 }
8863 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008864}
8865
8866static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008867 0, /*nb_add*/
8868 0, /*nb_subtract*/
8869 0, /*nb_multiply*/
8870 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008871};
8872
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008874 (lenfunc) unicode_length, /* sq_length */
8875 PyUnicode_Concat, /* sq_concat */
8876 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8877 (ssizeargfunc) unicode_getitem, /* sq_item */
8878 0, /* sq_slice */
8879 0, /* sq_ass_item */
8880 0, /* sq_ass_slice */
8881 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882};
8883
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008884static PyObject*
8885unicode_subscript(PyUnicodeObject* self, PyObject* item)
8886{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008887 if (PyIndex_Check(item)) {
8888 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008889 if (i == -1 && PyErr_Occurred())
8890 return NULL;
8891 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008892 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008893 return unicode_getitem(self, i);
8894 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008895 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008896 Py_UNICODE* source_buf;
8897 Py_UNICODE* result_buf;
8898 PyObject* result;
8899
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008900 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008902 return NULL;
8903 }
8904
8905 if (slicelength <= 0) {
8906 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008907 } else if (start == 0 && step == 1 && slicelength == self->length &&
8908 PyUnicode_CheckExact(self)) {
8909 Py_INCREF(self);
8910 return (PyObject *)self;
8911 } else if (step == 1) {
8912 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008913 } else {
8914 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008915 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8916 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008917
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 if (result_buf == NULL)
8919 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008920
8921 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8922 result_buf[i] = source_buf[cur];
8923 }
Tim Petersced69f82003-09-16 20:30:58 +00008924
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008925 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008926 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008927 return result;
8928 }
8929 } else {
8930 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8931 return NULL;
8932 }
8933}
8934
8935static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008936 (lenfunc)unicode_length, /* mp_length */
8937 (binaryfunc)unicode_subscript, /* mp_subscript */
8938 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008939};
8940
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942/* Helpers for PyUnicode_Format() */
8943
8944static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008945getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008947 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 (*p_argidx)++;
8950 if (arglen < 0)
8951 return args;
8952 else
8953 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954 }
8955 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 return NULL;
8958}
8959
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008960/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008962static PyObject *
8963formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008965 char *p;
8966 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008968
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 x = PyFloat_AsDouble(v);
8970 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008971 return NULL;
8972
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008975
Eric Smith0923d1d2009-04-16 20:16:10 +00008976 p = PyOS_double_to_string(x, type, prec,
8977 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008978 if (p == NULL)
8979 return NULL;
8980 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008981 PyMem_Free(p);
8982 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983}
8984
Tim Peters38fd5b62000-09-21 05:43:11 +00008985static PyObject*
8986formatlong(PyObject *val, int flags, int prec, int type)
8987{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008988 char *buf;
8989 int len;
8990 PyObject *str; /* temporary string object. */
8991 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008992
Benjamin Peterson14339b62009-01-31 16:36:08 +00008993 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8994 if (!str)
8995 return NULL;
8996 result = PyUnicode_FromStringAndSize(buf, len);
8997 Py_DECREF(str);
8998 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008999}
9000
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001static int
9002formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009003 size_t buflen,
9004 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009006 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009007 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 if (PyUnicode_GET_SIZE(v) == 1) {
9009 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9010 buf[1] = '\0';
9011 return 1;
9012 }
9013#ifndef Py_UNICODE_WIDE
9014 if (PyUnicode_GET_SIZE(v) == 2) {
9015 /* Decode a valid surrogate pair */
9016 int c0 = PyUnicode_AS_UNICODE(v)[0];
9017 int c1 = PyUnicode_AS_UNICODE(v)[1];
9018 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9019 0xDC00 <= c1 && c1 <= 0xDFFF) {
9020 buf[0] = c0;
9021 buf[1] = c1;
9022 buf[2] = '\0';
9023 return 2;
9024 }
9025 }
9026#endif
9027 goto onError;
9028 }
9029 else {
9030 /* Integer input truncated to a character */
9031 long x;
9032 x = PyLong_AsLong(v);
9033 if (x == -1 && PyErr_Occurred())
9034 goto onError;
9035
9036 if (x < 0 || x > 0x10ffff) {
9037 PyErr_SetString(PyExc_OverflowError,
9038 "%c arg not in range(0x110000)");
9039 return -1;
9040 }
9041
9042#ifndef Py_UNICODE_WIDE
9043 if (x > 0xffff) {
9044 x -= 0x10000;
9045 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9046 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9047 return 2;
9048 }
9049#endif
9050 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009051 buf[1] = '\0';
9052 return 1;
9053 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009054
Benjamin Peterson29060642009-01-31 22:14:21 +00009055 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009056 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009058 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059}
9060
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesised so the macro behaves as a single
   operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009065
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009067 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068{
9069 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009070 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071 int args_owned = 0;
9072 PyUnicodeObject *result = NULL;
9073 PyObject *dict = NULL;
9074 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009075
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 PyErr_BadInternalCall();
9078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 }
9080 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009081 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 fmt = PyUnicode_AS_UNICODE(uformat);
9084 fmtcnt = PyUnicode_GET_SIZE(uformat);
9085
9086 reslen = rescnt = fmtcnt + 100;
9087 result = _PyUnicode_New(reslen);
9088 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 res = PyUnicode_AS_UNICODE(result);
9091
9092 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 arglen = PyTuple_Size(args);
9094 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 }
9096 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 arglen = -1;
9098 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009100 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009101 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103
9104 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 if (*fmt != '%') {
9106 if (--rescnt < 0) {
9107 rescnt = fmtcnt + 100;
9108 reslen += rescnt;
9109 if (_PyUnicode_Resize(&result, reslen) < 0)
9110 goto onError;
9111 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9112 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009113 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009115 }
9116 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 /* Got a format specifier */
9118 int flags = 0;
9119 Py_ssize_t width = -1;
9120 int prec = -1;
9121 Py_UNICODE c = '\0';
9122 Py_UNICODE fill;
9123 int isnumok;
9124 PyObject *v = NULL;
9125 PyObject *temp = NULL;
9126 Py_UNICODE *pbuf;
9127 Py_UNICODE sign;
9128 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009129 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 fmt++;
9132 if (*fmt == '(') {
9133 Py_UNICODE *keystart;
9134 Py_ssize_t keylen;
9135 PyObject *key;
9136 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009137
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 if (dict == NULL) {
9139 PyErr_SetString(PyExc_TypeError,
9140 "format requires a mapping");
9141 goto onError;
9142 }
9143 ++fmt;
9144 --fmtcnt;
9145 keystart = fmt;
9146 /* Skip over balanced parentheses */
9147 while (pcount > 0 && --fmtcnt >= 0) {
9148 if (*fmt == ')')
9149 --pcount;
9150 else if (*fmt == '(')
9151 ++pcount;
9152 fmt++;
9153 }
9154 keylen = fmt - keystart - 1;
9155 if (fmtcnt < 0 || pcount > 0) {
9156 PyErr_SetString(PyExc_ValueError,
9157 "incomplete format key");
9158 goto onError;
9159 }
9160#if 0
9161 /* keys are converted to strings using UTF-8 and
9162 then looked up since Python uses strings to hold
9163 variables names etc. in its namespaces and we
9164 wouldn't want to break common idioms. */
9165 key = PyUnicode_EncodeUTF8(keystart,
9166 keylen,
9167 NULL);
9168#else
9169 key = PyUnicode_FromUnicode(keystart, keylen);
9170#endif
9171 if (key == NULL)
9172 goto onError;
9173 if (args_owned) {
9174 Py_DECREF(args);
9175 args_owned = 0;
9176 }
9177 args = PyObject_GetItem(dict, key);
9178 Py_DECREF(key);
9179 if (args == NULL) {
9180 goto onError;
9181 }
9182 args_owned = 1;
9183 arglen = -1;
9184 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009185 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 while (--fmtcnt >= 0) {
9187 switch (c = *fmt++) {
9188 case '-': flags |= F_LJUST; continue;
9189 case '+': flags |= F_SIGN; continue;
9190 case ' ': flags |= F_BLANK; continue;
9191 case '#': flags |= F_ALT; continue;
9192 case '0': flags |= F_ZERO; continue;
9193 }
9194 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009195 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 if (c == '*') {
9197 v = getnextarg(args, arglen, &argidx);
9198 if (v == NULL)
9199 goto onError;
9200 if (!PyLong_Check(v)) {
9201 PyErr_SetString(PyExc_TypeError,
9202 "* wants int");
9203 goto onError;
9204 }
9205 width = PyLong_AsLong(v);
9206 if (width == -1 && PyErr_Occurred())
9207 goto onError;
9208 if (width < 0) {
9209 flags |= F_LJUST;
9210 width = -width;
9211 }
9212 if (--fmtcnt >= 0)
9213 c = *fmt++;
9214 }
9215 else if (c >= '0' && c <= '9') {
9216 width = c - '0';
9217 while (--fmtcnt >= 0) {
9218 c = *fmt++;
9219 if (c < '0' || c > '9')
9220 break;
9221 if ((width*10) / 10 != width) {
9222 PyErr_SetString(PyExc_ValueError,
9223 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009224 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009225 }
9226 width = width*10 + (c - '0');
9227 }
9228 }
9229 if (c == '.') {
9230 prec = 0;
9231 if (--fmtcnt >= 0)
9232 c = *fmt++;
9233 if (c == '*') {
9234 v = getnextarg(args, arglen, &argidx);
9235 if (v == NULL)
9236 goto onError;
9237 if (!PyLong_Check(v)) {
9238 PyErr_SetString(PyExc_TypeError,
9239 "* wants int");
9240 goto onError;
9241 }
9242 prec = PyLong_AsLong(v);
9243 if (prec == -1 && PyErr_Occurred())
9244 goto onError;
9245 if (prec < 0)
9246 prec = 0;
9247 if (--fmtcnt >= 0)
9248 c = *fmt++;
9249 }
9250 else if (c >= '0' && c <= '9') {
9251 prec = c - '0';
9252 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009253 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 if (c < '0' || c > '9')
9255 break;
9256 if ((prec*10) / 10 != prec) {
9257 PyErr_SetString(PyExc_ValueError,
9258 "prec too big");
9259 goto onError;
9260 }
9261 prec = prec*10 + (c - '0');
9262 }
9263 }
9264 } /* prec */
9265 if (fmtcnt >= 0) {
9266 if (c == 'h' || c == 'l' || c == 'L') {
9267 if (--fmtcnt >= 0)
9268 c = *fmt++;
9269 }
9270 }
9271 if (fmtcnt < 0) {
9272 PyErr_SetString(PyExc_ValueError,
9273 "incomplete format");
9274 goto onError;
9275 }
9276 if (c != '%') {
9277 v = getnextarg(args, arglen, &argidx);
9278 if (v == NULL)
9279 goto onError;
9280 }
9281 sign = 0;
9282 fill = ' ';
9283 switch (c) {
9284
9285 case '%':
9286 pbuf = formatbuf;
9287 /* presume that buffer length is at least 1 */
9288 pbuf[0] = '%';
9289 len = 1;
9290 break;
9291
9292 case 's':
9293 case 'r':
9294 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009295 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009296 temp = v;
9297 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009298 }
9299 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009300 if (c == 's')
9301 temp = PyObject_Str(v);
9302 else if (c == 'r')
9303 temp = PyObject_Repr(v);
9304 else
9305 temp = PyObject_ASCII(v);
9306 if (temp == NULL)
9307 goto onError;
9308 if (PyUnicode_Check(temp))
9309 /* nothing to do */;
9310 else {
9311 Py_DECREF(temp);
9312 PyErr_SetString(PyExc_TypeError,
9313 "%s argument has non-string str()");
9314 goto onError;
9315 }
9316 }
9317 pbuf = PyUnicode_AS_UNICODE(temp);
9318 len = PyUnicode_GET_SIZE(temp);
9319 if (prec >= 0 && len > prec)
9320 len = prec;
9321 break;
9322
9323 case 'i':
9324 case 'd':
9325 case 'u':
9326 case 'o':
9327 case 'x':
9328 case 'X':
9329 if (c == 'i')
9330 c = 'd';
9331 isnumok = 0;
9332 if (PyNumber_Check(v)) {
9333 PyObject *iobj=NULL;
9334
9335 if (PyLong_Check(v)) {
9336 iobj = v;
9337 Py_INCREF(iobj);
9338 }
9339 else {
9340 iobj = PyNumber_Long(v);
9341 }
9342 if (iobj!=NULL) {
9343 if (PyLong_Check(iobj)) {
9344 isnumok = 1;
9345 temp = formatlong(iobj, flags, prec, c);
9346 Py_DECREF(iobj);
9347 if (!temp)
9348 goto onError;
9349 pbuf = PyUnicode_AS_UNICODE(temp);
9350 len = PyUnicode_GET_SIZE(temp);
9351 sign = 1;
9352 }
9353 else {
9354 Py_DECREF(iobj);
9355 }
9356 }
9357 }
9358 if (!isnumok) {
9359 PyErr_Format(PyExc_TypeError,
9360 "%%%c format: a number is required, "
9361 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9362 goto onError;
9363 }
9364 if (flags & F_ZERO)
9365 fill = '0';
9366 break;
9367
9368 case 'e':
9369 case 'E':
9370 case 'f':
9371 case 'F':
9372 case 'g':
9373 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009374 temp = formatfloat(v, flags, prec, c);
9375 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009377 pbuf = PyUnicode_AS_UNICODE(temp);
9378 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 sign = 1;
9380 if (flags & F_ZERO)
9381 fill = '0';
9382 break;
9383
9384 case 'c':
9385 pbuf = formatbuf;
9386 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9387 if (len < 0)
9388 goto onError;
9389 break;
9390
9391 default:
9392 PyErr_Format(PyExc_ValueError,
9393 "unsupported format character '%c' (0x%x) "
9394 "at index %zd",
9395 (31<=c && c<=126) ? (char)c : '?',
9396 (int)c,
9397 (Py_ssize_t)(fmt - 1 -
9398 PyUnicode_AS_UNICODE(uformat)));
9399 goto onError;
9400 }
9401 if (sign) {
9402 if (*pbuf == '-' || *pbuf == '+') {
9403 sign = *pbuf++;
9404 len--;
9405 }
9406 else if (flags & F_SIGN)
9407 sign = '+';
9408 else if (flags & F_BLANK)
9409 sign = ' ';
9410 else
9411 sign = 0;
9412 }
9413 if (width < len)
9414 width = len;
9415 if (rescnt - (sign != 0) < width) {
9416 reslen -= rescnt;
9417 rescnt = width + fmtcnt + 100;
9418 reslen += rescnt;
9419 if (reslen < 0) {
9420 Py_XDECREF(temp);
9421 PyErr_NoMemory();
9422 goto onError;
9423 }
9424 if (_PyUnicode_Resize(&result, reslen) < 0) {
9425 Py_XDECREF(temp);
9426 goto onError;
9427 }
9428 res = PyUnicode_AS_UNICODE(result)
9429 + reslen - rescnt;
9430 }
9431 if (sign) {
9432 if (fill != ' ')
9433 *res++ = sign;
9434 rescnt--;
9435 if (width > len)
9436 width--;
9437 }
9438 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9439 assert(pbuf[0] == '0');
9440 assert(pbuf[1] == c);
9441 if (fill != ' ') {
9442 *res++ = *pbuf++;
9443 *res++ = *pbuf++;
9444 }
9445 rescnt -= 2;
9446 width -= 2;
9447 if (width < 0)
9448 width = 0;
9449 len -= 2;
9450 }
9451 if (width > len && !(flags & F_LJUST)) {
9452 do {
9453 --rescnt;
9454 *res++ = fill;
9455 } while (--width > len);
9456 }
9457 if (fill == ' ') {
9458 if (sign)
9459 *res++ = sign;
9460 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9461 assert(pbuf[0] == '0');
9462 assert(pbuf[1] == c);
9463 *res++ = *pbuf++;
9464 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009465 }
9466 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 Py_UNICODE_COPY(res, pbuf, len);
9468 res += len;
9469 rescnt -= len;
9470 while (--width >= len) {
9471 --rescnt;
9472 *res++ = ' ';
9473 }
9474 if (dict && (argidx < arglen) && c != '%') {
9475 PyErr_SetString(PyExc_TypeError,
9476 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009477 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009478 goto onError;
9479 }
9480 Py_XDECREF(temp);
9481 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482 } /* until end */
9483 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 PyErr_SetString(PyExc_TypeError,
9485 "not all arguments converted during string formatting");
9486 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487 }
9488
Thomas Woutersa96affe2006-03-12 00:29:36 +00009489 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009490 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493 }
9494 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 return (PyObject *)result;
9496
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 Py_XDECREF(result);
9499 Py_DECREF(uformat);
9500 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
9503 return NULL;
9504}
9505
Jeremy Hylton938ace62002-07-17 16:30:39 +00009506static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009507unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9508
Tim Peters6d6c1a32001-08-02 04:15:00 +00009509static PyObject *
9510unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9511{
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009513 static char *kwlist[] = {"object", "encoding", "errors", 0};
9514 char *encoding = NULL;
9515 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009516
Benjamin Peterson14339b62009-01-31 16:36:08 +00009517 if (type != &PyUnicode_Type)
9518 return unicode_subtype_new(type, args, kwds);
9519 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009520 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009521 return NULL;
9522 if (x == NULL)
9523 return (PyObject *)_PyUnicode_New(0);
9524 if (encoding == NULL && errors == NULL)
9525 return PyObject_Str(x);
9526 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009528}
9529
Guido van Rossume023fe02001-08-30 03:12:59 +00009530static PyObject *
9531unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9532{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009533 PyUnicodeObject *tmp, *pnew;
9534 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009535
Benjamin Peterson14339b62009-01-31 16:36:08 +00009536 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9537 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9538 if (tmp == NULL)
9539 return NULL;
9540 assert(PyUnicode_Check(tmp));
9541 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9542 if (pnew == NULL) {
9543 Py_DECREF(tmp);
9544 return NULL;
9545 }
9546 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9547 if (pnew->str == NULL) {
9548 _Py_ForgetReference((PyObject *)pnew);
9549 PyObject_Del(pnew);
9550 Py_DECREF(tmp);
9551 return PyErr_NoMemory();
9552 }
9553 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9554 pnew->length = n;
9555 pnew->hash = tmp->hash;
9556 Py_DECREF(tmp);
9557 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009558}
9559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009560PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009561 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009562\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009563Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009564encoding defaults to the current default string encoding.\n\
9565errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009566
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009567static PyObject *unicode_iter(PyObject *seq);
9568
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009570 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009571 "str", /* tp_name */
9572 sizeof(PyUnicodeObject), /* tp_size */
9573 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009575 (destructor)unicode_dealloc, /* tp_dealloc */
9576 0, /* tp_print */
9577 0, /* tp_getattr */
9578 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009579 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009580 unicode_repr, /* tp_repr */
9581 &unicode_as_number, /* tp_as_number */
9582 &unicode_as_sequence, /* tp_as_sequence */
9583 &unicode_as_mapping, /* tp_as_mapping */
9584 (hashfunc) unicode_hash, /* tp_hash*/
9585 0, /* tp_call*/
9586 (reprfunc) unicode_str, /* tp_str */
9587 PyObject_GenericGetAttr, /* tp_getattro */
9588 0, /* tp_setattro */
9589 0, /* tp_as_buffer */
9590 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009592 unicode_doc, /* tp_doc */
9593 0, /* tp_traverse */
9594 0, /* tp_clear */
9595 PyUnicode_RichCompare, /* tp_richcompare */
9596 0, /* tp_weaklistoffset */
9597 unicode_iter, /* tp_iter */
9598 0, /* tp_iternext */
9599 unicode_methods, /* tp_methods */
9600 0, /* tp_members */
9601 0, /* tp_getset */
9602 &PyBaseObject_Type, /* tp_base */
9603 0, /* tp_dict */
9604 0, /* tp_descr_get */
9605 0, /* tp_descr_set */
9606 0, /* tp_dictoffset */
9607 0, /* tp_init */
9608 0, /* tp_alloc */
9609 unicode_new, /* tp_new */
9610 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611};
9612
9613/* Initialize the Unicode implementation */
9614
Thomas Wouters78890102000-07-22 19:25:51 +00009615void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009617 int i;
9618
Thomas Wouters477c8d52006-05-27 19:21:47 +00009619 /* XXX - move this array to unicodectype.c ? */
9620 Py_UNICODE linebreak[] = {
9621 0x000A, /* LINE FEED */
9622 0x000D, /* CARRIAGE RETURN */
9623 0x001C, /* FILE SEPARATOR */
9624 0x001D, /* GROUP SEPARATOR */
9625 0x001E, /* RECORD SEPARATOR */
9626 0x0085, /* NEXT LINE */
9627 0x2028, /* LINE SEPARATOR */
9628 0x2029, /* PARAGRAPH SEPARATOR */
9629 };
9630
Fred Drakee4315f52000-05-09 19:53:39 +00009631 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009632 free_list = NULL;
9633 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009635 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009637
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009638 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009639 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009640 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009642
9643 /* initialize the linebreak bloom filter */
9644 bloom_linebreak = make_bloom_mask(
9645 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9646 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009647
9648 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649}
9650
9651/* Finalize the Unicode implementation */
9652
Christian Heimesa156e092008-02-16 07:38:31 +00009653int
9654PyUnicode_ClearFreeList(void)
9655{
9656 int freelist_size = numfree;
9657 PyUnicodeObject *u;
9658
9659 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009660 PyUnicodeObject *v = u;
9661 u = *(PyUnicodeObject **)u;
9662 if (v->str)
9663 PyObject_DEL(v->str);
9664 Py_XDECREF(v->defenc);
9665 PyObject_Del(v);
9666 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009667 }
9668 free_list = NULL;
9669 assert(numfree == 0);
9670 return freelist_size;
9671}
9672
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673void
Thomas Wouters78890102000-07-22 19:25:51 +00009674_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009676 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009678 Py_XDECREF(unicode_empty);
9679 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009680
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009681 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 if (unicode_latin1[i]) {
9683 Py_DECREF(unicode_latin1[i]);
9684 unicode_latin1[i] = NULL;
9685 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009686 }
Christian Heimesa156e092008-02-16 07:38:31 +00009687 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009689
Walter Dörwald16807132007-05-25 13:52:07 +00009690void
9691PyUnicode_InternInPlace(PyObject **p)
9692{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009693 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9694 PyObject *t;
9695 if (s == NULL || !PyUnicode_Check(s))
9696 Py_FatalError(
9697 "PyUnicode_InternInPlace: unicode strings only please!");
9698 /* If it's a subclass, we don't really know what putting
9699 it in the interned dict might do. */
9700 if (!PyUnicode_CheckExact(s))
9701 return;
9702 if (PyUnicode_CHECK_INTERNED(s))
9703 return;
9704 if (interned == NULL) {
9705 interned = PyDict_New();
9706 if (interned == NULL) {
9707 PyErr_Clear(); /* Don't leave an exception */
9708 return;
9709 }
9710 }
9711 /* It might be that the GetItem call fails even
9712 though the key is present in the dictionary,
9713 namely when this happens during a stack overflow. */
9714 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009717
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 if (t) {
9719 Py_INCREF(t);
9720 Py_DECREF(*p);
9721 *p = t;
9722 return;
9723 }
Walter Dörwald16807132007-05-25 13:52:07 +00009724
Benjamin Peterson14339b62009-01-31 16:36:08 +00009725 PyThreadState_GET()->recursion_critical = 1;
9726 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9727 PyErr_Clear();
9728 PyThreadState_GET()->recursion_critical = 0;
9729 return;
9730 }
9731 PyThreadState_GET()->recursion_critical = 0;
9732 /* The two references in interned are not counted by refcnt.
9733 The deallocator will take care of this */
9734 Py_REFCNT(s) -= 2;
9735 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009736}
9737
9738void
9739PyUnicode_InternImmortal(PyObject **p)
9740{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009741 PyUnicode_InternInPlace(p);
9742 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9743 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9744 Py_INCREF(*p);
9745 }
Walter Dörwald16807132007-05-25 13:52:07 +00009746}
9747
9748PyObject *
9749PyUnicode_InternFromString(const char *cp)
9750{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009751 PyObject *s = PyUnicode_FromString(cp);
9752 if (s == NULL)
9753 return NULL;
9754 PyUnicode_InternInPlace(&s);
9755 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009756}
9757
9758void _Py_ReleaseInternedUnicodeStrings(void)
9759{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009760 PyObject *keys;
9761 PyUnicodeObject *s;
9762 Py_ssize_t i, n;
9763 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009764
Benjamin Peterson14339b62009-01-31 16:36:08 +00009765 if (interned == NULL || !PyDict_Check(interned))
9766 return;
9767 keys = PyDict_Keys(interned);
9768 if (keys == NULL || !PyList_Check(keys)) {
9769 PyErr_Clear();
9770 return;
9771 }
Walter Dörwald16807132007-05-25 13:52:07 +00009772
Benjamin Peterson14339b62009-01-31 16:36:08 +00009773 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9774 detector, interned unicode strings are not forcibly deallocated;
9775 rather, we give them their stolen references back, and then clear
9776 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009777
Benjamin Peterson14339b62009-01-31 16:36:08 +00009778 n = PyList_GET_SIZE(keys);
9779 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009780 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009781 for (i = 0; i < n; i++) {
9782 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9783 switch (s->state) {
9784 case SSTATE_NOT_INTERNED:
9785 /* XXX Shouldn't happen */
9786 break;
9787 case SSTATE_INTERNED_IMMORTAL:
9788 Py_REFCNT(s) += 1;
9789 immortal_size += s->length;
9790 break;
9791 case SSTATE_INTERNED_MORTAL:
9792 Py_REFCNT(s) += 2;
9793 mortal_size += s->length;
9794 break;
9795 default:
9796 Py_FatalError("Inconsistent interned string state.");
9797 }
9798 s->state = SSTATE_NOT_INTERNED;
9799 }
9800 fprintf(stderr, "total size of all interned strings: "
9801 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9802 "mortal/immortal\n", mortal_size, immortal_size);
9803 Py_DECREF(keys);
9804 PyDict_Clear(interned);
9805 Py_DECREF(interned);
9806 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009807}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009808
9809
9810/********************* Unicode Iterator **************************/
9811
9812typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009813 PyObject_HEAD
9814 Py_ssize_t it_index;
9815 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009816} unicodeiterobject;
9817
9818static void
9819unicodeiter_dealloc(unicodeiterobject *it)
9820{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009821 _PyObject_GC_UNTRACK(it);
9822 Py_XDECREF(it->it_seq);
9823 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009824}
9825
9826static int
9827unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9828{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009829 Py_VISIT(it->it_seq);
9830 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009831}
9832
9833static PyObject *
9834unicodeiter_next(unicodeiterobject *it)
9835{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009836 PyUnicodeObject *seq;
9837 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009838
Benjamin Peterson14339b62009-01-31 16:36:08 +00009839 assert(it != NULL);
9840 seq = it->it_seq;
9841 if (seq == NULL)
9842 return NULL;
9843 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009844
Benjamin Peterson14339b62009-01-31 16:36:08 +00009845 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9846 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009847 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848 if (item != NULL)
9849 ++it->it_index;
9850 return item;
9851 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009852
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 Py_DECREF(seq);
9854 it->it_seq = NULL;
9855 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009856}
9857
9858static PyObject *
9859unicodeiter_len(unicodeiterobject *it)
9860{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 Py_ssize_t len = 0;
9862 if (it->it_seq)
9863 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9864 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009865}
9866
9867PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9868
9869static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009870 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009872 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009873};
9874
9875PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009876 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9877 "str_iterator", /* tp_name */
9878 sizeof(unicodeiterobject), /* tp_basicsize */
9879 0, /* tp_itemsize */
9880 /* methods */
9881 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9882 0, /* tp_print */
9883 0, /* tp_getattr */
9884 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009885 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009886 0, /* tp_repr */
9887 0, /* tp_as_number */
9888 0, /* tp_as_sequence */
9889 0, /* tp_as_mapping */
9890 0, /* tp_hash */
9891 0, /* tp_call */
9892 0, /* tp_str */
9893 PyObject_GenericGetAttr, /* tp_getattro */
9894 0, /* tp_setattro */
9895 0, /* tp_as_buffer */
9896 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9897 0, /* tp_doc */
9898 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9899 0, /* tp_clear */
9900 0, /* tp_richcompare */
9901 0, /* tp_weaklistoffset */
9902 PyObject_SelfIter, /* tp_iter */
9903 (iternextfunc)unicodeiter_next, /* tp_iternext */
9904 unicodeiter_methods, /* tp_methods */
9905 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009906};
9907
9908static PyObject *
9909unicode_iter(PyObject *seq)
9910{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009911 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009912
Benjamin Peterson14339b62009-01-31 16:36:08 +00009913 if (!PyUnicode_Check(seq)) {
9914 PyErr_BadInternalCall();
9915 return NULL;
9916 }
9917 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9918 if (it == NULL)
9919 return NULL;
9920 it->it_index = 0;
9921 Py_INCREF(seq);
9922 it->it_seq = (PyUnicodeObject *)seq;
9923 _PyObject_GC_TRACK(it);
9924 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009925}
9926
/* Return the number of Py_UNICODE units in u before the terminating
   zero unit.

   The count is kept in a size_t, matching the return type; an int
   counter would overflow on strings longer than INT_MAX units. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
9935
/* Copy the zero-terminated string s2, including its terminator, to s1.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
9943
/* Copy at most n Py_UNICODE units from s2 to s1 and return s1.

   Copying stops after the terminating zero unit has been copied or after
   n units have been written, whichever comes first.  As with strncpy(),
   the result is NOT zero-terminated when s2 holds n or more units before
   its terminator.  Unlike strncpy(), the rest of s1 is not zero-padded.

   The previous loop tested n only after each store and could therefore
   write n+1 units (one unit even for n == 0), overrunning a destination
   sized for n units. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
9953
Victor Stinnerc4eb7652010-09-01 23:43:50 +00009954Py_UNICODE*
9955Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
9956{
9957 Py_UNICODE *u1 = s1;
9958 u1 += Py_UNICODE_strlen(u1);
9959 Py_UNICODE_strcpy(u1, s2);
9960 return s1;
9961}
9962
/* Compare two zero-terminated Py_UNICODE strings.

   Returns -1, 0 or +1.  At the first differing position the units are
   compared with '<'.  A string that ends first sorts before the longer
   one. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE c1, c2;

    /* Skip the common prefix. */
    for (;;) {
        c1 = *s1;
        c2 = *s2;
        if (c1 == 0 || c2 == 0 || c1 != c2)
            break;
        s1++;
        s2++;
    }

    if (c1 == 0)
        return (c2 == 0) ? 0 : -1;
    if (c2 == 0)
        return 1;
    /* Both units are non-zero here, so they must differ. */
    return (c1 < c2) ? -1 : +1;
}
9976
/* Compare at most n units of two zero-terminated Py_UNICODE strings.

   Returns -1 or +1 at the first differing unit.  Returns 0 if the first
   n units are equal or both strings end, still equal, before that. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings ended together */
    }
    return 0;
}
9993
/* Return a pointer to the first occurrence of c in the zero-terminated
   string s, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
10003
Victor Stinner331ea922010-08-10 16:37:20 +000010004Py_UNICODE*
10005Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10006{
10007 const Py_UNICODE *p;
10008 p = s + Py_UNICODE_strlen(s);
10009 while (p != s) {
10010 p--;
10011 if (*p == c)
10012 return (Py_UNICODE*)p;
10013 }
10014 return NULL;
10015}
10016
Martin v. Löwis5b222132007-06-10 09:51:05 +000010017
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010018#ifdef __cplusplus
10019}
10020#endif