blob: 8ceb3f32b8bc97b8892f897e6cbd54b532c226b3 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
Christian Heimes190d79e2008-01-30 11:58:22 +0000117/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code point (0x00-0x7F): an entry is 1
   when the character is one of the whitespace characters named below,
   and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x09 CHARACTER TABULATION
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149 PyObject **errorHandler,const char *encoding, const char *reason,
150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
Victor Stinner31be90b2010-04-22 19:38:16 +0000153static void raise_encode_exception(PyObject **exceptionObject,
154 const char *encoding,
155 const Py_UNICODE *unicode, Py_ssize_t size,
156 Py_ssize_t startpos, Py_ssize_t endpos,
157 const char *reason);
158
Christian Heimes190d79e2008-01-30 11:58:22 +0000159/* Same for linebreaks */
/* Lookup table indexed by ASCII code point (0x00-0x7F): an entry is 1
   when the character is one of the line break characters named below,
   and 0 otherwise.  The table is only ever read (see BLOOM_LINEBREAK),
   so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrouf068f942010-01-13 14:19:12 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
Antoine Pitrouf068f942010-01-13 14:19:12 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
Benjamin Peterson29060642009-01-31 22:14:21 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252}
253
/* Non-zero if `chr` is a member of `set`: the bloom mask is tested first
   and the exact scan by unicode_member() only runs when the mask bit is
   set.  The whole expansion is parenthesized so the macro can be negated
   or combined with other operators; `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000278 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000309 Ux0000 terminated; some code (e.g. new_identifier)
310 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000313 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314
315*/
316
317static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319{
320 register PyUnicodeObject *unicode;
321
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 if (length == 0 && unicode_empty != NULL) {
324 Py_INCREF(unicode_empty);
325 return unicode_empty;
326 }
327
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000328 /* Ensure we won't overflow the size. */
329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330 return (PyUnicodeObject *)PyErr_NoMemory();
331 }
332
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000334 if (free_list) {
335 unicode = free_list;
336 free_list = *(PyUnicodeObject **)unicode;
337 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000338 if (unicode->str) {
339 /* Keep-Alive optimization: we only upsize the buffer,
340 never downsize it. */
341 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000342 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000343 PyObject_DEL(unicode->str);
344 unicode->str = NULL;
345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000346 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000350 }
351 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 if (unicode == NULL)
357 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 }
361
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 PyErr_NoMemory();
364 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000365 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000366 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000367 * the caller fails before initializing str -- unicode_resize()
368 * reads str[0], and the Keep-Alive optimization can keep memory
369 * allocated for str alive across a call to unicode_dealloc(unicode).
370 * We don't want unicode_resize to read uninitialized memory in
371 * that case.
372 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000373 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000377 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000378 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380
Benjamin Peterson29060642009-01-31 22:14:21 +0000381 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000382 /* XXX UNREF/NEWREF interface should be more symmetrical */
383 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000385 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389static
Guido van Rossum9475a232001-10-05 20:51:39 +0000390void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391{
Walter Dörwald16807132007-05-25 13:52:07 +0000392 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_NOT_INTERNED:
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_MORTAL:
397 /* revive dead object temporarily for DelItem */
398 Py_REFCNT(unicode) = 3;
399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400 Py_FatalError(
401 "deletion of interned string failed");
402 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 case SSTATE_INTERNED_IMMORTAL:
405 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406
Benjamin Peterson29060642009-01-31 22:14:21 +0000407 default:
408 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000409 }
410
Guido van Rossum604ddf82001-12-06 20:03:56 +0000411 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000413 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415 PyObject_DEL(unicode->str);
416 unicode->str = NULL;
417 unicode->length = 0;
418 }
419 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000420 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 }
422 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000423 *(PyUnicodeObject **)unicode = free_list;
424 free_list = unicode;
425 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426 }
427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyObject_DEL(unicode->str);
429 Py_XDECREF(unicode->defenc);
430 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432}
433
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436{
437 register PyUnicodeObject *v;
438
439 /* Argument checks */
440 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 PyErr_BadInternalCall();
442 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
449
450 /* Resizing unicode_empty and single character objects is not
451 possible since these are being shared. We simply return a fresh
452 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000453 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000454 (v == unicode_empty || v->length == 1)) {
455 PyUnicodeObject *w = _PyUnicode_New(length);
456 if (w == NULL)
457 return -1;
458 Py_UNICODE_COPY(w->str, v->str,
459 length < v->length ? length : v->length);
460 Py_DECREF(*unicode);
461 *unicode = w;
462 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 }
464
465 /* Note that we don't have to modify *unicode for unshared Unicode
466 objects, since we can modify them in-place. */
467 return unicode_resize(v, length);
468}
469
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477{
478 PyUnicodeObject *unicode;
479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects. */
482 if (u != NULL) {
483
Benjamin Peterson29060642009-01-31 22:14:21 +0000484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000489
490 /* Single character Unicode objects in the Latin-1 range are
491 shared when using this constructor */
492 if (size == 1 && *u < 256) {
493 unicode = unicode_latin1[*u];
494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = *u;
499 unicode_latin1[*u] = unicode;
500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 /* Copy the Unicode data into the new object */
511 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000512 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 return (PyObject *)unicode;
515}
516
Walter Dörwaldd2034312007-05-18 16:29:38 +0000517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518{
519 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000520
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 if (size < 0) {
522 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 return NULL;
525 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000526
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000528 some optimizations which share commonly used objects.
529 Also, this means the input must be UTF-8, so fall back to the
530 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (u != NULL) {
532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533 /* Optimization for empty strings */
534 if (size == 0 && unicode_empty != NULL) {
535 Py_INCREF(unicode_empty);
536 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000537 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000538
539 /* Single characters are shared when using this constructor.
540 Restrict to ASCII, since the input must be UTF-8. */
541 if (size == 1 && Py_CHARMASK(*u) < 128) {
542 unicode = unicode_latin1[Py_CHARMASK(*u)];
543 if (!unicode) {
544 unicode = _PyUnicode_New(1);
545 if (!unicode)
546 return NULL;
547 unicode->str[0] = Py_CHARMASK(*u);
548 unicode_latin1[Py_CHARMASK(*u)] = unicode;
549 }
550 Py_INCREF(unicode);
551 return (PyObject *)unicode;
552 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000553
554 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 }
556
Walter Dörwald55507312007-05-18 13:12:10 +0000557 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 if (!unicode)
559 return NULL;
560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 return (PyObject *)unicode;
562}
563
Walter Dörwaldd2034312007-05-18 16:29:38 +0000564PyObject *PyUnicode_FromString(const char *u)
565{
566 size_t size = strlen(u);
567 if (size > PY_SSIZE_T_MAX) {
568 PyErr_SetString(PyExc_OverflowError, "input too long");
569 return NULL;
570 }
571
572 return PyUnicode_FromStringAndSize(u, size);
573}
574
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575#ifdef HAVE_WCHAR_H
576
Mark Dickinson081dfee2009-03-18 14:47:41 +0000577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584 to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
588{
589 PyUnicodeObject *unicode;
590 register Py_ssize_t i;
591 Py_ssize_t alloc;
592 const wchar_t *orig_w;
593
594 if (w == NULL) {
595 if (size == 0)
596 return PyUnicode_FromStringAndSize(NULL, 0);
597 PyErr_BadInternalCall();
598 return NULL;
599 }
600
601 if (size == -1) {
602 size = wcslen(w);
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000643 if (size == 0)
644 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_BadInternalCall();
646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 }
648
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == -1) {
650 size = wcslen(w);
651 }
652
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 unicode = _PyUnicode_New(size);
654 if (!unicode)
655 return NULL;
656
657 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000660#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000662 register Py_UNICODE *u;
663 register Py_ssize_t i;
664 u = PyUnicode_AS_UNICODE(unicode);
665 for (i = size; i > 0; i--)
666 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668#endif
669
670 return (PyObject *)unicode;
671}
672
Mark Dickinson081dfee2009-03-18 14:47:41 +0000673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
Walter Dörwald346737f2007-05-31 10:44:43 +0000677static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000680{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000681 *fmt++ = '%';
682 if (width) {
683 if (zeropad)
684 *fmt++ = '0';
685 fmt += sprintf(fmt, "%d", width);
686 }
687 if (precision)
688 fmt += sprintf(fmt, ".%d", precision);
689 if (longflag)
690 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000691 else if (longlongflag) {
692 /* longlongflag should only ever be nonzero on machines with
693 HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695 char *f = PY_FORMAT_LONG_LONG;
696 while (*f)
697 *fmt++ = *f++;
698#else
699 /* we shouldn't ever get here */
700 assert(0);
701 *fmt++ = 'l';
702#endif
703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000704 else if (size_tflag) {
705 char *f = PY_FORMAT_SIZE_T;
706 while (*f)
707 *fmt++ = *f++;
708 }
709 *fmt++ = c;
710 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000711}
712
Walter Dörwaldd2034312007-05-18 16:29:38 +0000713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
714
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000715/* size of fixed-size buffer for formatting single arguments */
716#define ITEM_BUFFER_LEN 21
717/* maximum number of characters required for output of %ld. 21 characters
718 allows for 64-bit integers (in decimal) and an optional sign. */
719#define MAX_LONG_CHARS 21
720/* maximum number of characters required for output of %lld.
721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000728 va_list count;
729 Py_ssize_t callcount = 0;
730 PyObject **callresults = NULL;
731 PyObject **callresult = NULL;
732 Py_ssize_t n = 0;
733 int width = 0;
734 int precision = 0;
735 int zeropad;
736 const char* f;
737 Py_UNICODE *s;
738 PyObject *string;
739 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000740 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000741 /* use abuffer instead of buffer, if we need more space
742 * (which can happen if there's a format specifier with width). */
743 char *abuffer = NULL;
744 char *realbuffer;
745 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000749 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000767 else if (128 <= (unsigned char)*f) {
768 PyErr_Format(PyExc_ValueError,
769 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000770 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000771 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000772 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 }
775 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000776 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000777 if (callcount) {
778 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
779 if (!callresults) {
780 PyErr_NoMemory();
781 return NULL;
782 }
783 callresult = callresults;
784 }
785 /* step 3: figure out how large a buffer we need */
786 for (f = format; *f; f++) {
787 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000788#ifdef HAVE_LONG_LONG
789 int longlongflag = 0;
790#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 const char* p = f;
792 width = 0;
793 while (ISDIGIT((unsigned)*f))
794 width = (width*10) + *f++ - '0';
795 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
796 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
799 * they don't affect the amount of space we reserve.
800 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000801 if (*f == 'l') {
802 if (f[1] == 'd' || f[1] == 'u') {
803 ++f;
804 }
805#ifdef HAVE_LONG_LONG
806 else if (f[1] == 'l' &&
807 (f[2] == 'd' || f[2] == 'u')) {
808 longlongflag = 1;
809 f += 2;
810 }
811#endif
812 }
813 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000814 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
Benjamin Peterson14339b62009-01-31 16:36:08 +0000817 switch (*f) {
818 case 'c':
819 (void)va_arg(count, int);
820 /* fall through... */
821 case '%':
822 n++;
823 break;
824 case 'd': case 'u': case 'i': case 'x':
825 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000826#ifdef HAVE_LONG_LONG
827 if (longlongflag) {
828 if (width < MAX_LONG_LONG_CHARS)
829 width = MAX_LONG_LONG_CHARS;
830 }
831 else
832#endif
833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
834 including sign. Decimal takes the most space. This
835 isn't enough for octal. If a width is specified we
836 need more (which we allocate later). */
837 if (width < MAX_LONG_CHARS)
838 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000839 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000840 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 if (abuffersize < width)
842 abuffersize = width;
843 break;
844 case 's':
845 {
846 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000847 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000848 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
849 if (!str)
850 goto fail;
851 n += PyUnicode_GET_SIZE(str);
852 /* Remember the str and switch to the next slot */
853 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000854 break;
855 }
856 case 'U':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 assert(obj && PyUnicode_Check(obj));
860 n += PyUnicode_GET_SIZE(obj);
861 break;
862 }
863 case 'V':
864 {
865 PyObject *obj = va_arg(count, PyObject *);
866 const char *str = va_arg(count, const char *);
867 assert(obj || str);
868 assert(!obj || PyUnicode_Check(obj));
869 if (obj)
870 n += PyUnicode_GET_SIZE(obj);
871 else
872 n += strlen(str);
873 break;
874 }
875 case 'S':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 PyObject *str;
879 assert(obj);
880 str = PyObject_Str(obj);
881 if (!str)
882 goto fail;
883 n += PyUnicode_GET_SIZE(str);
884 /* Remember the str and switch to the next slot */
885 *callresult++ = str;
886 break;
887 }
888 case 'R':
889 {
890 PyObject *obj = va_arg(count, PyObject *);
891 PyObject *repr;
892 assert(obj);
893 repr = PyObject_Repr(obj);
894 if (!repr)
895 goto fail;
896 n += PyUnicode_GET_SIZE(repr);
897 /* Remember the repr and switch to the next slot */
898 *callresult++ = repr;
899 break;
900 }
901 case 'A':
902 {
903 PyObject *obj = va_arg(count, PyObject *);
904 PyObject *ascii;
905 assert(obj);
906 ascii = PyObject_ASCII(obj);
907 if (!ascii)
908 goto fail;
909 n += PyUnicode_GET_SIZE(ascii);
910 /* Remember the repr and switch to the next slot */
911 *callresult++ = ascii;
912 break;
913 }
914 case 'p':
915 (void) va_arg(count, int);
916 /* maximum 64-bit pointer representation:
917 * 0xffffffffffffffff
918 * so 19 characters is enough.
919 * XXX I count 18 -- what's the extra for?
920 */
921 n += 19;
922 break;
923 default:
924 /* if we stumble upon an unknown
925 formatting code, copy the rest of
926 the format string to the output
927 string. (we cannot just skip the
928 code, since there's no way to know
929 what's in the argument list) */
930 n += strlen(p);
931 goto expand;
932 }
933 } else
934 n++;
935 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000937 if (abuffersize > ITEM_BUFFER_LEN) {
938 /* add 1 for sprintf's trailing null byte */
939 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000940 if (!abuffer) {
941 PyErr_NoMemory();
942 goto fail;
943 }
944 realbuffer = abuffer;
945 }
946 else
947 realbuffer = buffer;
948 /* step 4: fill the buffer */
949 /* Since we've analyzed how much space we need for the worst case,
950 we don't have to resize the string.
951 There can be no errors beyond this point. */
952 string = PyUnicode_FromUnicode(NULL, n);
953 if (!string)
954 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000955
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 s = PyUnicode_AS_UNICODE(string);
957 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000958
Benjamin Peterson14339b62009-01-31 16:36:08 +0000959 for (f = format; *f; f++) {
960 if (*f == '%') {
961 const char* p = f++;
962 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000963 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 int size_tflag = 0;
965 zeropad = (*f == '0');
966 /* parse the width.precision part */
967 width = 0;
968 while (ISDIGIT((unsigned)*f))
969 width = (width*10) + *f++ - '0';
970 precision = 0;
971 if (*f == '.') {
972 f++;
973 while (ISDIGIT((unsigned)*f))
974 precision = (precision*10) + *f++ - '0';
975 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000976 /* Handle %ld, %lu, %lld and %llu. */
977 if (*f == 'l') {
978 if (f[1] == 'd' || f[1] == 'u') {
979 longflag = 1;
980 ++f;
981 }
982#ifdef HAVE_LONG_LONG
983 else if (f[1] == 'l' &&
984 (f[2] == 'd' || f[2] == 'u')) {
985 longlongflag = 1;
986 f += 2;
987 }
988#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 }
990 /* handle the size_t flag. */
991 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
992 size_tflag = 1;
993 ++f;
994 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 switch (*f) {
997 case 'c':
998 *s++ = va_arg(vargs, int);
999 break;
1000 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1002 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001003 if (longflag)
1004 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001005#ifdef HAVE_LONG_LONG
1006 else if (longlongflag)
1007 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1008#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001009 else if (size_tflag)
1010 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1011 else
1012 sprintf(realbuffer, fmt, va_arg(vargs, int));
1013 appendstring(realbuffer);
1014 break;
1015 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001016 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1017 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 if (longflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001020#ifdef HAVE_LONG_LONG
1021 else if (longlongflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs,
1023 unsigned PY_LONG_LONG));
1024#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001025 else if (size_tflag)
1026 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1027 else
1028 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1029 appendstring(realbuffer);
1030 break;
1031 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 sprintf(realbuffer, fmt, va_arg(vargs, int));
1034 appendstring(realbuffer);
1035 break;
1036 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001037 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 's':
1042 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001043 /* unused, since we already have the result */
1044 (void) va_arg(vargs, char *);
1045 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1046 PyUnicode_GET_SIZE(*callresult));
1047 s += PyUnicode_GET_SIZE(*callresult);
1048 /* We're done with the unicode()/repr() => forget it */
1049 Py_DECREF(*callresult);
1050 /* switch to next unicode()/repr() result */
1051 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001052 break;
1053 }
1054 case 'U':
1055 {
1056 PyObject *obj = va_arg(vargs, PyObject *);
1057 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1058 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1059 s += size;
1060 break;
1061 }
1062 case 'V':
1063 {
1064 PyObject *obj = va_arg(vargs, PyObject *);
1065 const char *str = va_arg(vargs, const char *);
1066 if (obj) {
1067 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1068 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1069 s += size;
1070 } else {
1071 appendstring(str);
1072 }
1073 break;
1074 }
1075 case 'S':
1076 case 'R':
1077 {
1078 Py_UNICODE *ucopy;
1079 Py_ssize_t usize;
1080 Py_ssize_t upos;
1081 /* unused, since we already have the result */
1082 (void) va_arg(vargs, PyObject *);
1083 ucopy = PyUnicode_AS_UNICODE(*callresult);
1084 usize = PyUnicode_GET_SIZE(*callresult);
1085 for (upos = 0; upos<usize;)
1086 *s++ = ucopy[upos++];
1087 /* We're done with the unicode()/repr() => forget it */
1088 Py_DECREF(*callresult);
1089 /* switch to next unicode()/repr() result */
1090 ++callresult;
1091 break;
1092 }
1093 case 'p':
1094 sprintf(buffer, "%p", va_arg(vargs, void*));
1095 /* %p is ill-defined: ensure leading 0x. */
1096 if (buffer[1] == 'X')
1097 buffer[1] = 'x';
1098 else if (buffer[1] != 'x') {
1099 memmove(buffer+2, buffer, strlen(buffer)+1);
1100 buffer[0] = '0';
1101 buffer[1] = 'x';
1102 }
1103 appendstring(buffer);
1104 break;
1105 case '%':
1106 *s++ = '%';
1107 break;
1108 default:
1109 appendstring(p);
1110 goto end;
1111 }
Victor Stinner1205f272010-09-11 00:54:47 +00001112 }
Victor Stinner1205f272010-09-11 00:54:47 +00001113 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001114 *s++ = *f;
1115 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 if (callresults)
1119 PyObject_Free(callresults);
1120 if (abuffer)
1121 PyObject_Free(abuffer);
1122 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1123 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 if (callresults) {
1126 PyObject **callresult2 = callresults;
1127 while (callresult2 < callresult) {
1128 Py_DECREF(*callresult2);
1129 ++callresult2;
1130 }
1131 PyObject_Free(callresults);
1132 }
1133 if (abuffer)
1134 PyObject_Free(abuffer);
1135 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001136}
1137
1138#undef appendstring
1139
1140PyObject *
1141PyUnicode_FromFormat(const char *format, ...)
1142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 PyObject* ret;
1144 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001145
1146#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001147 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001150#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 ret = PyUnicode_FromFormatV(format, vargs);
1152 va_end(vargs);
1153 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001154}
1155
Victor Stinner137c34c2010-09-29 10:25:54 +00001156static void
1157unicode_aswidechar(PyUnicodeObject *unicode,
1158 wchar_t *w,
1159 Py_ssize_t size)
1160{
1161#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1162 memcpy(w, unicode->str, size * sizeof(wchar_t));
1163#else
1164 register Py_UNICODE *u;
1165 register Py_ssize_t i;
1166 u = PyUnicode_AS_UNICODE(unicode);
1167 for (i = size; i > 0; i--)
1168 *w++ = *u++;
1169#endif
1170}
1171
1172Py_ssize_t
1173PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1174 wchar_t *w,
1175 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176{
1177 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 PyErr_BadInternalCall();
1179 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001181
1182 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001185
Victor Stinner137c34c2010-09-29 10:25:54 +00001186 unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001188 if (size > PyUnicode_GET_SIZE(unicode))
1189 return PyUnicode_GET_SIZE(unicode);
1190 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001191 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192}
1193
Victor Stinner137c34c2010-09-29 10:25:54 +00001194wchar_t*
1195PyUnicode_AsWideCharString(PyUnicodeObject *unicode,
1196 Py_ssize_t *size)
1197{
1198 wchar_t* buffer;
1199 Py_ssize_t buflen;
1200
1201 if (unicode == NULL) {
1202 PyErr_BadInternalCall();
1203 return NULL;
1204 }
1205
1206 if ((PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) < PyUnicode_GET_SIZE(unicode)) {
1207 PyErr_NoMemory();
1208 return NULL;
1209 }
1210
1211 buflen = PyUnicode_GET_SIZE(unicode) + 1; /* copy L'\0' */
1212 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1213 if (buffer == NULL) {
1214 PyErr_NoMemory();
1215 return NULL;
1216 }
1217 unicode_aswidechar(unicode, buffer, buflen);
1218 return buffer;
1219}
1220
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221#endif
1222
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001223PyObject *PyUnicode_FromOrdinal(int ordinal)
1224{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001225 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001226
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001227 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 PyErr_SetString(PyExc_ValueError,
1229 "chr() arg not in range(0x110000)");
1230 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001231 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001232
1233#ifndef Py_UNICODE_WIDE
1234 if (ordinal > 0xffff) {
1235 ordinal -= 0x10000;
1236 s[0] = 0xD800 | (ordinal >> 10);
1237 s[1] = 0xDC00 | (ordinal & 0x3FF);
1238 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001239 }
1240#endif
1241
Hye-Shik Chang40574832004-04-06 07:24:51 +00001242 s[0] = (Py_UNICODE)ordinal;
1243 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001244}
1245
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246PyObject *PyUnicode_FromObject(register PyObject *obj)
1247{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001248 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001249 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001250 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001251 Py_INCREF(obj);
1252 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001253 }
1254 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001255 /* For a Unicode subtype that's not a Unicode object,
1256 return a true Unicode object with the same data. */
1257 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1258 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001259 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001260 PyErr_Format(PyExc_TypeError,
1261 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001262 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001263 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001264}
1265
1266PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001267 const char *encoding,
1268 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001269{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001270 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001271 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001272
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 PyErr_BadInternalCall();
1275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001277
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001278 /* Decoding bytes objects is the most common case and should be fast */
1279 if (PyBytes_Check(obj)) {
1280 if (PyBytes_GET_SIZE(obj) == 0) {
1281 Py_INCREF(unicode_empty);
1282 v = (PyObject *) unicode_empty;
1283 }
1284 else {
1285 v = PyUnicode_Decode(
1286 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1287 encoding, errors);
1288 }
1289 return v;
1290 }
1291
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001292 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001293 PyErr_SetString(PyExc_TypeError,
1294 "decoding str is not supported");
1295 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001296 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001297
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001298 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1299 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1300 PyErr_Format(PyExc_TypeError,
1301 "coercing to str: need bytes, bytearray "
1302 "or buffer-like object, %.80s found",
1303 Py_TYPE(obj)->tp_name);
1304 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001305 }
Tim Petersced69f82003-09-16 20:30:58 +00001306
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001307 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001308 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001309 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 }
Tim Petersced69f82003-09-16 20:30:58 +00001311 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001312 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001313
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001314 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001315 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316}
1317
/* Write a normalized copy of `encoding` into `lower`: upper case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is recognised.
   `lower_len` is the size of the `lower` array.

   Return 1 on success (`lower` is NUL-terminated), 0 if the encoding name
   does not fit in lower_len-1 characters (`lower` is then left
   unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last slot of the array, reserved for the terminating NUL */
    char *const last = &lower[lower_len - 1];

    for (; *src != '\0'; src++) {
        char ch;

        if (dst == last)
            return 0;

        if (ISUPPER(*src))
            ch = TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1350
1351PyObject *PyUnicode_Decode(const char *s,
1352 Py_ssize_t size,
1353 const char *encoding,
1354 const char *errors)
1355{
1356 PyObject *buffer = NULL, *unicode;
1357 Py_buffer info;
1358 char lower[11]; /* Enough for any encoding shortcut */
1359
1360 if (encoding == NULL)
1361 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001362
1363 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001364 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1365 if (strcmp(lower, "utf-8") == 0)
1366 return PyUnicode_DecodeUTF8(s, size, errors);
1367 else if ((strcmp(lower, "latin-1") == 0) ||
1368 (strcmp(lower, "iso-8859-1") == 0))
1369 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001370#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001371 else if (strcmp(lower, "mbcs") == 0)
1372 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001373#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001374 else if (strcmp(lower, "ascii") == 0)
1375 return PyUnicode_DecodeASCII(s, size, errors);
1376 else if (strcmp(lower, "utf-16") == 0)
1377 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1378 else if (strcmp(lower, "utf-32") == 0)
1379 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381
1382 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001383 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001384 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001385 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001386 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 if (buffer == NULL)
1388 goto onError;
1389 unicode = PyCodec_Decode(buffer, encoding, errors);
1390 if (unicode == NULL)
1391 goto onError;
1392 if (!PyUnicode_Check(unicode)) {
1393 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001394 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001395 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 Py_DECREF(unicode);
1397 goto onError;
1398 }
1399 Py_DECREF(buffer);
1400 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001401
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 Py_XDECREF(buffer);
1404 return NULL;
1405}
1406
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001407PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1408 const char *encoding,
1409 const char *errors)
1410{
1411 PyObject *v;
1412
1413 if (!PyUnicode_Check(unicode)) {
1414 PyErr_BadArgument();
1415 goto onError;
1416 }
1417
1418 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001419 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001420
1421 /* Decode via the codec registry */
1422 v = PyCodec_Decode(unicode, encoding, errors);
1423 if (v == NULL)
1424 goto onError;
1425 return v;
1426
Benjamin Peterson29060642009-01-31 22:14:21 +00001427 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001428 return NULL;
1429}
1430
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001431PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1432 const char *encoding,
1433 const char *errors)
1434{
1435 PyObject *v;
1436
1437 if (!PyUnicode_Check(unicode)) {
1438 PyErr_BadArgument();
1439 goto onError;
1440 }
1441
1442 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001443 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001444
1445 /* Decode via the codec registry */
1446 v = PyCodec_Decode(unicode, encoding, errors);
1447 if (v == NULL)
1448 goto onError;
1449 if (!PyUnicode_Check(v)) {
1450 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001451 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001452 Py_TYPE(v)->tp_name);
1453 Py_DECREF(v);
1454 goto onError;
1455 }
1456 return v;
1457
Benjamin Peterson29060642009-01-31 22:14:21 +00001458 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001459 return NULL;
1460}
1461
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001463 Py_ssize_t size,
1464 const char *encoding,
1465 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466{
1467 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001468
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 unicode = PyUnicode_FromUnicode(s, size);
1470 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1473 Py_DECREF(unicode);
1474 return v;
1475}
1476
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001477PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1478 const char *encoding,
1479 const char *errors)
1480{
1481 PyObject *v;
1482
1483 if (!PyUnicode_Check(unicode)) {
1484 PyErr_BadArgument();
1485 goto onError;
1486 }
1487
1488 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001489 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001490
1491 /* Encode via the codec registry */
1492 v = PyCodec_Encode(unicode, encoding, errors);
1493 if (v == NULL)
1494 goto onError;
1495 return v;
1496
Benjamin Peterson29060642009-01-31 22:14:21 +00001497 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001498 return NULL;
1499}
1500
Victor Stinnerae6265f2010-05-15 16:27:27 +00001501PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1502{
Victor Stinner313a1202010-06-11 23:56:51 +00001503 if (Py_FileSystemDefaultEncoding) {
1504#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1505 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1506 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1507 PyUnicode_GET_SIZE(unicode),
1508 NULL);
1509#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001510 return PyUnicode_AsEncodedString(unicode,
1511 Py_FileSystemDefaultEncoding,
1512 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001513 }
1514 else {
1515 /* if you change the default encoding, update also
1516 PyUnicode_DecodeFSDefaultAndSize() and redecode_filenames() */
Victor Stinnerae6265f2010-05-15 16:27:27 +00001517 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Victor Stinner3119ed72010-08-18 22:26:50 +00001518 PyUnicode_GET_SIZE(unicode),
1519 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001520 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00001521}
1522
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1524 const char *encoding,
1525 const char *errors)
1526{
1527 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001528 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001529
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 if (!PyUnicode_Check(unicode)) {
1531 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001532 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 }
Fred Drakee4315f52000-05-09 19:53:39 +00001534
Tim Petersced69f82003-09-16 20:30:58 +00001535 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001536 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001537
1538 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001539 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1540 if (strcmp(lower, "utf-8") == 0)
1541 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1542 PyUnicode_GET_SIZE(unicode),
1543 errors);
1544 else if ((strcmp(lower, "latin-1") == 0) ||
1545 (strcmp(lower, "iso-8859-1") == 0))
1546 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1547 PyUnicode_GET_SIZE(unicode),
1548 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001549#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001550 else if (strcmp(lower, "mbcs") == 0)
1551 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1552 PyUnicode_GET_SIZE(unicode),
1553 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001554#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001555 else if (strcmp(lower, "ascii") == 0)
1556 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1557 PyUnicode_GET_SIZE(unicode),
1558 errors);
1559 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001560 /* During bootstrap, we may need to find the encodings
1561 package, to load the file system encoding, and require the
1562 file system encoding in order to load the encodings
1563 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001564
Victor Stinner59e62db2010-05-15 13:14:32 +00001565 Break out of this dependency by assuming that the path to
1566 the encodings module is ASCII-only. XXX could try wcstombs
1567 instead, if the file system encoding is the locale's
1568 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001569 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001570 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1571 !PyThreadState_GET()->interp->codecs_initialized)
1572 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1573 PyUnicode_GET_SIZE(unicode),
1574 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001575
1576 /* Encode via the codec registry */
1577 v = PyCodec_Encode(unicode, encoding, errors);
1578 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001579 return NULL;
1580
1581 /* The normal path */
1582 if (PyBytes_Check(v))
1583 return v;
1584
1585 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001586 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001587 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001588 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001589
1590 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1591 "encoder %s returned bytearray instead of bytes",
1592 encoding);
1593 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001594 Py_DECREF(v);
1595 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001596 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001597
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001598 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1599 Py_DECREF(v);
1600 return b;
1601 }
1602
1603 PyErr_Format(PyExc_TypeError,
1604 "encoder did not return a bytes object (type=%.400s)",
1605 Py_TYPE(v)->tp_name);
1606 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001607 return NULL;
1608}
1609
1610PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1611 const char *encoding,
1612 const char *errors)
1613{
1614 PyObject *v;
1615
1616 if (!PyUnicode_Check(unicode)) {
1617 PyErr_BadArgument();
1618 goto onError;
1619 }
1620
1621 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001623
1624 /* Encode via the codec registry */
1625 v = PyCodec_Encode(unicode, encoding, errors);
1626 if (v == NULL)
1627 goto onError;
1628 if (!PyUnicode_Check(v)) {
1629 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001630 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001631 Py_TYPE(v)->tp_name);
1632 Py_DECREF(v);
1633 goto onError;
1634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001635 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001636
Benjamin Peterson29060642009-01-31 22:14:21 +00001637 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 return NULL;
1639}
1640
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001641PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001642 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001643{
1644 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001645 if (v)
1646 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001647 if (errors != NULL)
1648 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001649 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001650 PyUnicode_GET_SIZE(unicode),
1651 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001652 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001653 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001654 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001655 return v;
1656}
1657
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001658PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001659PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001660 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001661 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1662}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001663
Christian Heimes5894ba72007-11-04 11:43:14 +00001664PyObject*
1665PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1666{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001667 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1668 can be undefined. If it is case, decode using UTF-8. The following assumes
1669 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1670 bootstrapping process where the codecs aren't ready yet.
1671 */
1672 if (Py_FileSystemDefaultEncoding) {
1673#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001674 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001675 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001676 }
1677#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001678 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001679 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001680 }
1681#endif
1682 return PyUnicode_Decode(s, size,
1683 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001684 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001685 }
1686 else {
Victor Stinnerc39211f2010-09-29 16:35:47 +00001687 /* if you change the default encoding, update also
1688 PyUnicode_EncodeFSDefault() and redecode_filenames() */
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001689 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001690 }
1691}
1692
Martin v. Löwis011e8422009-05-05 04:43:17 +00001693
1694int
1695PyUnicode_FSConverter(PyObject* arg, void* addr)
1696{
1697 PyObject *output = NULL;
1698 Py_ssize_t size;
1699 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001700 if (arg == NULL) {
1701 Py_DECREF(*(PyObject**)addr);
1702 return 1;
1703 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001704 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001705 output = arg;
1706 Py_INCREF(output);
1707 }
1708 else {
1709 arg = PyUnicode_FromObject(arg);
1710 if (!arg)
1711 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001712 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001713 Py_DECREF(arg);
1714 if (!output)
1715 return 0;
1716 if (!PyBytes_Check(output)) {
1717 Py_DECREF(output);
1718 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1719 return 0;
1720 }
1721 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001722 size = PyBytes_GET_SIZE(output);
1723 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001724 if (size != strlen(data)) {
1725 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1726 Py_DECREF(output);
1727 return 0;
1728 }
1729 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001730 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001731}
1732
1733
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001734int
1735PyUnicode_FSDecoder(PyObject* arg, void* addr)
1736{
1737 PyObject *output = NULL;
1738 Py_ssize_t size;
1739 void *data;
1740 if (arg == NULL) {
1741 Py_DECREF(*(PyObject**)addr);
1742 return 1;
1743 }
1744 if (PyUnicode_Check(arg)) {
1745 output = arg;
1746 Py_INCREF(output);
1747 }
1748 else {
1749 arg = PyBytes_FromObject(arg);
1750 if (!arg)
1751 return 0;
1752 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1753 PyBytes_GET_SIZE(arg));
1754 Py_DECREF(arg);
1755 if (!output)
1756 return 0;
1757 if (!PyUnicode_Check(output)) {
1758 Py_DECREF(output);
1759 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1760 return 0;
1761 }
1762 }
1763 size = PyUnicode_GET_SIZE(output);
1764 data = PyUnicode_AS_UNICODE(output);
1765 if (size != Py_UNICODE_strlen(data)) {
1766 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1767 Py_DECREF(output);
1768 return 0;
1769 }
1770 *(PyObject**)addr = output;
1771 return Py_CLEANUP_SUPPORTED;
1772}
1773
1774
Martin v. Löwis5b222132007-06-10 09:51:05 +00001775char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001776_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001777{
Christian Heimesf3863112007-11-22 07:46:41 +00001778 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001779 if (!PyUnicode_Check(unicode)) {
1780 PyErr_BadArgument();
1781 return NULL;
1782 }
Christian Heimesf3863112007-11-22 07:46:41 +00001783 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1784 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001785 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001786 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001787 *psize = PyBytes_GET_SIZE(bytes);
1788 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001789}
1790
1791char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001792_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001793{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001794 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001795}
1796
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1798{
1799 if (!PyUnicode_Check(unicode)) {
1800 PyErr_BadArgument();
1801 goto onError;
1802 }
1803 return PyUnicode_AS_UNICODE(unicode);
1804
Benjamin Peterson29060642009-01-31 22:14:21 +00001805 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 return NULL;
1807}
1808
Martin v. Löwis18e16552006-02-15 17:27:45 +00001809Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810{
1811 if (!PyUnicode_Check(unicode)) {
1812 PyErr_BadArgument();
1813 goto onError;
1814 }
1815 return PyUnicode_GET_SIZE(unicode);
1816
Benjamin Peterson29060642009-01-31 22:14:21 +00001817 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818 return -1;
1819}
1820
/* Return the name of the default encoding.  The value is the fixed
   string "utf-8"; the caller must not modify or free it. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1825
Victor Stinner554f3f02010-06-16 23:33:54 +00001826/* create or adjust a UnicodeDecodeError */
1827static void
1828make_decode_exception(PyObject **exceptionObject,
1829 const char *encoding,
1830 const char *input, Py_ssize_t length,
1831 Py_ssize_t startpos, Py_ssize_t endpos,
1832 const char *reason)
1833{
1834 if (*exceptionObject == NULL) {
1835 *exceptionObject = PyUnicodeDecodeError_Create(
1836 encoding, input, length, startpos, endpos, reason);
1837 }
1838 else {
1839 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1840 goto onError;
1841 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1842 goto onError;
1843 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1844 goto onError;
1845 }
1846 return;
1847
1848onError:
1849 Py_DECREF(*exceptionObject);
1850 *exceptionObject = NULL;
1851}
1852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001853/* error handling callback helper:
1854 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001855 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001856 and adjust various state variables.
1857 return 0 on success, -1 on error
1858*/
1859
1860static
1861int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001862 const char *encoding, const char *reason,
1863 const char **input, const char **inend, Py_ssize_t *startinpos,
1864 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1865 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001866{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001867 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868
1869 PyObject *restuple = NULL;
1870 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001871 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001872 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001873 Py_ssize_t requiredsize;
1874 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001875 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001876 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001877 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 int res = -1;
1879
1880 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 *errorHandler = PyCodec_LookupError(errors);
1882 if (*errorHandler == NULL)
1883 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 }
1885
Victor Stinner554f3f02010-06-16 23:33:54 +00001886 make_decode_exception(exceptionObject,
1887 encoding,
1888 *input, *inend - *input,
1889 *startinpos, *endinpos,
1890 reason);
1891 if (*exceptionObject == NULL)
1892 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001893
1894 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1895 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001896 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001897 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001898 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001899 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001900 }
1901 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001902 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001903
1904 /* Copy back the bytes variables, which might have been modified by the
1905 callback */
1906 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1907 if (!inputobj)
1908 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001909 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001910 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001911 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001912 *input = PyBytes_AS_STRING(inputobj);
1913 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001914 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001915 /* we can DECREF safely, as the exception has another reference,
1916 so the object won't go away. */
1917 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001918
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001920 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001921 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001922 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1923 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001924 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001925
1926 /* need more space? (at least enough for what we
1927 have+the replacement+the rest of the string (starting
1928 at the new input position), so we won't have to check space
1929 when there are no errors in the rest of the string) */
1930 repptr = PyUnicode_AS_UNICODE(repunicode);
1931 repsize = PyUnicode_GET_SIZE(repunicode);
1932 requiredsize = *outpos + repsize + insize-newpos;
1933 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 if (requiredsize<2*outsize)
1935 requiredsize = 2*outsize;
1936 if (_PyUnicode_Resize(output, requiredsize) < 0)
1937 goto onError;
1938 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001939 }
1940 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001941 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001942 Py_UNICODE_COPY(*outptr, repptr, repsize);
1943 *outptr += repsize;
1944 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001945
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 /* we made it! */
1947 res = 0;
1948
Benjamin Peterson29060642009-01-31 22:14:21 +00001949 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001950 Py_XDECREF(restuple);
1951 return res;
1952}
1953
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  All of them may evaluate
   their argument more than once, so pass side-effect-free expressions
   only. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z'))

/* Given that c is a base-64 character, what is its base-64 value?
   (Any character that is not a letter, a digit or '+' maps to 63.) */

#define FROM_BASE64(c) \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
1988
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; only 1..127 can ever be direct, and the
 *             range test comes first so the table is never indexed
 *             out of bounds.  c is evaluated several times.
 * directO  -- non-zero if Set O characters are written as themselves
 * directWS -- non-zero if whitespace is written as itself
 *
 * The flag arguments are parenthesized in the expansion so that
 * compound expressions (e.g. "a || b") keep their meaning when
 * combined with && below. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002034
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002035PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002036 Py_ssize_t size,
2037 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002038{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002039 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2040}
2041
Antoine Pitrou244651a2009-05-04 18:56:13 +00002042/* The decoder. The only state we preserve is our read position,
2043 * i.e. how many characters we have consumed. So if we end in the
2044 * middle of a shift sequence we have to back off the read position
2045 * and the output to the beginning of the sequence, otherwise we lose
2046 * all the shift state (seen bits, number of bits seen, high
2047 * surrogate). */
2048
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002049PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002050 Py_ssize_t size,
2051 const char *errors,
2052 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002055 Py_ssize_t startinpos;
2056 Py_ssize_t endinpos;
2057 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002058 const char *e;
2059 PyUnicodeObject *unicode;
2060 Py_UNICODE *p;
2061 const char *errmsg = "";
2062 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002063 Py_UNICODE *shiftOutStart;
2064 unsigned int base64bits = 0;
2065 unsigned long base64buffer = 0;
2066 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002067 PyObject *errorHandler = NULL;
2068 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002069
2070 unicode = _PyUnicode_New(size);
2071 if (!unicode)
2072 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002073 if (size == 0) {
2074 if (consumed)
2075 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002076 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002077 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078
2079 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002080 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002081 e = s + size;
2082
2083 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002085 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002086 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002087
Antoine Pitrou244651a2009-05-04 18:56:13 +00002088 if (inShift) { /* in a base-64 section */
2089 if (IS_BASE64(ch)) { /* consume a base-64 character */
2090 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2091 base64bits += 6;
2092 s++;
2093 if (base64bits >= 16) {
2094 /* we have enough bits for a UTF-16 value */
2095 Py_UNICODE outCh = (Py_UNICODE)
2096 (base64buffer >> (base64bits-16));
2097 base64bits -= 16;
2098 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2099 if (surrogate) {
2100 /* expecting a second surrogate */
2101 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2102#ifdef Py_UNICODE_WIDE
2103 *p++ = (((surrogate & 0x3FF)<<10)
2104 | (outCh & 0x3FF)) + 0x10000;
2105#else
2106 *p++ = surrogate;
2107 *p++ = outCh;
2108#endif
2109 surrogate = 0;
2110 }
2111 else {
2112 surrogate = 0;
2113 errmsg = "second surrogate missing";
2114 goto utf7Error;
2115 }
2116 }
2117 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2118 /* first surrogate */
2119 surrogate = outCh;
2120 }
2121 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2122 errmsg = "unexpected second surrogate";
2123 goto utf7Error;
2124 }
2125 else {
2126 *p++ = outCh;
2127 }
2128 }
2129 }
2130 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002131 inShift = 0;
2132 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002133 if (surrogate) {
2134 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002135 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002136 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002137 if (base64bits > 0) { /* left-over bits */
2138 if (base64bits >= 6) {
2139 /* We've seen at least one base-64 character */
2140 errmsg = "partial character in shift sequence";
2141 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002142 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002143 else {
2144 /* Some bits remain; they should be zero */
2145 if (base64buffer != 0) {
2146 errmsg = "non-zero padding bits in shift sequence";
2147 goto utf7Error;
2148 }
2149 }
2150 }
2151 if (ch != '-') {
2152 /* '-' is absorbed; other terminating
2153 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002154 *p++ = ch;
2155 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002156 }
2157 }
2158 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002159 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002160 s++; /* consume '+' */
2161 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002162 s++;
2163 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002164 }
2165 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002166 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002167 shiftOutStart = p;
2168 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169 }
2170 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002171 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002172 *p++ = ch;
2173 s++;
2174 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002175 else {
2176 startinpos = s-starts;
2177 s++;
2178 errmsg = "unexpected special character";
2179 goto utf7Error;
2180 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002181 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002182utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 outpos = p-PyUnicode_AS_UNICODE(unicode);
2184 endinpos = s-starts;
2185 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002186 errors, &errorHandler,
2187 "utf7", errmsg,
2188 &starts, &e, &startinpos, &endinpos, &exc, &s,
2189 &unicode, &outpos, &p))
2190 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002191 }
2192
Antoine Pitrou244651a2009-05-04 18:56:13 +00002193 /* end of string */
2194
2195 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2196 /* if we're in an inconsistent state, that's an error */
2197 if (surrogate ||
2198 (base64bits >= 6) ||
2199 (base64bits > 0 && base64buffer != 0)) {
2200 outpos = p-PyUnicode_AS_UNICODE(unicode);
2201 endinpos = size;
2202 if (unicode_decode_call_errorhandler(
2203 errors, &errorHandler,
2204 "utf7", "unterminated shift sequence",
2205 &starts, &e, &startinpos, &endinpos, &exc, &s,
2206 &unicode, &outpos, &p))
2207 goto onError;
2208 if (s < e)
2209 goto restart;
2210 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002211 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002212
2213 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002214 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002215 if (inShift) {
2216 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002217 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002218 }
2219 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002220 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002221 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002222 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002223
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002224 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002225 goto onError;
2226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002227 Py_XDECREF(errorHandler);
2228 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002229 return (PyObject *)unicode;
2230
Benjamin Peterson29060642009-01-31 22:14:21 +00002231 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002232 Py_XDECREF(errorHandler);
2233 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002234 Py_DECREF(unicode);
2235 return NULL;
2236}
2237
2238
2239PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002240 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002241 int base64SetO,
2242 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002243 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002244{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002245 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002246 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002247 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002248 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002249 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002250 unsigned int base64bits = 0;
2251 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002252 char * out;
2253 char * start;
2254
2255 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002256 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002257
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002258 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002259 return PyErr_NoMemory();
2260
Antoine Pitrou244651a2009-05-04 18:56:13 +00002261 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002262 if (v == NULL)
2263 return NULL;
2264
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002265 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002266 for (;i < size; ++i) {
2267 Py_UNICODE ch = s[i];
2268
Antoine Pitrou244651a2009-05-04 18:56:13 +00002269 if (inShift) {
2270 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2271 /* shifting out */
2272 if (base64bits) { /* output remaining bits */
2273 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2274 base64buffer = 0;
2275 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002276 }
2277 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002278 /* Characters not in the BASE64 set implicitly unshift the sequence
2279 so no '-' is required, except if the character is itself a '-' */
2280 if (IS_BASE64(ch) || ch == '-') {
2281 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002282 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002283 *out++ = (char) ch;
2284 }
2285 else {
2286 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002287 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002288 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002289 else { /* not in a shift sequence */
2290 if (ch == '+') {
2291 *out++ = '+';
2292 *out++ = '-';
2293 }
2294 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2295 *out++ = (char) ch;
2296 }
2297 else {
2298 *out++ = '+';
2299 inShift = 1;
2300 goto encode_char;
2301 }
2302 }
2303 continue;
2304encode_char:
2305#ifdef Py_UNICODE_WIDE
2306 if (ch >= 0x10000) {
2307 /* code first surrogate */
2308 base64bits += 16;
2309 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2310 while (base64bits >= 6) {
2311 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2312 base64bits -= 6;
2313 }
2314 /* prepare second surrogate */
2315 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2316 }
2317#endif
2318 base64bits += 16;
2319 base64buffer = (base64buffer << 16) | ch;
2320 while (base64bits >= 6) {
2321 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2322 base64bits -= 6;
2323 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002324 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002325 if (base64bits)
2326 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2327 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002328 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002329 if (_PyBytes_Resize(&v, out - start) < 0)
2330 return NULL;
2331 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002332}
2333
Antoine Pitrou244651a2009-05-04 18:56:13 +00002334#undef IS_BASE64
2335#undef FROM_BASE64
2336#undef TO_BASE64
2337#undef DECODE_DIRECT
2338#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002339
Guido van Rossumd57fd912000-03-10 22:53:23 +00002340/* --- UTF-8 Codec -------------------------------------------------------- */
2341
/* Total length in bytes of a UTF-8 sequence, indexed by its first byte.
   An entry of 0 marks a byte that may not start a sequence: continuation
   bytes (80-BF), the overlong lead bytes C0/C1, and F5-FF.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F: ASCII */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F: continuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 invalid, C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4, F5-FF invalid */
};
2363
Guido van Rossumd57fd912000-03-10 22:53:23 +00002364PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002365 Py_ssize_t size,
2366 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367{
Walter Dörwald69652032004-09-07 20:24:22 +00002368 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2369}
2370
Antoine Pitrouab868312009-01-10 15:40:25 +00002371/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2372#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2373
2374/* Mask to quickly check whether a C 'long' contains a
2375 non-ASCII, UTF8-encoded char. */
2376#if (SIZEOF_LONG == 8)
2377# define ASCII_CHAR_MASK 0x8080808080808080L
2378#elif (SIZEOF_LONG == 4)
2379# define ASCII_CHAR_MASK 0x80808080L
2380#else
2381# error C 'long' size should be either 4 or 8!
2382#endif
2383
Walter Dörwald69652032004-09-07 20:24:22 +00002384PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002385 Py_ssize_t size,
2386 const char *errors,
2387 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002388{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002389 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002391 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002392 Py_ssize_t startinpos;
2393 Py_ssize_t endinpos;
2394 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002395 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002396 PyUnicodeObject *unicode;
2397 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002398 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002399 PyObject *errorHandler = NULL;
2400 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002401
2402 /* Note: size will always be longer than the resulting Unicode
2403 character count */
2404 unicode = _PyUnicode_New(size);
2405 if (!unicode)
2406 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002407 if (size == 0) {
2408 if (consumed)
2409 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412
2413 /* Unpack UTF-8 encoded data */
2414 p = unicode->str;
2415 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002416 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002417
2418 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002419 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420
2421 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002422 /* Fast path for runs of ASCII characters. Given that common UTF-8
2423 input will consist of an overwhelming majority of ASCII
2424 characters, we try to optimize for this case by checking
2425 as many characters as a C 'long' can contain.
2426 First, check if we can do an aligned read, as most CPUs have
2427 a penalty for unaligned reads.
2428 */
2429 if (!((size_t) s & LONG_PTR_MASK)) {
2430 /* Help register allocation */
2431 register const char *_s = s;
2432 register Py_UNICODE *_p = p;
2433 while (_s < aligned_end) {
2434 /* Read a whole long at a time (either 4 or 8 bytes),
2435 and do a fast unrolled copy if it only contains ASCII
2436 characters. */
2437 unsigned long data = *(unsigned long *) _s;
2438 if (data & ASCII_CHAR_MASK)
2439 break;
2440 _p[0] = (unsigned char) _s[0];
2441 _p[1] = (unsigned char) _s[1];
2442 _p[2] = (unsigned char) _s[2];
2443 _p[3] = (unsigned char) _s[3];
2444#if (SIZEOF_LONG == 8)
2445 _p[4] = (unsigned char) _s[4];
2446 _p[5] = (unsigned char) _s[5];
2447 _p[6] = (unsigned char) _s[6];
2448 _p[7] = (unsigned char) _s[7];
2449#endif
2450 _s += SIZEOF_LONG;
2451 _p += SIZEOF_LONG;
2452 }
2453 s = _s;
2454 p = _p;
2455 if (s == e)
2456 break;
2457 ch = (unsigned char)*s;
2458 }
2459 }
2460
2461 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002462 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 s++;
2464 continue;
2465 }
2466
2467 n = utf8_code_length[ch];
2468
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002469 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002470 if (consumed)
2471 break;
2472 else {
2473 errmsg = "unexpected end of data";
2474 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002475 endinpos = startinpos+1;
2476 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2477 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002478 goto utf8Error;
2479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481
2482 switch (n) {
2483
2484 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002485 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 startinpos = s-starts;
2487 endinpos = startinpos+1;
2488 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489
2490 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002491 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002492 startinpos = s-starts;
2493 endinpos = startinpos+1;
2494 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495
2496 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002497 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002498 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002499 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002500 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002501 goto utf8Error;
2502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002504 assert ((ch > 0x007F) && (ch <= 0x07FF));
2505 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506 break;
2507
2508 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002509 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2510 will result in surrogates in range d800-dfff. Surrogates are
2511 not valid UTF-8 so they are rejected.
2512 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2513 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002514 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002515 (s[2] & 0xc0) != 0x80 ||
2516 ((unsigned char)s[0] == 0xE0 &&
2517 (unsigned char)s[1] < 0xA0) ||
2518 ((unsigned char)s[0] == 0xED &&
2519 (unsigned char)s[1] > 0x9F)) {
2520 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002521 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002522 endinpos = startinpos + 1;
2523
2524 /* if s[1] first two bits are 1 and 0, then the invalid
2525 continuation byte is s[2], so increment endinpos by 1,
2526 if not, s[1] is invalid and endinpos doesn't need to
2527 be incremented. */
2528 if ((s[1] & 0xC0) == 0x80)
2529 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002530 goto utf8Error;
2531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002533 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2534 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002535 break;
2536
2537 case 4:
2538 if ((s[1] & 0xc0) != 0x80 ||
2539 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002540 (s[3] & 0xc0) != 0x80 ||
2541 ((unsigned char)s[0] == 0xF0 &&
2542 (unsigned char)s[1] < 0x90) ||
2543 ((unsigned char)s[0] == 0xF4 &&
2544 (unsigned char)s[1] > 0x8F)) {
2545 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002546 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002547 endinpos = startinpos + 1;
2548 if ((s[1] & 0xC0) == 0x80) {
2549 endinpos++;
2550 if ((s[2] & 0xC0) == 0x80)
2551 endinpos++;
2552 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002553 goto utf8Error;
2554 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002555 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002556 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2557 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2558
Fredrik Lundh8f455852001-06-27 18:59:43 +00002559#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002560 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002561#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002562 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002563
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002564 /* translate from 10000..10FFFF to 0..FFFF */
2565 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002566
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002567 /* high surrogate = top 10 bits added to D800 */
2568 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002569
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002570 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002571 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002572#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 }
2575 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002576 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002577
Benjamin Peterson29060642009-01-31 22:14:21 +00002578 utf8Error:
2579 outpos = p-PyUnicode_AS_UNICODE(unicode);
2580 if (unicode_decode_call_errorhandler(
2581 errors, &errorHandler,
2582 "utf8", errmsg,
2583 &starts, &e, &startinpos, &endinpos, &exc, &s,
2584 &unicode, &outpos, &p))
2585 goto onError;
2586 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 }
Walter Dörwald69652032004-09-07 20:24:22 +00002588 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002589 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590
2591 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002592 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 goto onError;
2594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002595 Py_XDECREF(errorHandler);
2596 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 return (PyObject *)unicode;
2598
Benjamin Peterson29060642009-01-31 22:14:21 +00002599 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 Py_XDECREF(errorHandler);
2601 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 Py_DECREF(unicode);
2603 return NULL;
2604}
2605
Antoine Pitrouab868312009-01-10 15:40:25 +00002606#undef ASCII_CHAR_MASK
2607
2608
Tim Peters602f7402002-04-27 18:03:26 +00002609/* Allocation strategy: if the string is short, convert into a stack buffer
2610 and allocate exactly as much space needed at the end. Else allocate the
2611 maximum possible needed (4 result bytes per Unicode character), and return
2612 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002613*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002614PyObject *
2615PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002616 Py_ssize_t size,
2617 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618{
Tim Peters602f7402002-04-27 18:03:26 +00002619#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002620
Guido van Rossum98297ee2007-11-06 21:34:58 +00002621 Py_ssize_t i; /* index into s of next input byte */
2622 PyObject *result; /* result string object */
2623 char *p; /* next free byte in output buffer */
2624 Py_ssize_t nallocated; /* number of result bytes allocated */
2625 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002626 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002627 PyObject *errorHandler = NULL;
2628 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002629
Tim Peters602f7402002-04-27 18:03:26 +00002630 assert(s != NULL);
2631 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632
Tim Peters602f7402002-04-27 18:03:26 +00002633 if (size <= MAX_SHORT_UNICHARS) {
2634 /* Write into the stack buffer; nallocated can't overflow.
2635 * At the end, we'll allocate exactly as much heap space as it
2636 * turns out we need.
2637 */
2638 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002639 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002640 p = stackbuf;
2641 }
2642 else {
2643 /* Overallocate on the heap, and give the excess back at the end. */
2644 nallocated = size * 4;
2645 if (nallocated / 4 != size) /* overflow! */
2646 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002647 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002648 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002649 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002650 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002651 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002652
Tim Peters602f7402002-04-27 18:03:26 +00002653 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002654 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002655
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002656 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002657 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002658 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002659
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002661 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002662 *p++ = (char)(0xc0 | (ch >> 6));
2663 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002664 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002665#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002666 /* Special case: check for high and low surrogate */
2667 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2668 Py_UCS4 ch2 = s[i];
2669 /* Combine the two surrogates to form a UCS4 value */
2670 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2671 i++;
2672
2673 /* Encode UCS4 Unicode ordinals */
2674 *p++ = (char)(0xf0 | (ch >> 18));
2675 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002676 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2677 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002678 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002679#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002680 Py_ssize_t newpos;
2681 PyObject *rep;
2682 Py_ssize_t repsize, k;
2683 rep = unicode_encode_call_errorhandler
2684 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2685 s, size, &exc, i-1, i, &newpos);
2686 if (!rep)
2687 goto error;
2688
2689 if (PyBytes_Check(rep))
2690 repsize = PyBytes_GET_SIZE(rep);
2691 else
2692 repsize = PyUnicode_GET_SIZE(rep);
2693
2694 if (repsize > 4) {
2695 Py_ssize_t offset;
2696
2697 if (result == NULL)
2698 offset = p - stackbuf;
2699 else
2700 offset = p - PyBytes_AS_STRING(result);
2701
2702 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2703 /* integer overflow */
2704 PyErr_NoMemory();
2705 goto error;
2706 }
2707 nallocated += repsize - 4;
2708 if (result != NULL) {
2709 if (_PyBytes_Resize(&result, nallocated) < 0)
2710 goto error;
2711 } else {
2712 result = PyBytes_FromStringAndSize(NULL, nallocated);
2713 if (result == NULL)
2714 goto error;
2715 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2716 }
2717 p = PyBytes_AS_STRING(result) + offset;
2718 }
2719
2720 if (PyBytes_Check(rep)) {
2721 char *prep = PyBytes_AS_STRING(rep);
2722 for(k = repsize; k > 0; k--)
2723 *p++ = *prep++;
2724 } else /* rep is unicode */ {
2725 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2726 Py_UNICODE c;
2727
2728 for(k=0; k<repsize; k++) {
2729 c = prep[k];
2730 if (0x80 <= c) {
2731 raise_encode_exception(&exc, "utf-8", s, size,
2732 i-1, i, "surrogates not allowed");
2733 goto error;
2734 }
2735 *p++ = (char)prep[k];
2736 }
2737 }
2738 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002739#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002740 }
Victor Stinner445a6232010-04-22 20:01:57 +00002741#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002742 } else if (ch < 0x10000) {
2743 *p++ = (char)(0xe0 | (ch >> 12));
2744 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2745 *p++ = (char)(0x80 | (ch & 0x3f));
2746 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002747 /* Encode UCS4 Unicode ordinals */
2748 *p++ = (char)(0xf0 | (ch >> 18));
2749 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2750 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2751 *p++ = (char)(0x80 | (ch & 0x3f));
2752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002754
Guido van Rossum98297ee2007-11-06 21:34:58 +00002755 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002756 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002757 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002758 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002759 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002760 }
2761 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002762 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002763 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002764 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002765 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002766 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002767 Py_XDECREF(errorHandler);
2768 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002769 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002770 error:
2771 Py_XDECREF(errorHandler);
2772 Py_XDECREF(exc);
2773 Py_XDECREF(result);
2774 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002775
Tim Peters602f7402002-04-27 18:03:26 +00002776#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777}
2778
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2780{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 if (!PyUnicode_Check(unicode)) {
2782 PyErr_BadArgument();
2783 return NULL;
2784 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002785 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 PyUnicode_GET_SIZE(unicode),
2787 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788}
2789
Walter Dörwald41980ca2007-08-16 21:55:45 +00002790/* --- UTF-32 Codec ------------------------------------------------------- */
2791
2792PyObject *
2793PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002794 Py_ssize_t size,
2795 const char *errors,
2796 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002797{
2798 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2799}
2800
2801PyObject *
2802PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002803 Py_ssize_t size,
2804 const char *errors,
2805 int *byteorder,
2806 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002807{
2808 const char *starts = s;
2809 Py_ssize_t startinpos;
2810 Py_ssize_t endinpos;
2811 Py_ssize_t outpos;
2812 PyUnicodeObject *unicode;
2813 Py_UNICODE *p;
2814#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002815 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002816 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002817#else
2818 const int pairs = 0;
2819#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002820 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002821 int bo = 0; /* assume native ordering by default */
2822 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002823 /* Offsets from q for retrieving bytes in the right order. */
2824#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2825 int iorder[] = {0, 1, 2, 3};
2826#else
2827 int iorder[] = {3, 2, 1, 0};
2828#endif
2829 PyObject *errorHandler = NULL;
2830 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002831
Walter Dörwald41980ca2007-08-16 21:55:45 +00002832 q = (unsigned char *)s;
2833 e = q + size;
2834
2835 if (byteorder)
2836 bo = *byteorder;
2837
2838 /* Check for BOM marks (U+FEFF) in the input and adjust current
2839 byte order setting accordingly. In native mode, the leading BOM
2840 mark is skipped, in all other modes, it is copied to the output
2841 stream as-is (giving a ZWNBSP character). */
2842 if (bo == 0) {
2843 if (size >= 4) {
2844 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002845 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002846#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 if (bom == 0x0000FEFF) {
2848 q += 4;
2849 bo = -1;
2850 }
2851 else if (bom == 0xFFFE0000) {
2852 q += 4;
2853 bo = 1;
2854 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002855#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002856 if (bom == 0x0000FEFF) {
2857 q += 4;
2858 bo = 1;
2859 }
2860 else if (bom == 0xFFFE0000) {
2861 q += 4;
2862 bo = -1;
2863 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002864#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002865 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002866 }
2867
2868 if (bo == -1) {
2869 /* force LE */
2870 iorder[0] = 0;
2871 iorder[1] = 1;
2872 iorder[2] = 2;
2873 iorder[3] = 3;
2874 }
2875 else if (bo == 1) {
2876 /* force BE */
2877 iorder[0] = 3;
2878 iorder[1] = 2;
2879 iorder[2] = 1;
2880 iorder[3] = 0;
2881 }
2882
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002883 /* On narrow builds we split characters outside the BMP into two
2884 codepoints => count how much extra space we need. */
2885#ifndef Py_UNICODE_WIDE
2886 for (qq = q; qq < e; qq += 4)
2887 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2888 pairs++;
2889#endif
2890
2891 /* This might be one to much, because of a BOM */
2892 unicode = _PyUnicode_New((size+3)/4+pairs);
2893 if (!unicode)
2894 return NULL;
2895 if (size == 0)
2896 return (PyObject *)unicode;
2897
2898 /* Unpack UTF-32 encoded data */
2899 p = unicode->str;
2900
Walter Dörwald41980ca2007-08-16 21:55:45 +00002901 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002902 Py_UCS4 ch;
2903 /* remaining bytes at the end? (size should be divisible by 4) */
2904 if (e-q<4) {
2905 if (consumed)
2906 break;
2907 errmsg = "truncated data";
2908 startinpos = ((const char *)q)-starts;
2909 endinpos = ((const char *)e)-starts;
2910 goto utf32Error;
2911 /* The remaining input chars are ignored if the callback
2912 chooses to skip the input */
2913 }
2914 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2915 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002916
Benjamin Peterson29060642009-01-31 22:14:21 +00002917 if (ch >= 0x110000)
2918 {
2919 errmsg = "codepoint not in range(0x110000)";
2920 startinpos = ((const char *)q)-starts;
2921 endinpos = startinpos+4;
2922 goto utf32Error;
2923 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002924#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002925 if (ch >= 0x10000)
2926 {
2927 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2928 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2929 }
2930 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002931#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002932 *p++ = ch;
2933 q += 4;
2934 continue;
2935 utf32Error:
2936 outpos = p-PyUnicode_AS_UNICODE(unicode);
2937 if (unicode_decode_call_errorhandler(
2938 errors, &errorHandler,
2939 "utf32", errmsg,
2940 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2941 &unicode, &outpos, &p))
2942 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002943 }
2944
2945 if (byteorder)
2946 *byteorder = bo;
2947
2948 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002950
2951 /* Adjust length */
2952 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2953 goto onError;
2954
2955 Py_XDECREF(errorHandler);
2956 Py_XDECREF(exc);
2957 return (PyObject *)unicode;
2958
Benjamin Peterson29060642009-01-31 22:14:21 +00002959 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002960 Py_DECREF(unicode);
2961 Py_XDECREF(errorHandler);
2962 Py_XDECREF(exc);
2963 return NULL;
2964}
2965
2966PyObject *
2967PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002968 Py_ssize_t size,
2969 const char *errors,
2970 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002971{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002972 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002973 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002974 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002975#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002976 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002977#else
2978 const int pairs = 0;
2979#endif
2980 /* Offsets from p for storing byte pairs in the right order. */
2981#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2982 int iorder[] = {0, 1, 2, 3};
2983#else
2984 int iorder[] = {3, 2, 1, 0};
2985#endif
2986
Benjamin Peterson29060642009-01-31 22:14:21 +00002987#define STORECHAR(CH) \
2988 do { \
2989 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2990 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2991 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2992 p[iorder[0]] = (CH) & 0xff; \
2993 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002994 } while(0)
2995
2996 /* In narrow builds we can output surrogate pairs as one codepoint,
2997 so we need less space. */
2998#ifndef Py_UNICODE_WIDE
2999 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003000 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3001 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3002 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003003#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003004 nsize = (size - pairs + (byteorder == 0));
3005 bytesize = nsize * 4;
3006 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003008 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003009 if (v == NULL)
3010 return NULL;
3011
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003012 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003013 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003014 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003015 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003016 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003017
3018 if (byteorder == -1) {
3019 /* force LE */
3020 iorder[0] = 0;
3021 iorder[1] = 1;
3022 iorder[2] = 2;
3023 iorder[3] = 3;
3024 }
3025 else if (byteorder == 1) {
3026 /* force BE */
3027 iorder[0] = 3;
3028 iorder[1] = 2;
3029 iorder[2] = 1;
3030 iorder[3] = 0;
3031 }
3032
3033 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003034 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003035#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3037 Py_UCS4 ch2 = *s;
3038 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3039 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3040 s++;
3041 size--;
3042 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003043 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003044#endif
3045 STORECHAR(ch);
3046 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003047
3048 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003049 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003050#undef STORECHAR
3051}
3052
3053PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3054{
3055 if (!PyUnicode_Check(unicode)) {
3056 PyErr_BadArgument();
3057 return NULL;
3058 }
3059 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 PyUnicode_GET_SIZE(unicode),
3061 NULL,
3062 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003063}
3064
Guido van Rossumd57fd912000-03-10 22:53:23 +00003065/* --- UTF-16 Codec ------------------------------------------------------- */
3066
Tim Peters772747b2001-08-09 22:21:55 +00003067PyObject *
3068PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 Py_ssize_t size,
3070 const char *errors,
3071 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072{
Walter Dörwald69652032004-09-07 20:24:22 +00003073 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3074}
3075
Antoine Pitrouab868312009-01-10 15:40:25 +00003076/* Two masks for fast checking of whether a C 'long' may contain
3077 UTF16-encoded surrogate characters. This is an efficient heuristic,
3078 assuming that non-surrogate characters with a code point >= 0x8000 are
3079 rare in most input.
3080 FAST_CHAR_MASK is used when the input is in native byte ordering,
3081 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003082*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003083#if (SIZEOF_LONG == 8)
3084# define FAST_CHAR_MASK 0x8000800080008000L
3085# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3086#elif (SIZEOF_LONG == 4)
3087# define FAST_CHAR_MASK 0x80008000L
3088# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3089#else
3090# error C 'long' size should be either 4 or 8!
3091#endif
3092
Walter Dörwald69652032004-09-07 20:24:22 +00003093PyObject *
3094PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003095 Py_ssize_t size,
3096 const char *errors,
3097 int *byteorder,
3098 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003099{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003100 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003101 Py_ssize_t startinpos;
3102 Py_ssize_t endinpos;
3103 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 PyUnicodeObject *unicode;
3105 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003106 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003107 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003108 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003109 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003110 /* Offsets from q for retrieving byte pairs in the right order. */
3111#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3112 int ihi = 1, ilo = 0;
3113#else
3114 int ihi = 0, ilo = 1;
3115#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003116 PyObject *errorHandler = NULL;
3117 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118
3119 /* Note: size will always be longer than the resulting Unicode
3120 character count */
3121 unicode = _PyUnicode_New(size);
3122 if (!unicode)
3123 return NULL;
3124 if (size == 0)
3125 return (PyObject *)unicode;
3126
3127 /* Unpack UTF-16 encoded data */
3128 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003129 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003130 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131
3132 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003133 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003135 /* Check for BOM marks (U+FEFF) in the input and adjust current
3136 byte order setting accordingly. In native mode, the leading BOM
3137 mark is skipped, in all other modes, it is copied to the output
3138 stream as-is (giving a ZWNBSP character). */
3139 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003140 if (size >= 2) {
3141 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003142#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003143 if (bom == 0xFEFF) {
3144 q += 2;
3145 bo = -1;
3146 }
3147 else if (bom == 0xFFFE) {
3148 q += 2;
3149 bo = 1;
3150 }
Tim Petersced69f82003-09-16 20:30:58 +00003151#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003152 if (bom == 0xFEFF) {
3153 q += 2;
3154 bo = 1;
3155 }
3156 else if (bom == 0xFFFE) {
3157 q += 2;
3158 bo = -1;
3159 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003160#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003161 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163
Tim Peters772747b2001-08-09 22:21:55 +00003164 if (bo == -1) {
3165 /* force LE */
3166 ihi = 1;
3167 ilo = 0;
3168 }
3169 else if (bo == 1) {
3170 /* force BE */
3171 ihi = 0;
3172 ilo = 1;
3173 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003174#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3175 native_ordering = ilo < ihi;
3176#else
3177 native_ordering = ilo > ihi;
3178#endif
Tim Peters772747b2001-08-09 22:21:55 +00003179
Antoine Pitrouab868312009-01-10 15:40:25 +00003180 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003181 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003182 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003183 /* First check for possible aligned read of a C 'long'. Unaligned
3184 reads are more expensive, better to defer to another iteration. */
3185 if (!((size_t) q & LONG_PTR_MASK)) {
3186 /* Fast path for runs of non-surrogate chars. */
3187 register const unsigned char *_q = q;
3188 Py_UNICODE *_p = p;
3189 if (native_ordering) {
3190 /* Native ordering is simple: as long as the input cannot
3191 possibly contain a surrogate char, do an unrolled copy
3192 of several 16-bit code points to the target object.
3193 The non-surrogate check is done on several input bytes
3194 at a time (as many as a C 'long' can contain). */
3195 while (_q < aligned_end) {
3196 unsigned long data = * (unsigned long *) _q;
3197 if (data & FAST_CHAR_MASK)
3198 break;
3199 _p[0] = ((unsigned short *) _q)[0];
3200 _p[1] = ((unsigned short *) _q)[1];
3201#if (SIZEOF_LONG == 8)
3202 _p[2] = ((unsigned short *) _q)[2];
3203 _p[3] = ((unsigned short *) _q)[3];
3204#endif
3205 _q += SIZEOF_LONG;
3206 _p += SIZEOF_LONG / 2;
3207 }
3208 }
3209 else {
3210 /* Byteswapped ordering is similar, but we must decompose
3211 the copy bytewise, and take care of zero'ing out the
3212 upper bytes if the target object is in 32-bit units
3213 (that is, in UCS-4 builds). */
3214 while (_q < aligned_end) {
3215 unsigned long data = * (unsigned long *) _q;
3216 if (data & SWAPPED_FAST_CHAR_MASK)
3217 break;
3218 /* Zero upper bytes in UCS-4 builds */
3219#if (Py_UNICODE_SIZE > 2)
3220 _p[0] = 0;
3221 _p[1] = 0;
3222#if (SIZEOF_LONG == 8)
3223 _p[2] = 0;
3224 _p[3] = 0;
3225#endif
3226#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003227 /* Issue #4916; UCS-4 builds on big endian machines must
3228 fill the two last bytes of each 4-byte unit. */
3229#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3230# define OFF 2
3231#else
3232# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003233#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003234 ((unsigned char *) _p)[OFF + 1] = _q[0];
3235 ((unsigned char *) _p)[OFF + 0] = _q[1];
3236 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3237 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3238#if (SIZEOF_LONG == 8)
3239 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3240 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3241 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3242 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3243#endif
3244#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003245 _q += SIZEOF_LONG;
3246 _p += SIZEOF_LONG / 2;
3247 }
3248 }
3249 p = _p;
3250 q = _q;
3251 if (q >= e)
3252 break;
3253 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003254 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255
Benjamin Peterson14339b62009-01-31 16:36:08 +00003256 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003257
3258 if (ch < 0xD800 || ch > 0xDFFF) {
3259 *p++ = ch;
3260 continue;
3261 }
3262
3263 /* UTF-16 code pair: */
3264 if (q > e) {
3265 errmsg = "unexpected end of data";
3266 startinpos = (((const char *)q) - 2) - starts;
3267 endinpos = ((const char *)e) + 1 - starts;
3268 goto utf16Error;
3269 }
3270 if (0xD800 <= ch && ch <= 0xDBFF) {
3271 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3272 q += 2;
3273 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003274#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 *p++ = ch;
3276 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003277#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003279#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 continue;
3281 }
3282 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003283 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003284 startinpos = (((const char *)q)-4)-starts;
3285 endinpos = startinpos+2;
3286 goto utf16Error;
3287 }
3288
Benjamin Peterson14339b62009-01-31 16:36:08 +00003289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003290 errmsg = "illegal encoding";
3291 startinpos = (((const char *)q)-2)-starts;
3292 endinpos = startinpos+2;
3293 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003294
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 utf16Error:
3296 outpos = p - PyUnicode_AS_UNICODE(unicode);
3297 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003298 errors,
3299 &errorHandler,
3300 "utf16", errmsg,
3301 &starts,
3302 (const char **)&e,
3303 &startinpos,
3304 &endinpos,
3305 &exc,
3306 (const char **)&q,
3307 &unicode,
3308 &outpos,
3309 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003312 /* remaining byte at the end? (size should be even) */
3313 if (e == q) {
3314 if (!consumed) {
3315 errmsg = "truncated data";
3316 startinpos = ((const char *)q) - starts;
3317 endinpos = ((const char *)e) + 1 - starts;
3318 outpos = p - PyUnicode_AS_UNICODE(unicode);
3319 if (unicode_decode_call_errorhandler(
3320 errors,
3321 &errorHandler,
3322 "utf16", errmsg,
3323 &starts,
3324 (const char **)&e,
3325 &startinpos,
3326 &endinpos,
3327 &exc,
3328 (const char **)&q,
3329 &unicode,
3330 &outpos,
3331 &p))
3332 goto onError;
3333 /* The remaining input chars are ignored if the callback
3334 chooses to skip the input */
3335 }
3336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337
3338 if (byteorder)
3339 *byteorder = bo;
3340
Walter Dörwald69652032004-09-07 20:24:22 +00003341 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003343
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003345 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 goto onError;
3347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 Py_XDECREF(errorHandler);
3349 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 return (PyObject *)unicode;
3351
Benjamin Peterson29060642009-01-31 22:14:21 +00003352 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003354 Py_XDECREF(errorHandler);
3355 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 return NULL;
3357}
3358
Antoine Pitrouab868312009-01-10 15:40:25 +00003359#undef FAST_CHAR_MASK
3360#undef SWAPPED_FAST_CHAR_MASK
3361
Tim Peters772747b2001-08-09 22:21:55 +00003362PyObject *
3363PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003364 Py_ssize_t size,
3365 const char *errors,
3366 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003368 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003369 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003370 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003371#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003372 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003373#else
3374 const int pairs = 0;
3375#endif
Tim Peters772747b2001-08-09 22:21:55 +00003376 /* Offsets from p for storing byte pairs in the right order. */
3377#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3378 int ihi = 1, ilo = 0;
3379#else
3380 int ihi = 0, ilo = 1;
3381#endif
3382
Benjamin Peterson29060642009-01-31 22:14:21 +00003383#define STORECHAR(CH) \
3384 do { \
3385 p[ihi] = ((CH) >> 8) & 0xff; \
3386 p[ilo] = (CH) & 0xff; \
3387 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003388 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003390#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003391 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003392 if (s[i] >= 0x10000)
3393 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003394#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003395 /* 2 * (size + pairs + (byteorder == 0)) */
3396 if (size > PY_SSIZE_T_MAX ||
3397 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003398 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003399 nsize = size + pairs + (byteorder == 0);
3400 bytesize = nsize * 2;
3401 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003402 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003403 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 if (v == NULL)
3405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003407 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003409 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003410 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003411 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003412
3413 if (byteorder == -1) {
3414 /* force LE */
3415 ihi = 1;
3416 ilo = 0;
3417 }
3418 else if (byteorder == 1) {
3419 /* force BE */
3420 ihi = 0;
3421 ilo = 1;
3422 }
3423
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003424 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 Py_UNICODE ch = *s++;
3426 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003427#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 if (ch >= 0x10000) {
3429 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3430 ch = 0xD800 | ((ch-0x10000) >> 10);
3431 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003432#endif
Tim Peters772747b2001-08-09 22:21:55 +00003433 STORECHAR(ch);
3434 if (ch2)
3435 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003436 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003437
3438 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003439 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003440#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441}
3442
3443PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3444{
3445 if (!PyUnicode_Check(unicode)) {
3446 PyErr_BadArgument();
3447 return NULL;
3448 }
3449 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003450 PyUnicode_GET_SIZE(unicode),
3451 NULL,
3452 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453}
3454
3455/* --- Unicode Escape Codec ----------------------------------------------- */
3456
Fredrik Lundh06d12682001-01-24 07:59:11 +00003457static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003458
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003460 Py_ssize_t size,
3461 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003464 Py_ssize_t startinpos;
3465 Py_ssize_t endinpos;
3466 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003471 char* message;
3472 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 PyObject *errorHandler = NULL;
3474 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003475
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 /* Escaped strings will always be longer than the resulting
3477 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 length after conversion to the true value.
3479 (but if the error callback returns a long replacement string
3480 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481 v = _PyUnicode_New(size);
3482 if (v == NULL)
3483 goto onError;
3484 if (size == 0)
3485 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003486
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003489
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490 while (s < end) {
3491 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003492 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003493 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494
3495 /* Non-escape characters are interpreted as Unicode ordinals */
3496 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003497 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498 continue;
3499 }
3500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 /* \ - Escapes */
3503 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003504 c = *s++;
3505 if (s > end)
3506 c = '\0'; /* Invalid after \ */
3507 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508
Benjamin Peterson29060642009-01-31 22:14:21 +00003509 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 case '\n': break;
3511 case '\\': *p++ = '\\'; break;
3512 case '\'': *p++ = '\''; break;
3513 case '\"': *p++ = '\"'; break;
3514 case 'b': *p++ = '\b'; break;
3515 case 'f': *p++ = '\014'; break; /* FF */
3516 case 't': *p++ = '\t'; break;
3517 case 'n': *p++ = '\n'; break;
3518 case 'r': *p++ = '\r'; break;
3519 case 'v': *p++ = '\013'; break; /* VT */
3520 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3521
Benjamin Peterson29060642009-01-31 22:14:21 +00003522 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 case '0': case '1': case '2': case '3':
3524 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003525 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003526 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003527 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003528 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003529 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003531 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 break;
3533
Benjamin Peterson29060642009-01-31 22:14:21 +00003534 /* hex escapes */
3535 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003537 digits = 2;
3538 message = "truncated \\xXX escape";
3539 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540
Benjamin Peterson29060642009-01-31 22:14:21 +00003541 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003543 digits = 4;
3544 message = "truncated \\uXXXX escape";
3545 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546
Benjamin Peterson29060642009-01-31 22:14:21 +00003547 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003548 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003549 digits = 8;
3550 message = "truncated \\UXXXXXXXX escape";
3551 hexescape:
3552 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 outpos = p-PyUnicode_AS_UNICODE(v);
3554 if (s+digits>end) {
3555 endinpos = size;
3556 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 errors, &errorHandler,
3558 "unicodeescape", "end of string in escape sequence",
3559 &starts, &end, &startinpos, &endinpos, &exc, &s,
3560 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 goto onError;
3562 goto nextByte;
3563 }
3564 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003565 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003566 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 endinpos = (s+i+1)-starts;
3568 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 errors, &errorHandler,
3570 "unicodeescape", message,
3571 &starts, &end, &startinpos, &endinpos, &exc, &s,
3572 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003573 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003575 }
3576 chr = (chr<<4) & ~0xF;
3577 if (c >= '0' && c <= '9')
3578 chr += c - '0';
3579 else if (c >= 'a' && c <= 'f')
3580 chr += 10 + c - 'a';
3581 else
3582 chr += 10 + c - 'A';
3583 }
3584 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003585 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003586 /* _decoding_error will have already written into the
3587 target buffer. */
3588 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003589 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003590 /* when we get here, chr is a 32-bit unicode character */
3591 if (chr <= 0xffff)
3592 /* UCS-2 character */
3593 *p++ = (Py_UNICODE) chr;
3594 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003595 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003596 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003597#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003598 *p++ = chr;
3599#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003600 chr -= 0x10000L;
3601 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003602 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003603#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003604 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605 endinpos = s-starts;
3606 outpos = p-PyUnicode_AS_UNICODE(v);
3607 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003608 errors, &errorHandler,
3609 "unicodeescape", "illegal Unicode character",
3610 &starts, &end, &startinpos, &endinpos, &exc, &s,
3611 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003612 goto onError;
3613 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003614 break;
3615
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003617 case 'N':
3618 message = "malformed \\N character escape";
3619 if (ucnhash_CAPI == NULL) {
3620 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003621 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003622 if (ucnhash_CAPI == NULL)
3623 goto ucnhashError;
3624 }
3625 if (*s == '{') {
3626 const char *start = s+1;
3627 /* look for the closing brace */
3628 while (*s != '}' && s < end)
3629 s++;
3630 if (s > start && s < end && *s == '}') {
3631 /* found a name. look it up in the unicode database */
3632 message = "unknown Unicode character name";
3633 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003634 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003635 goto store;
3636 }
3637 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 endinpos = s-starts;
3639 outpos = p-PyUnicode_AS_UNICODE(v);
3640 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 errors, &errorHandler,
3642 "unicodeescape", message,
3643 &starts, &end, &startinpos, &endinpos, &exc, &s,
3644 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003645 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003646 break;
3647
3648 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003649 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 message = "\\ at end of string";
3651 s--;
3652 endinpos = s-starts;
3653 outpos = p-PyUnicode_AS_UNICODE(v);
3654 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 errors, &errorHandler,
3656 "unicodeescape", message,
3657 &starts, &end, &startinpos, &endinpos, &exc, &s,
3658 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003659 goto onError;
3660 }
3661 else {
3662 *p++ = '\\';
3663 *p++ = (unsigned char)s[-1];
3664 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003665 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003669 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003670 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003672 Py_XDECREF(errorHandler);
3673 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003675
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003677 PyErr_SetString(
3678 PyExc_UnicodeError,
3679 "\\N escapes not supported (can't load unicodedata module)"
3680 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003681 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 Py_XDECREF(errorHandler);
3683 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003684 return NULL;
3685
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 Py_XDECREF(errorHandler);
3689 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 return NULL;
3691}
3692
3693/* Return a Unicode-Escape string version of the Unicode object.
3694
3695 If quotes is true, the string is enclosed in u"" or u'' quotes as
3696 appropriate.
3697
3698*/
3699
Thomas Wouters477c8d52006-05-27 19:21:47 +00003700Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 Py_ssize_t size,
3702 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003703{
3704 /* like wcschr, but doesn't stop at NULL characters */
3705
3706 while (size-- > 0) {
3707 if (*s == ch)
3708 return s;
3709 s++;
3710 }
3711
3712 return NULL;
3713}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003714
/* Lowercase hexadecimal digit characters: hexdigits[n] is the digit for a
   nibble value n in 0..15.  (Presumably consumed by the escape encoders
   that follow -- their bodies are not fully visible here.) */
Walter Dörwald79e913e2007-05-12 11:08:06 +00003715static const char *hexdigits = "0123456789abcdef";
3716
3717PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003718 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003720 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003723#ifdef Py_UNICODE_WIDE
3724 const Py_ssize_t expandsize = 10;
3725#else
3726 const Py_ssize_t expandsize = 6;
3727#endif
3728
Thomas Wouters89f507f2006-12-13 04:49:30 +00003729 /* XXX(nnorwitz): rather than over-allocating, it would be
3730 better to choose a different scheme. Perhaps scan the
3731 first N-chars of the string and allocate based on that size.
3732 */
3733 /* Initial allocation is based on the longest-possible unichr
3734 escape.
3735
3736 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3737 unichr, so in this case it's the longest unichr escape. In
3738 narrow (UTF-16) builds this is five chars per source unichr
3739 since there are two unichrs in the surrogate pair, so in narrow
3740 (UTF-16) builds it's not the longest unichr escape.
3741
3742 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3743 so in the narrow (UTF-16) build case it's the longest unichr
3744 escape.
3745 */
3746
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003747 if (size == 0)
3748 return PyBytes_FromStringAndSize(NULL, 0);
3749
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003750 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003751 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003752
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003753 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 2
3755 + expandsize*size
3756 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 if (repr == NULL)
3758 return NULL;
3759
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003760 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 while (size-- > 0) {
3763 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003764
Walter Dörwald79e913e2007-05-12 11:08:06 +00003765 /* Escape backslashes */
3766 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 *p++ = '\\';
3768 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003769 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003770 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003771
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003772#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003773 /* Map 21-bit characters to '\U00xxxxxx' */
3774 else if (ch >= 0x10000) {
3775 *p++ = '\\';
3776 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003777 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3778 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3779 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3780 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3781 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3782 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3783 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3784 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003785 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003786 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003787#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003788 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3789 else if (ch >= 0xD800 && ch < 0xDC00) {
3790 Py_UNICODE ch2;
3791 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003792
Benjamin Peterson29060642009-01-31 22:14:21 +00003793 ch2 = *s++;
3794 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003795 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003796 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3797 *p++ = '\\';
3798 *p++ = 'U';
3799 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3800 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3801 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3802 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3803 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3804 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3805 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3806 *p++ = hexdigits[ucs & 0x0000000F];
3807 continue;
3808 }
3809 /* Fall through: isolated surrogates are copied as-is */
3810 s--;
3811 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003812 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003813#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003816 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 *p++ = '\\';
3818 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003819 *p++ = hexdigits[(ch >> 12) & 0x000F];
3820 *p++ = hexdigits[(ch >> 8) & 0x000F];
3821 *p++ = hexdigits[(ch >> 4) & 0x000F];
3822 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003824
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003825 /* Map special whitespace to '\t', \n', '\r' */
3826 else if (ch == '\t') {
3827 *p++ = '\\';
3828 *p++ = 't';
3829 }
3830 else if (ch == '\n') {
3831 *p++ = '\\';
3832 *p++ = 'n';
3833 }
3834 else if (ch == '\r') {
3835 *p++ = '\\';
3836 *p++ = 'r';
3837 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003838
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003839 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003840 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003842 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003843 *p++ = hexdigits[(ch >> 4) & 0x000F];
3844 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003845 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003846
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 /* Copy everything else as-is */
3848 else
3849 *p++ = (char) ch;
3850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003852 assert(p - PyBytes_AS_STRING(repr) > 0);
3853 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3854 return NULL;
3855 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856}
3857
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003858PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003860 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 if (!PyUnicode_Check(unicode)) {
3862 PyErr_BadArgument();
3863 return NULL;
3864 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003865 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3866 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003867 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868}
3869
3870/* --- Raw Unicode Escape Codec ------------------------------------------- */
3871
3872PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003873 Py_ssize_t size,
3874 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003877 Py_ssize_t startinpos;
3878 Py_ssize_t endinpos;
3879 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 const char *end;
3883 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003884 PyObject *errorHandler = NULL;
3885 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003886
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 /* Escaped strings will always be longer than the resulting
3888 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003889 length after conversion to the true value. (But decoding error
3890 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 v = _PyUnicode_New(size);
3892 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003895 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897 end = s + size;
3898 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 unsigned char c;
3900 Py_UCS4 x;
3901 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003902 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903
Benjamin Peterson29060642009-01-31 22:14:21 +00003904 /* Non-escape characters are interpreted as Unicode ordinals */
3905 if (*s != '\\') {
3906 *p++ = (unsigned char)*s++;
3907 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003908 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003909 startinpos = s-starts;
3910
3911 /* \u-escapes are only interpreted iff the number of leading
3912 backslashes if odd */
3913 bs = s;
3914 for (;s < end;) {
3915 if (*s != '\\')
3916 break;
3917 *p++ = (unsigned char)*s++;
3918 }
3919 if (((s - bs) & 1) == 0 ||
3920 s >= end ||
3921 (*s != 'u' && *s != 'U')) {
3922 continue;
3923 }
3924 p--;
3925 count = *s=='u' ? 4 : 8;
3926 s++;
3927
3928 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3929 outpos = p-PyUnicode_AS_UNICODE(v);
3930 for (x = 0, i = 0; i < count; ++i, ++s) {
3931 c = (unsigned char)*s;
3932 if (!ISXDIGIT(c)) {
3933 endinpos = s-starts;
3934 if (unicode_decode_call_errorhandler(
3935 errors, &errorHandler,
3936 "rawunicodeescape", "truncated \\uXXXX",
3937 &starts, &end, &startinpos, &endinpos, &exc, &s,
3938 &v, &outpos, &p))
3939 goto onError;
3940 goto nextByte;
3941 }
3942 x = (x<<4) & ~0xF;
3943 if (c >= '0' && c <= '9')
3944 x += c - '0';
3945 else if (c >= 'a' && c <= 'f')
3946 x += 10 + c - 'a';
3947 else
3948 x += 10 + c - 'A';
3949 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003950 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003951 /* UCS-2 character */
3952 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003953 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003954 /* UCS-4 character. Either store directly, or as
3955 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003956#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003957 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003958#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003959 x -= 0x10000L;
3960 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3961 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003962#endif
3963 } else {
3964 endinpos = s-starts;
3965 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003966 if (unicode_decode_call_errorhandler(
3967 errors, &errorHandler,
3968 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 &starts, &end, &startinpos, &endinpos, &exc, &s,
3970 &v, &outpos, &p))
3971 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003972 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003973 nextByte:
3974 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003976 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978 Py_XDECREF(errorHandler);
3979 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003981
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 Py_XDECREF(errorHandler);
3985 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 return NULL;
3987}
3988
3989PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003992 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 char *p;
3994 char *q;
3995
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003996#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003997 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003998#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003999 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004001
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004002 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004004
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004005 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 if (repr == NULL)
4007 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004008 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004009 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004011 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 while (size-- > 0) {
4013 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004014#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 /* Map 32-bit characters to '\Uxxxxxxxx' */
4016 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004017 *p++ = '\\';
4018 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004019 *p++ = hexdigits[(ch >> 28) & 0xf];
4020 *p++ = hexdigits[(ch >> 24) & 0xf];
4021 *p++ = hexdigits[(ch >> 20) & 0xf];
4022 *p++ = hexdigits[(ch >> 16) & 0xf];
4023 *p++ = hexdigits[(ch >> 12) & 0xf];
4024 *p++ = hexdigits[(ch >> 8) & 0xf];
4025 *p++ = hexdigits[(ch >> 4) & 0xf];
4026 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004027 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004028 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004029#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004030 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4031 if (ch >= 0xD800 && ch < 0xDC00) {
4032 Py_UNICODE ch2;
4033 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004034
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 ch2 = *s++;
4036 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004037 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4039 *p++ = '\\';
4040 *p++ = 'U';
4041 *p++ = hexdigits[(ucs >> 28) & 0xf];
4042 *p++ = hexdigits[(ucs >> 24) & 0xf];
4043 *p++ = hexdigits[(ucs >> 20) & 0xf];
4044 *p++ = hexdigits[(ucs >> 16) & 0xf];
4045 *p++ = hexdigits[(ucs >> 12) & 0xf];
4046 *p++ = hexdigits[(ucs >> 8) & 0xf];
4047 *p++ = hexdigits[(ucs >> 4) & 0xf];
4048 *p++ = hexdigits[ucs & 0xf];
4049 continue;
4050 }
4051 /* Fall through: isolated surrogates are copied as-is */
4052 s--;
4053 size++;
4054 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004055#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004056 /* Map 16-bit characters to '\uxxxx' */
4057 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 *p++ = '\\';
4059 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004060 *p++ = hexdigits[(ch >> 12) & 0xf];
4061 *p++ = hexdigits[(ch >> 8) & 0xf];
4062 *p++ = hexdigits[(ch >> 4) & 0xf];
4063 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 /* Copy everything else as-is */
4066 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 *p++ = (char) ch;
4068 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004069 size = p - q;
4070
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004071 assert(size > 0);
4072 if (_PyBytes_Resize(&repr, size) < 0)
4073 return NULL;
4074 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075}
4076
4077PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4078{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004079 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004081 PyErr_BadArgument();
4082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004084 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4085 PyUnicode_GET_SIZE(unicode));
4086
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004087 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088}
4089
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004090/* --- Unicode Internal Codec ------------------------------------------- */
4091
4092PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004093 Py_ssize_t size,
4094 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004095{
4096 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004097 Py_ssize_t startinpos;
4098 Py_ssize_t endinpos;
4099 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004100 PyUnicodeObject *v;
4101 Py_UNICODE *p;
4102 const char *end;
4103 const char *reason;
4104 PyObject *errorHandler = NULL;
4105 PyObject *exc = NULL;
4106
Neal Norwitzd43069c2006-01-08 01:12:10 +00004107#ifdef Py_UNICODE_WIDE
4108 Py_UNICODE unimax = PyUnicode_GetMax();
4109#endif
4110
Thomas Wouters89f507f2006-12-13 04:49:30 +00004111 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004112 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4113 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004115 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004117 p = PyUnicode_AS_UNICODE(v);
4118 end = s + size;
4119
4120 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004121 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004122 /* We have to sanity check the raw data, otherwise doom looms for
4123 some malformed UCS-4 data. */
4124 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004125#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004126 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004127#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004128 end-s < Py_UNICODE_SIZE
4129 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004131 startinpos = s - starts;
4132 if (end-s < Py_UNICODE_SIZE) {
4133 endinpos = end-starts;
4134 reason = "truncated input";
4135 }
4136 else {
4137 endinpos = s - starts + Py_UNICODE_SIZE;
4138 reason = "illegal code point (> 0x10FFFF)";
4139 }
4140 outpos = p - PyUnicode_AS_UNICODE(v);
4141 if (unicode_decode_call_errorhandler(
4142 errors, &errorHandler,
4143 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004144 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004145 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004146 goto onError;
4147 }
4148 }
4149 else {
4150 p++;
4151 s += Py_UNICODE_SIZE;
4152 }
4153 }
4154
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004155 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004156 goto onError;
4157 Py_XDECREF(errorHandler);
4158 Py_XDECREF(exc);
4159 return (PyObject *)v;
4160
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004162 Py_XDECREF(v);
4163 Py_XDECREF(errorHandler);
4164 Py_XDECREF(exc);
4165 return NULL;
4166}
4167
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168/* --- Latin-1 Codec ------------------------------------------------------ */
4169
4170PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004171 Py_ssize_t size,
4172 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173{
4174 PyUnicodeObject *v;
4175 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004176 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004177
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004179 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 Py_UNICODE r = *(unsigned char*)s;
4181 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004182 }
4183
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184 v = _PyUnicode_New(size);
4185 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004190 e = s + size;
4191 /* Unrolling the copy makes it much faster by reducing the looping
4192 overhead. This is similar to what many memcpy() implementations do. */
4193 unrolled_end = e - 4;
4194 while (s < unrolled_end) {
4195 p[0] = (unsigned char) s[0];
4196 p[1] = (unsigned char) s[1];
4197 p[2] = (unsigned char) s[2];
4198 p[3] = (unsigned char) s[3];
4199 s += 4;
4200 p += 4;
4201 }
4202 while (s < e)
4203 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004205
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 Py_XDECREF(v);
4208 return NULL;
4209}
4210
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211/* create or adjust a UnicodeEncodeError */
4212static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 const char *encoding,
4214 const Py_UNICODE *unicode, Py_ssize_t size,
4215 Py_ssize_t startpos, Py_ssize_t endpos,
4216 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004219 *exceptionObject = PyUnicodeEncodeError_Create(
4220 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004221 }
4222 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004223 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4224 goto onError;
4225 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4226 goto onError;
4227 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4228 goto onError;
4229 return;
4230 onError:
4231 Py_DECREF(*exceptionObject);
4232 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 }
4234}
4235
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236/* raises a UnicodeEncodeError */
4237static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 const char *encoding,
4239 const Py_UNICODE *unicode, Py_ssize_t size,
4240 Py_ssize_t startpos, Py_ssize_t endpos,
4241 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242{
4243 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247}
4248
4249/* error handling callback helper:
4250 build arguments, call the callback and check the arguments,
4251 put the result into newpos and return the replacement string, which
4252 has to be freed by the caller */
4253static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 PyObject **errorHandler,
4255 const char *encoding, const char *reason,
4256 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4257 Py_ssize_t startpos, Py_ssize_t endpos,
4258 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004260 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261
4262 PyObject *restuple;
4263 PyObject *resunicode;
4264
4265 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 }
4270
4271 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004272 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004273 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275
4276 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004279 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004281 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 Py_DECREF(restuple);
4283 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004285 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 &resunicode, newpos)) {
4287 Py_DECREF(restuple);
4288 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004290 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4291 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4292 Py_DECREF(restuple);
4293 return NULL;
4294 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004297 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004298 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4299 Py_DECREF(restuple);
4300 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004301 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 Py_INCREF(resunicode);
4303 Py_DECREF(restuple);
4304 return resunicode;
4305}
4306
4307static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 Py_ssize_t size,
4309 const char *errors,
4310 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004311{
4312 /* output object */
4313 PyObject *res;
4314 /* pointers to the beginning and end+1 of input */
4315 const Py_UNICODE *startp = p;
4316 const Py_UNICODE *endp = p + size;
4317 /* pointer to the beginning of the unencodable characters */
4318 /* const Py_UNICODE *badp = NULL; */
4319 /* pointer into the output */
4320 char *str;
4321 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004322 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004323 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4324 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004325 PyObject *errorHandler = NULL;
4326 PyObject *exc = NULL;
4327 /* the following variable is used for caching string comparisons
4328 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4329 int known_errorHandler = -1;
4330
4331 /* allocate enough for a simple encoding without
4332 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004333 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004334 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004335 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004337 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004338 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339 ressize = size;
4340
4341 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004343
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 /* can we encode this? */
4345 if (c<limit) {
4346 /* no overflow check, because we know that the space is enough */
4347 *str++ = (char)c;
4348 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004350 else {
4351 Py_ssize_t unicodepos = p-startp;
4352 Py_ssize_t requiredsize;
4353 PyObject *repunicode;
4354 Py_ssize_t repsize;
4355 Py_ssize_t newpos;
4356 Py_ssize_t respos;
4357 Py_UNICODE *uni2;
4358 /* startpos for collecting unencodable chars */
4359 const Py_UNICODE *collstart = p;
4360 const Py_UNICODE *collend = p;
4361 /* find all unecodable characters */
4362 while ((collend < endp) && ((*collend)>=limit))
4363 ++collend;
4364 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4365 if (known_errorHandler==-1) {
4366 if ((errors==NULL) || (!strcmp(errors, "strict")))
4367 known_errorHandler = 1;
4368 else if (!strcmp(errors, "replace"))
4369 known_errorHandler = 2;
4370 else if (!strcmp(errors, "ignore"))
4371 known_errorHandler = 3;
4372 else if (!strcmp(errors, "xmlcharrefreplace"))
4373 known_errorHandler = 4;
4374 else
4375 known_errorHandler = 0;
4376 }
4377 switch (known_errorHandler) {
4378 case 1: /* strict */
4379 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4380 goto onError;
4381 case 2: /* replace */
4382 while (collstart++<collend)
4383 *str++ = '?'; /* fall through */
4384 case 3: /* ignore */
4385 p = collend;
4386 break;
4387 case 4: /* xmlcharrefreplace */
4388 respos = str - PyBytes_AS_STRING(res);
4389 /* determine replacement size (temporarily (mis)uses p) */
4390 for (p = collstart, repsize = 0; p < collend; ++p) {
4391 if (*p<10)
4392 repsize += 2+1+1;
4393 else if (*p<100)
4394 repsize += 2+2+1;
4395 else if (*p<1000)
4396 repsize += 2+3+1;
4397 else if (*p<10000)
4398 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004399#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 else
4401 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004402#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 else if (*p<100000)
4404 repsize += 2+5+1;
4405 else if (*p<1000000)
4406 repsize += 2+6+1;
4407 else
4408 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004409#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 }
4411 requiredsize = respos+repsize+(endp-collend);
4412 if (requiredsize > ressize) {
4413 if (requiredsize<2*ressize)
4414 requiredsize = 2*ressize;
4415 if (_PyBytes_Resize(&res, requiredsize))
4416 goto onError;
4417 str = PyBytes_AS_STRING(res) + respos;
4418 ressize = requiredsize;
4419 }
4420 /* generate replacement (temporarily (mis)uses p) */
4421 for (p = collstart; p < collend; ++p) {
4422 str += sprintf(str, "&#%d;", (int)*p);
4423 }
4424 p = collend;
4425 break;
4426 default:
4427 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4428 encoding, reason, startp, size, &exc,
4429 collstart-startp, collend-startp, &newpos);
4430 if (repunicode == NULL)
4431 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004432 if (PyBytes_Check(repunicode)) {
4433 /* Directly copy bytes result to output. */
4434 repsize = PyBytes_Size(repunicode);
4435 if (repsize > 1) {
4436 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004437 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004438 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4439 Py_DECREF(repunicode);
4440 goto onError;
4441 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004442 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004443 ressize += repsize-1;
4444 }
4445 memcpy(str, PyBytes_AsString(repunicode), repsize);
4446 str += repsize;
4447 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004448 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004449 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004450 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 /* need more space? (at least enough for what we
4452 have+the replacement+the rest of the string, so
4453 we won't have to check space for encodable characters) */
4454 respos = str - PyBytes_AS_STRING(res);
4455 repsize = PyUnicode_GET_SIZE(repunicode);
4456 requiredsize = respos+repsize+(endp-collend);
4457 if (requiredsize > ressize) {
4458 if (requiredsize<2*ressize)
4459 requiredsize = 2*ressize;
4460 if (_PyBytes_Resize(&res, requiredsize)) {
4461 Py_DECREF(repunicode);
4462 goto onError;
4463 }
4464 str = PyBytes_AS_STRING(res) + respos;
4465 ressize = requiredsize;
4466 }
4467 /* check if there is anything unencodable in the replacement
4468 and copy it to the output */
4469 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4470 c = *uni2;
4471 if (c >= limit) {
4472 raise_encode_exception(&exc, encoding, startp, size,
4473 unicodepos, unicodepos+1, reason);
4474 Py_DECREF(repunicode);
4475 goto onError;
4476 }
4477 *str = (char)c;
4478 }
4479 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004480 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004482 }
4483 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004484 /* Resize if we allocated to much */
4485 size = str - PyBytes_AS_STRING(res);
4486 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004487 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004488 if (_PyBytes_Resize(&res, size) < 0)
4489 goto onError;
4490 }
4491
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 Py_XDECREF(errorHandler);
4493 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004494 return res;
4495
4496 onError:
4497 Py_XDECREF(res);
4498 Py_XDECREF(errorHandler);
4499 Py_XDECREF(exc);
4500 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501}
4502
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 Py_ssize_t size,
4505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508}
4509
4510PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4511{
4512 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 PyErr_BadArgument();
4514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 }
4516 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 PyUnicode_GET_SIZE(unicode),
4518 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519}
4520
4521/* --- 7-bit ASCII Codec -------------------------------------------------- */
4522
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 Py_ssize_t size,
4525 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 PyUnicodeObject *v;
4529 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004530 Py_ssize_t startinpos;
4531 Py_ssize_t endinpos;
4532 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533 const char *e;
4534 PyObject *errorHandler = NULL;
4535 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004536
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004538 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 Py_UNICODE r = *(unsigned char*)s;
4540 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004541 }
Tim Petersced69f82003-09-16 20:30:58 +00004542
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 v = _PyUnicode_New(size);
4544 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 e = s + size;
4550 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 register unsigned char c = (unsigned char)*s;
4552 if (c < 128) {
4553 *p++ = c;
4554 ++s;
4555 }
4556 else {
4557 startinpos = s-starts;
4558 endinpos = startinpos + 1;
4559 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4560 if (unicode_decode_call_errorhandler(
4561 errors, &errorHandler,
4562 "ascii", "ordinal not in range(128)",
4563 &starts, &e, &startinpos, &endinpos, &exc, &s,
4564 &v, &outpos, &p))
4565 goto onError;
4566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004568 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4570 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 Py_XDECREF(errorHandler);
4572 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004574
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 Py_XDECREF(errorHandler);
4578 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 return NULL;
4580}
4581
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 Py_ssize_t size,
4584 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587}
4588
4589PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4590{
4591 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 PyErr_BadArgument();
4593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594 }
4595 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 PyUnicode_GET_SIZE(unicode),
4597 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598}
4599
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004600#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004601
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004602/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004603
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004604#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004605#define NEED_RETRY
4606#endif
4607
4608/* XXX This code is limited to "true" double-byte encodings, as
4609 a) it assumes an incomplete character consists of a single byte, and
4610 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004612
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte
   in the context of the string `s`, 0 otherwise.

   IsDBCSLeadByte() alone only looks at the byte value, so when it says
   "yes" the preceding character (as located by CharPrev()) is inspected
   as well. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() gave back curr itself: nothing precedes this byte
       (presumably curr is the start of the string). */
    if (prev == curr)
        return 1;
    /* The preceding character does not begin with a lead byte. */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* The preceding character occupies two bytes. */
    return curr - prev == 2;
}
4623
4624/*
4625 * Decode MBCS string into unicode object. If 'final' is set, converts
4626 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4627 */
4628static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 const char *s, /* MBCS string */
4630 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004631 int final,
4632 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004633{
4634 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004635 Py_ssize_t n;
4636 DWORD usize;
4637 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004638
4639 assert(size >= 0);
4640
Victor Stinner554f3f02010-06-16 23:33:54 +00004641 /* check and handle 'errors' arg */
4642 if (errors==NULL || strcmp(errors, "strict")==0)
4643 flags = MB_ERR_INVALID_CHARS;
4644 else if (strcmp(errors, "ignore")==0)
4645 flags = 0;
4646 else {
4647 PyErr_Format(PyExc_ValueError,
4648 "mbcs encoding does not support errors='%s'",
4649 errors);
4650 return -1;
4651 }
4652
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004653 /* Skip trailing lead-byte unless 'final' is set */
4654 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004656
4657 /* First get the size of the result */
4658 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004659 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4660 if (usize==0)
4661 goto mbcs_decode_error;
4662 } else
4663 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004664
4665 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 /* Create unicode object */
4667 *v = _PyUnicode_New(usize);
4668 if (*v == NULL)
4669 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004670 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004671 }
4672 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 /* Extend unicode object */
4674 n = PyUnicode_GET_SIZE(*v);
4675 if (_PyUnicode_Resize(v, n + usize) < 0)
4676 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004677 }
4678
4679 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004680 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004682 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4683 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004685 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004686 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004687
4688mbcs_decode_error:
4689 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4690 we raise a UnicodeDecodeError - else it is a 'generic'
4691 windows error
4692 */
4693 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4694 /* Ideally, we should get reason from FormatMessage - this
4695 is the Windows 2000 English version of the message
4696 */
4697 PyObject *exc = NULL;
4698 const char *reason = "No mapping for the Unicode character exists "
4699 "in the target multi-byte code page.";
4700 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4701 if (exc != NULL) {
4702 PyCodec_StrictErrors(exc);
4703 Py_DECREF(exc);
4704 }
4705 } else {
4706 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4707 }
4708 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004709}
4710
4711PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 Py_ssize_t size,
4713 const char *errors,
4714 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004715{
4716 PyUnicodeObject *v = NULL;
4717 int done;
4718
4719 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004721
4722#ifdef NEED_RETRY
4723 retry:
4724 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004725 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004726 else
4727#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004728 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004729
4730 if (done < 0) {
4731 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004733 }
4734
4735 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004737
4738#ifdef NEED_RETRY
4739 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 s += done;
4741 size -= done;
4742 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004743 }
4744#endif
4745
4746 return (PyObject *)v;
4747}
4748
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004749PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 Py_ssize_t size,
4751 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004752{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004753 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4754}
4755
4756/*
4757 * Convert unicode into string object (MBCS).
4758 * Returns 0 if succeed, -1 otherwise.
4759 */
4760static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004761 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004762 int size, /* size of unicode */
4763 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004764{
Victor Stinner554f3f02010-06-16 23:33:54 +00004765 BOOL usedDefaultChar = FALSE;
4766 BOOL *pusedDefaultChar;
4767 int mbcssize;
4768 Py_ssize_t n;
4769 PyObject *exc = NULL;
4770 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004771
4772 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004773
Victor Stinner554f3f02010-06-16 23:33:54 +00004774 /* check and handle 'errors' arg */
4775 if (errors==NULL || strcmp(errors, "strict")==0) {
4776 flags = WC_NO_BEST_FIT_CHARS;
4777 pusedDefaultChar = &usedDefaultChar;
4778 } else if (strcmp(errors, "replace")==0) {
4779 flags = 0;
4780 pusedDefaultChar = NULL;
4781 } else {
4782 PyErr_Format(PyExc_ValueError,
4783 "mbcs encoding does not support errors='%s'",
4784 errors);
4785 return -1;
4786 }
4787
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004788 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004789 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004790 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4791 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 if (mbcssize == 0) {
4793 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4794 return -1;
4795 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004796 /* If we used a default char, then we failed! */
4797 if (pusedDefaultChar && *pusedDefaultChar)
4798 goto mbcs_encode_error;
4799 } else {
4800 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004801 }
4802
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004803 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 /* Create string object */
4805 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4806 if (*repr == NULL)
4807 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004808 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004809 }
4810 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 /* Extend string object */
4812 n = PyBytes_Size(*repr);
4813 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4814 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004815 }
4816
4817 /* Do the conversion */
4818 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004820 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4821 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4823 return -1;
4824 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004825 if (pusedDefaultChar && *pusedDefaultChar)
4826 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004827 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004828 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004829
4830mbcs_encode_error:
4831 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4832 Py_XDECREF(exc);
4833 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004834}
4835
4836PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004837 Py_ssize_t size,
4838 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004839{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004840 PyObject *repr = NULL;
4841 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004842
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004843#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004845 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004846 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004847 else
4848#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004849 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004850
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004851 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 Py_XDECREF(repr);
4853 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004854 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004855
4856#ifdef NEED_RETRY
4857 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 p += INT_MAX;
4859 size -= INT_MAX;
4860 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004861 }
4862#endif
4863
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004864 return repr;
4865}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004866
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004867PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4868{
4869 if (!PyUnicode_Check(unicode)) {
4870 PyErr_BadArgument();
4871 return NULL;
4872 }
4873 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004874 PyUnicode_GET_SIZE(unicode),
4875 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004876}
4877
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004878#undef NEED_RETRY
4879
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004880#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004881
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882/* --- Character Mapping Codec -------------------------------------------- */
4883
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 Py_ssize_t size,
4886 PyObject *mapping,
4887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004890 Py_ssize_t startinpos;
4891 Py_ssize_t endinpos;
4892 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004893 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 PyUnicodeObject *v;
4895 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004896 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 PyObject *errorHandler = NULL;
4898 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004899 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004900 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004901
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 /* Default to Latin-1 */
4903 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905
4906 v = _PyUnicode_New(size);
4907 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004912 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004913 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 mapstring = PyUnicode_AS_UNICODE(mapping);
4915 maplen = PyUnicode_GET_SIZE(mapping);
4916 while (s < e) {
4917 unsigned char ch = *s;
4918 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 if (ch < maplen)
4921 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 if (x == 0xfffe) {
4924 /* undefined mapping */
4925 outpos = p-PyUnicode_AS_UNICODE(v);
4926 startinpos = s-starts;
4927 endinpos = startinpos+1;
4928 if (unicode_decode_call_errorhandler(
4929 errors, &errorHandler,
4930 "charmap", "character maps to <undefined>",
4931 &starts, &e, &startinpos, &endinpos, &exc, &s,
4932 &v, &outpos, &p)) {
4933 goto onError;
4934 }
4935 continue;
4936 }
4937 *p++ = x;
4938 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004939 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004940 }
4941 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 while (s < e) {
4943 unsigned char ch = *s;
4944 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004945
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4947 w = PyLong_FromLong((long)ch);
4948 if (w == NULL)
4949 goto onError;
4950 x = PyObject_GetItem(mapping, w);
4951 Py_DECREF(w);
4952 if (x == NULL) {
4953 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4954 /* No mapping found means: mapping is undefined. */
4955 PyErr_Clear();
4956 x = Py_None;
4957 Py_INCREF(x);
4958 } else
4959 goto onError;
4960 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004961
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 /* Apply mapping */
4963 if (PyLong_Check(x)) {
4964 long value = PyLong_AS_LONG(x);
4965 if (value < 0 || value > 65535) {
4966 PyErr_SetString(PyExc_TypeError,
4967 "character mapping must be in range(65536)");
4968 Py_DECREF(x);
4969 goto onError;
4970 }
4971 *p++ = (Py_UNICODE)value;
4972 }
4973 else if (x == Py_None) {
4974 /* undefined mapping */
4975 outpos = p-PyUnicode_AS_UNICODE(v);
4976 startinpos = s-starts;
4977 endinpos = startinpos+1;
4978 if (unicode_decode_call_errorhandler(
4979 errors, &errorHandler,
4980 "charmap", "character maps to <undefined>",
4981 &starts, &e, &startinpos, &endinpos, &exc, &s,
4982 &v, &outpos, &p)) {
4983 Py_DECREF(x);
4984 goto onError;
4985 }
4986 Py_DECREF(x);
4987 continue;
4988 }
4989 else if (PyUnicode_Check(x)) {
4990 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004991
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 if (targetsize == 1)
4993 /* 1-1 mapping */
4994 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004995
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 else if (targetsize > 1) {
4997 /* 1-n mapping */
4998 if (targetsize > extrachars) {
4999 /* resize first */
5000 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5001 Py_ssize_t needed = (targetsize - extrachars) + \
5002 (targetsize << 2);
5003 extrachars += needed;
5004 /* XXX overflow detection missing */
5005 if (_PyUnicode_Resize(&v,
5006 PyUnicode_GET_SIZE(v) + needed) < 0) {
5007 Py_DECREF(x);
5008 goto onError;
5009 }
5010 p = PyUnicode_AS_UNICODE(v) + oldpos;
5011 }
5012 Py_UNICODE_COPY(p,
5013 PyUnicode_AS_UNICODE(x),
5014 targetsize);
5015 p += targetsize;
5016 extrachars -= targetsize;
5017 }
5018 /* 1-0 mapping: skip the character */
5019 }
5020 else {
5021 /* wrong return value */
5022 PyErr_SetString(PyExc_TypeError,
5023 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005024 Py_DECREF(x);
5025 goto onError;
5026 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 Py_DECREF(x);
5028 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 }
5031 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5033 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005034 Py_XDECREF(errorHandler);
5035 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005037
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005039 Py_XDECREF(errorHandler);
5040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 Py_XDECREF(v);
5042 return NULL;
5043}
5044
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005045/* Charmap encoding: the lookup table */
5046
5047struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 PyObject_HEAD
5049 unsigned char level1[32];
5050 int count2, count3;
5051 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005052};
5053
5054static PyObject*
5055encoding_map_size(PyObject *obj, PyObject* args)
5056{
5057 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005058 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005060}
5061
5062static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005063 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 PyDoc_STR("Return the size (in bytes) of this object") },
5065 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005066};
5067
5068static void
5069encoding_map_dealloc(PyObject* o)
5070{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005071 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005072}
5073
5074static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005075 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 "EncodingMap", /*tp_name*/
5077 sizeof(struct encoding_map), /*tp_basicsize*/
5078 0, /*tp_itemsize*/
5079 /* methods */
5080 encoding_map_dealloc, /*tp_dealloc*/
5081 0, /*tp_print*/
5082 0, /*tp_getattr*/
5083 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005084 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 0, /*tp_repr*/
5086 0, /*tp_as_number*/
5087 0, /*tp_as_sequence*/
5088 0, /*tp_as_mapping*/
5089 0, /*tp_hash*/
5090 0, /*tp_call*/
5091 0, /*tp_str*/
5092 0, /*tp_getattro*/
5093 0, /*tp_setattro*/
5094 0, /*tp_as_buffer*/
5095 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5096 0, /*tp_doc*/
5097 0, /*tp_traverse*/
5098 0, /*tp_clear*/
5099 0, /*tp_richcompare*/
5100 0, /*tp_weaklistoffset*/
5101 0, /*tp_iter*/
5102 0, /*tp_iternext*/
5103 encoding_map_methods, /*tp_methods*/
5104 0, /*tp_members*/
5105 0, /*tp_getset*/
5106 0, /*tp_base*/
5107 0, /*tp_dict*/
5108 0, /*tp_descr_get*/
5109 0, /*tp_descr_set*/
5110 0, /*tp_dictoffset*/
5111 0, /*tp_init*/
5112 0, /*tp_alloc*/
5113 0, /*tp_new*/
5114 0, /*tp_free*/
5115 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005116};
5117
5118PyObject*
5119PyUnicode_BuildEncodingMap(PyObject* string)
5120{
5121 Py_UNICODE *decode;
5122 PyObject *result;
5123 struct encoding_map *mresult;
5124 int i;
5125 int need_dict = 0;
5126 unsigned char level1[32];
5127 unsigned char level2[512];
5128 unsigned char *mlevel1, *mlevel2, *mlevel3;
5129 int count2 = 0, count3 = 0;
5130
5131 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5132 PyErr_BadArgument();
5133 return NULL;
5134 }
5135 decode = PyUnicode_AS_UNICODE(string);
5136 memset(level1, 0xFF, sizeof level1);
5137 memset(level2, 0xFF, sizeof level2);
5138
5139 /* If there isn't a one-to-one mapping of NULL to \0,
5140 or if there are non-BMP characters, we need to use
5141 a mapping dictionary. */
5142 if (decode[0] != 0)
5143 need_dict = 1;
5144 for (i = 1; i < 256; i++) {
5145 int l1, l2;
5146 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005147#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005148 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005149#endif
5150 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005151 need_dict = 1;
5152 break;
5153 }
5154 if (decode[i] == 0xFFFE)
5155 /* unmapped character */
5156 continue;
5157 l1 = decode[i] >> 11;
5158 l2 = decode[i] >> 7;
5159 if (level1[l1] == 0xFF)
5160 level1[l1] = count2++;
5161 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005162 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005163 }
5164
5165 if (count2 >= 0xFF || count3 >= 0xFF)
5166 need_dict = 1;
5167
5168 if (need_dict) {
5169 PyObject *result = PyDict_New();
5170 PyObject *key, *value;
5171 if (!result)
5172 return NULL;
5173 for (i = 0; i < 256; i++) {
5174 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005175 key = PyLong_FromLong(decode[i]);
5176 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005177 if (!key || !value)
5178 goto failed1;
5179 if (PyDict_SetItem(result, key, value) == -1)
5180 goto failed1;
5181 Py_DECREF(key);
5182 Py_DECREF(value);
5183 }
5184 return result;
5185 failed1:
5186 Py_XDECREF(key);
5187 Py_XDECREF(value);
5188 Py_DECREF(result);
5189 return NULL;
5190 }
5191
5192 /* Create a three-level trie */
5193 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5194 16*count2 + 128*count3 - 1);
5195 if (!result)
5196 return PyErr_NoMemory();
5197 PyObject_Init(result, &EncodingMapType);
5198 mresult = (struct encoding_map*)result;
5199 mresult->count2 = count2;
5200 mresult->count3 = count3;
5201 mlevel1 = mresult->level1;
5202 mlevel2 = mresult->level23;
5203 mlevel3 = mresult->level23 + 16*count2;
5204 memcpy(mlevel1, level1, 32);
5205 memset(mlevel2, 0xFF, 16*count2);
5206 memset(mlevel3, 0, 128*count3);
5207 count3 = 0;
5208 for (i = 1; i < 256; i++) {
5209 int o1, o2, o3, i2, i3;
5210 if (decode[i] == 0xFFFE)
5211 /* unmapped character */
5212 continue;
5213 o1 = decode[i]>>11;
5214 o2 = (decode[i]>>7) & 0xF;
5215 i2 = 16*mlevel1[o1] + o2;
5216 if (mlevel2[i2] == 0xFF)
5217 mlevel2[i2] = count3++;
5218 o3 = decode[i] & 0x7F;
5219 i3 = 128*mlevel2[i2] + o3;
5220 mlevel3[i3] = i;
5221 }
5222 return result;
5223}
5224
5225static int
5226encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5227{
5228 struct encoding_map *map = (struct encoding_map*)mapping;
5229 int l1 = c>>11;
5230 int l2 = (c>>7) & 0xF;
5231 int l3 = c & 0x7F;
5232 int i;
5233
5234#ifdef Py_UNICODE_WIDE
5235 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005237 }
5238#endif
5239 if (c == 0)
5240 return 0;
5241 /* level 1*/
5242 i = map->level1[l1];
5243 if (i == 0xFF) {
5244 return -1;
5245 }
5246 /* level 2*/
5247 i = map->level23[16*i+l2];
5248 if (i == 0xFF) {
5249 return -1;
5250 }
5251 /* level 3 */
5252 i = map->level23[16*map->count2 + 128*i + l3];
5253 if (i == 0) {
5254 return -1;
5255 }
5256 return i;
5257}
5258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005259/* Lookup the character ch in the mapping. If the character
5260 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005261 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005262static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263{
Christian Heimes217cfd12007-12-02 14:31:20 +00005264 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005265 PyObject *x;
5266
5267 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005269 x = PyObject_GetItem(mapping, w);
5270 Py_DECREF(w);
5271 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5273 /* No mapping found means: mapping is undefined. */
5274 PyErr_Clear();
5275 x = Py_None;
5276 Py_INCREF(x);
5277 return x;
5278 } else
5279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005281 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005283 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 long value = PyLong_AS_LONG(x);
5285 if (value < 0 || value > 255) {
5286 PyErr_SetString(PyExc_TypeError,
5287 "character mapping must be in range(256)");
5288 Py_DECREF(x);
5289 return NULL;
5290 }
5291 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005293 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005294 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 /* wrong return value */
5297 PyErr_Format(PyExc_TypeError,
5298 "character mapping must return integer, bytes or None, not %.400s",
5299 x->ob_type->tp_name);
5300 Py_DECREF(x);
5301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302 }
5303}
5304
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005305static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005306charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005307{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005308 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5309 /* exponentially overallocate to minimize reallocations */
5310 if (requiredsize < 2*outsize)
5311 requiredsize = 2*outsize;
5312 if (_PyBytes_Resize(outobj, requiredsize))
5313 return -1;
5314 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005315}
5316
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005321 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005322 space is available. Return a new reference to the object that
5323 was put in the output buffer, or Py_None, if the mapping was undefined
5324 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005325 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005326static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005327charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005329{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005330 PyObject *rep;
5331 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005332 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005333
Christian Heimes90aa7642007-12-19 02:45:37 +00005334 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005335 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005337 if (res == -1)
5338 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 if (outsize<requiredsize)
5340 if (charmapencode_resize(outobj, outpos, requiredsize))
5341 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005342 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 outstart[(*outpos)++] = (char)res;
5344 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005345 }
5346
5347 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005350 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 Py_DECREF(rep);
5352 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005353 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 if (PyLong_Check(rep)) {
5355 Py_ssize_t requiredsize = *outpos+1;
5356 if (outsize<requiredsize)
5357 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5358 Py_DECREF(rep);
5359 return enc_EXCEPTION;
5360 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005361 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005363 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 else {
5365 const char *repchars = PyBytes_AS_STRING(rep);
5366 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5367 Py_ssize_t requiredsize = *outpos+repsize;
5368 if (outsize<requiredsize)
5369 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5370 Py_DECREF(rep);
5371 return enc_EXCEPTION;
5372 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005373 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 memcpy(outstart + *outpos, repchars, repsize);
5375 *outpos += repsize;
5376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005378 Py_DECREF(rep);
5379 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380}
5381
5382/* handle an error in PyUnicode_EncodeCharmap
5383 Return 0 on success, -1 on error */
5384static
5385int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005386 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005387 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005388 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005389 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005390{
5391 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005392 Py_ssize_t repsize;
5393 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005394 Py_UNICODE *uni2;
5395 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005396 Py_ssize_t collstartpos = *inpos;
5397 Py_ssize_t collendpos = *inpos+1;
5398 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399 char *encoding = "charmap";
5400 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005401 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005402
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403 /* find all unencodable characters */
5404 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005405 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005406 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 int res = encoding_map_lookup(p[collendpos], mapping);
5408 if (res != -1)
5409 break;
5410 ++collendpos;
5411 continue;
5412 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005413
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 rep = charmapencode_lookup(p[collendpos], mapping);
5415 if (rep==NULL)
5416 return -1;
5417 else if (rep!=Py_None) {
5418 Py_DECREF(rep);
5419 break;
5420 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005421 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005423 }
5424 /* cache callback name lookup
5425 * (if not done yet, i.e. it's the first error) */
5426 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 if ((errors==NULL) || (!strcmp(errors, "strict")))
5428 *known_errorHandler = 1;
5429 else if (!strcmp(errors, "replace"))
5430 *known_errorHandler = 2;
5431 else if (!strcmp(errors, "ignore"))
5432 *known_errorHandler = 3;
5433 else if (!strcmp(errors, "xmlcharrefreplace"))
5434 *known_errorHandler = 4;
5435 else
5436 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 }
5438 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005439 case 1: /* strict */
5440 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5441 return -1;
5442 case 2: /* replace */
5443 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 x = charmapencode_output('?', mapping, res, respos);
5445 if (x==enc_EXCEPTION) {
5446 return -1;
5447 }
5448 else if (x==enc_FAILED) {
5449 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5450 return -1;
5451 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005452 }
5453 /* fall through */
5454 case 3: /* ignore */
5455 *inpos = collendpos;
5456 break;
5457 case 4: /* xmlcharrefreplace */
5458 /* generate replacement (temporarily (mis)uses p) */
5459 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 char buffer[2+29+1+1];
5461 char *cp;
5462 sprintf(buffer, "&#%d;", (int)p[collpos]);
5463 for (cp = buffer; *cp; ++cp) {
5464 x = charmapencode_output(*cp, mapping, res, respos);
5465 if (x==enc_EXCEPTION)
5466 return -1;
5467 else if (x==enc_FAILED) {
5468 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5469 return -1;
5470 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005471 }
5472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005473 *inpos = collendpos;
5474 break;
5475 default:
5476 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005477 encoding, reason, p, size, exceptionObject,
5478 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005479 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005481 if (PyBytes_Check(repunicode)) {
5482 /* Directly copy bytes result to output. */
5483 Py_ssize_t outsize = PyBytes_Size(*res);
5484 Py_ssize_t requiredsize;
5485 repsize = PyBytes_Size(repunicode);
5486 requiredsize = *respos + repsize;
5487 if (requiredsize > outsize)
5488 /* Make room for all additional bytes. */
5489 if (charmapencode_resize(res, respos, requiredsize)) {
5490 Py_DECREF(repunicode);
5491 return -1;
5492 }
5493 memcpy(PyBytes_AsString(*res) + *respos,
5494 PyBytes_AsString(repunicode), repsize);
5495 *respos += repsize;
5496 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005497 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005498 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005500 /* generate replacement */
5501 repsize = PyUnicode_GET_SIZE(repunicode);
5502 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 x = charmapencode_output(*uni2, mapping, res, respos);
5504 if (x==enc_EXCEPTION) {
5505 return -1;
5506 }
5507 else if (x==enc_FAILED) {
5508 Py_DECREF(repunicode);
5509 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5510 return -1;
5511 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005512 }
5513 *inpos = newpos;
5514 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 }
5516 return 0;
5517}
5518
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 Py_ssize_t size,
5521 PyObject *mapping,
5522 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 /* output object */
5525 PyObject *res = NULL;
5526 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005527 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005529 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005530 PyObject *errorHandler = NULL;
5531 PyObject *exc = NULL;
5532 /* the following variable is used for caching string comparisons
5533 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5534 * 3=ignore, 4=xmlcharrefreplace */
5535 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536
5537 /* Default to Latin-1 */
5538 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005541 /* allocate enough for a simple encoding without
5542 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005543 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 if (res == NULL)
5545 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005546 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005549 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 /* try to encode it */
5551 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5552 if (x==enc_EXCEPTION) /* error */
5553 goto onError;
5554 if (x==enc_FAILED) { /* unencodable character */
5555 if (charmap_encoding_error(p, size, &inpos, mapping,
5556 &exc,
5557 &known_errorHandler, &errorHandler, errors,
5558 &res, &respos)) {
5559 goto onError;
5560 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005561 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 else
5563 /* done with this character => adjust input position */
5564 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005568 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005569 if (_PyBytes_Resize(&res, respos) < 0)
5570 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 Py_XDECREF(exc);
5573 Py_XDECREF(errorHandler);
5574 return res;
5575
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577 Py_XDECREF(res);
5578 Py_XDECREF(exc);
5579 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 return NULL;
5581}
5582
5583PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005584 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585{
5586 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 PyErr_BadArgument();
5588 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 }
5590 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 PyUnicode_GET_SIZE(unicode),
5592 mapping,
5593 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594}
5595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596/* create or adjust a UnicodeTranslateError */
5597static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 const Py_UNICODE *unicode, Py_ssize_t size,
5599 Py_ssize_t startpos, Py_ssize_t endpos,
5600 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005603 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 }
5606 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5608 goto onError;
5609 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5610 goto onError;
5611 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5612 goto onError;
5613 return;
5614 onError:
5615 Py_DECREF(*exceptionObject);
5616 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 }
5618}
5619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620/* raises a UnicodeTranslateError */
5621static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 const Py_UNICODE *unicode, Py_ssize_t size,
5623 Py_ssize_t startpos, Py_ssize_t endpos,
5624 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625{
5626 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630}
5631
5632/* error handling callback helper:
5633 build arguments, call the callback and check the arguments,
5634 put the result into newpos and return the replacement string, which
5635 has to be freed by the caller */
5636static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 PyObject **errorHandler,
5638 const char *reason,
5639 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5640 Py_ssize_t startpos, Py_ssize_t endpos,
5641 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005643 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005644
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005645 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 PyObject *restuple;
5647 PyObject *resunicode;
5648
5649 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 }
5654
5655 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005657 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659
5660 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005662 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005665 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 Py_DECREF(restuple);
5667 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005668 }
5669 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 &resunicode, &i_newpos)) {
5671 Py_DECREF(restuple);
5672 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005674 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005676 else
5677 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005678 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5680 Py_DECREF(restuple);
5681 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005682 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 Py_INCREF(resunicode);
5684 Py_DECREF(restuple);
5685 return resunicode;
5686}
5687
5688/* Lookup the character ch in the mapping and put the result in result,
5689 which must be decrefed by the caller.
5690 Return 0 on success, -1 on error */
5691static
5692int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5693{
Christian Heimes217cfd12007-12-02 14:31:20 +00005694 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 PyObject *x;
5696
5697 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 x = PyObject_GetItem(mapping, w);
5700 Py_DECREF(w);
5701 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5703 /* No mapping found means: use 1:1 mapping. */
5704 PyErr_Clear();
5705 *result = NULL;
5706 return 0;
5707 } else
5708 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 }
5710 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 *result = x;
5712 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005714 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 long value = PyLong_AS_LONG(x);
5716 long max = PyUnicode_GetMax();
5717 if (value < 0 || value > max) {
5718 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005719 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 Py_DECREF(x);
5721 return -1;
5722 }
5723 *result = x;
5724 return 0;
5725 }
5726 else if (PyUnicode_Check(x)) {
5727 *result = x;
5728 return 0;
5729 }
5730 else {
5731 /* wrong return value */
5732 PyErr_SetString(PyExc_TypeError,
5733 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005734 Py_DECREF(x);
5735 return -1;
5736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737}
5738/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 if not reallocate and adjust various state variables.
5740 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741static
Walter Dörwald4894c302003-10-24 14:25:28 +00005742int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005745 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005746 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 /* remember old output position */
5748 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5749 /* exponentially overallocate to minimize reallocations */
5750 if (requiredsize < 2 * oldsize)
5751 requiredsize = 2 * oldsize;
5752 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5753 return -1;
5754 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 }
5756 return 0;
5757}
5758/* lookup the character, put the result in the output string and adjust
5759 various state variables. Return a new reference to the object that
5760 was put in the output buffer in *result, or Py_None, if the mapping was
5761 undefined (in which case no character was written).
5762 The called must decref result.
5763 Return 0 on success, -1 on error. */
5764static
Walter Dörwald4894c302003-10-24 14:25:28 +00005765int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5767 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768{
Walter Dörwald4894c302003-10-24 14:25:28 +00005769 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 /* not found => default to 1:1 mapping */
5773 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 }
5775 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005777 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 /* no overflow check, because we know that the space is enough */
5779 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 }
5781 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5783 if (repsize==1) {
5784 /* no overflow check, because we know that the space is enough */
5785 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5786 }
5787 else if (repsize!=0) {
5788 /* more than one character */
5789 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5790 (insize - (curinp-startinp)) +
5791 repsize - 1;
5792 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5793 return -1;
5794 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5795 *outp += repsize;
5796 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 }
5798 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 return 0;
5801}
5802
5803PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 Py_ssize_t size,
5805 PyObject *mapping,
5806 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005808 /* output object */
5809 PyObject *res = NULL;
5810 /* pointers to the beginning and end+1 of input */
5811 const Py_UNICODE *startp = p;
5812 const Py_UNICODE *endp = p + size;
5813 /* pointer into the output */
5814 Py_UNICODE *str;
5815 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005816 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817 char *reason = "character maps to <undefined>";
5818 PyObject *errorHandler = NULL;
5819 PyObject *exc = NULL;
5820 /* the following variable is used for caching string comparisons
5821 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5822 * 3=ignore, 4=xmlcharrefreplace */
5823 int known_errorHandler = -1;
5824
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 PyErr_BadArgument();
5827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829
5830 /* allocate enough for a simple 1:1 translation without
5831 replacements, if we need more, we'll resize */
5832 res = PyUnicode_FromUnicode(NULL, size);
5833 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005837 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 /* try to encode it */
5841 PyObject *x = NULL;
5842 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5843 Py_XDECREF(x);
5844 goto onError;
5845 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005846 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 if (x!=Py_None) /* it worked => adjust input pointer */
5848 ++p;
5849 else { /* untranslatable character */
5850 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5851 Py_ssize_t repsize;
5852 Py_ssize_t newpos;
5853 Py_UNICODE *uni2;
5854 /* startpos for collecting untranslatable chars */
5855 const Py_UNICODE *collstart = p;
5856 const Py_UNICODE *collend = p+1;
5857 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 /* find all untranslatable characters */
5860 while (collend < endp) {
5861 if (charmaptranslate_lookup(*collend, mapping, &x))
5862 goto onError;
5863 Py_XDECREF(x);
5864 if (x!=Py_None)
5865 break;
5866 ++collend;
5867 }
5868 /* cache callback name lookup
5869 * (if not done yet, i.e. it's the first error) */
5870 if (known_errorHandler==-1) {
5871 if ((errors==NULL) || (!strcmp(errors, "strict")))
5872 known_errorHandler = 1;
5873 else if (!strcmp(errors, "replace"))
5874 known_errorHandler = 2;
5875 else if (!strcmp(errors, "ignore"))
5876 known_errorHandler = 3;
5877 else if (!strcmp(errors, "xmlcharrefreplace"))
5878 known_errorHandler = 4;
5879 else
5880 known_errorHandler = 0;
5881 }
5882 switch (known_errorHandler) {
5883 case 1: /* strict */
5884 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005885 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 case 2: /* replace */
5887 /* No need to check for space, this is a 1:1 replacement */
5888 for (coll = collstart; coll<collend; ++coll)
5889 *str++ = '?';
5890 /* fall through */
5891 case 3: /* ignore */
5892 p = collend;
5893 break;
5894 case 4: /* xmlcharrefreplace */
5895 /* generate replacement (temporarily (mis)uses p) */
5896 for (p = collstart; p < collend; ++p) {
5897 char buffer[2+29+1+1];
5898 char *cp;
5899 sprintf(buffer, "&#%d;", (int)*p);
5900 if (charmaptranslate_makespace(&res, &str,
5901 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5902 goto onError;
5903 for (cp = buffer; *cp; ++cp)
5904 *str++ = *cp;
5905 }
5906 p = collend;
5907 break;
5908 default:
5909 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5910 reason, startp, size, &exc,
5911 collstart-startp, collend-startp, &newpos);
5912 if (repunicode == NULL)
5913 goto onError;
5914 /* generate replacement */
5915 repsize = PyUnicode_GET_SIZE(repunicode);
5916 if (charmaptranslate_makespace(&res, &str,
5917 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5918 Py_DECREF(repunicode);
5919 goto onError;
5920 }
5921 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5922 *str++ = *uni2;
5923 p = startp + newpos;
5924 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005925 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005926 }
5927 }
    /* Resize if we allocated too much */
5929 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005930 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 if (PyUnicode_Resize(&res, respos) < 0)
5932 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005933 }
5934 Py_XDECREF(exc);
5935 Py_XDECREF(errorHandler);
5936 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005939 Py_XDECREF(res);
5940 Py_XDECREF(exc);
5941 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 return NULL;
5943}
5944
5945PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 PyObject *mapping,
5947 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948{
5949 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005950
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 str = PyUnicode_FromObject(str);
5952 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 PyUnicode_GET_SIZE(str),
5956 mapping,
5957 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 Py_DECREF(str);
5959 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005960
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 Py_XDECREF(str);
5963 return NULL;
5964}
Tim Petersced69f82003-09-16 20:30:58 +00005965
Guido van Rossum9e896b32000-04-05 20:11:21 +00005966/* --- Decimal Encoder ---------------------------------------------------- */
5967
5968int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 Py_ssize_t length,
5970 char *output,
5971 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005972{
5973 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005974 PyObject *errorHandler = NULL;
5975 PyObject *exc = NULL;
5976 const char *encoding = "decimal";
5977 const char *reason = "invalid decimal Unicode string";
5978 /* the following variable is used for caching string comparisons
5979 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5980 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005981
5982 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 PyErr_BadArgument();
5984 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005985 }
5986
5987 p = s;
5988 end = s + length;
5989 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 register Py_UNICODE ch = *p;
5991 int decimal;
5992 PyObject *repunicode;
5993 Py_ssize_t repsize;
5994 Py_ssize_t newpos;
5995 Py_UNICODE *uni2;
5996 Py_UNICODE *collstart;
5997 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005998
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006000 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 ++p;
6002 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006003 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 decimal = Py_UNICODE_TODECIMAL(ch);
6005 if (decimal >= 0) {
6006 *output++ = '0' + decimal;
6007 ++p;
6008 continue;
6009 }
6010 if (0 < ch && ch < 256) {
6011 *output++ = (char)ch;
6012 ++p;
6013 continue;
6014 }
6015 /* All other characters are considered unencodable */
6016 collstart = p;
6017 collend = p+1;
6018 while (collend < end) {
6019 if ((0 < *collend && *collend < 256) ||
6020 !Py_UNICODE_ISSPACE(*collend) ||
6021 Py_UNICODE_TODECIMAL(*collend))
6022 break;
6023 }
6024 /* cache callback name lookup
6025 * (if not done yet, i.e. it's the first error) */
6026 if (known_errorHandler==-1) {
6027 if ((errors==NULL) || (!strcmp(errors, "strict")))
6028 known_errorHandler = 1;
6029 else if (!strcmp(errors, "replace"))
6030 known_errorHandler = 2;
6031 else if (!strcmp(errors, "ignore"))
6032 known_errorHandler = 3;
6033 else if (!strcmp(errors, "xmlcharrefreplace"))
6034 known_errorHandler = 4;
6035 else
6036 known_errorHandler = 0;
6037 }
6038 switch (known_errorHandler) {
6039 case 1: /* strict */
6040 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6041 goto onError;
6042 case 2: /* replace */
6043 for (p = collstart; p < collend; ++p)
6044 *output++ = '?';
6045 /* fall through */
6046 case 3: /* ignore */
6047 p = collend;
6048 break;
6049 case 4: /* xmlcharrefreplace */
6050 /* generate replacement (temporarily (mis)uses p) */
6051 for (p = collstart; p < collend; ++p)
6052 output += sprintf(output, "&#%d;", (int)*p);
6053 p = collend;
6054 break;
6055 default:
6056 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6057 encoding, reason, s, length, &exc,
6058 collstart-s, collend-s, &newpos);
6059 if (repunicode == NULL)
6060 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006061 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006062 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006063 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6064 Py_DECREF(repunicode);
6065 goto onError;
6066 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 /* generate replacement */
6068 repsize = PyUnicode_GET_SIZE(repunicode);
6069 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6070 Py_UNICODE ch = *uni2;
6071 if (Py_UNICODE_ISSPACE(ch))
6072 *output++ = ' ';
6073 else {
6074 decimal = Py_UNICODE_TODECIMAL(ch);
6075 if (decimal >= 0)
6076 *output++ = '0' + decimal;
6077 else if (0 < ch && ch < 256)
6078 *output++ = (char)ch;
6079 else {
6080 Py_DECREF(repunicode);
6081 raise_encode_exception(&exc, encoding,
6082 s, length, collstart-s, collend-s, reason);
6083 goto onError;
6084 }
6085 }
6086 }
6087 p = s + newpos;
6088 Py_DECREF(repunicode);
6089 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006090 }
6091 /* 0-terminate the output string */
6092 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 Py_XDECREF(exc);
6094 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006095 return 0;
6096
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098 Py_XDECREF(exc);
6099 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006100 return -1;
6101}
6102
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103/* --- Helpers ------------------------------------------------------------ */
6104
Eric Smith8c663262007-08-25 02:26:07 +00006105#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006106#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006107
Thomas Wouters477c8d52006-05-27 19:21:47 +00006108#include "stringlib/count.h"
6109#include "stringlib/find.h"
6110#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006111#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006112
Eric Smith5807c412008-05-11 21:00:57 +00006113#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006114#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006115#include "stringlib/localeutil.h"
6116
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`; a negative `start` or `end` has `len` added to
   it and is then floored at 0.  `start` and `end` must be modifiable
   lvalues and, like `len`, are evaluated more than once, so none of the
   arguments may have side effects.

   The body is wrapped in do { ... } while (0) so that the expansion is
   one statement (safe as the unbraced body of an if/else); invoke it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006131
Martin v. Löwis18e16552006-02-15 17:27:45 +00006132Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006133 PyObject *substr,
6134 Py_ssize_t start,
6135 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006137 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006138 PyUnicodeObject* str_obj;
6139 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006140
Thomas Wouters477c8d52006-05-27 19:21:47 +00006141 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6142 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006144 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6145 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 Py_DECREF(str_obj);
6147 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 }
Tim Petersced69f82003-09-16 20:30:58 +00006149
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006150 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006151 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006152 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6153 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006154 );
6155
6156 Py_DECREF(sub_obj);
6157 Py_DECREF(str_obj);
6158
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 return result;
6160}
6161
Martin v. Löwis18e16552006-02-15 17:27:45 +00006162Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006163 PyObject *sub,
6164 Py_ssize_t start,
6165 Py_ssize_t end,
6166 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006168 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006169
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006171 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006173 sub = PyUnicode_FromObject(sub);
6174 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 Py_DECREF(str);
6176 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177 }
Tim Petersced69f82003-09-16 20:30:58 +00006178
Thomas Wouters477c8d52006-05-27 19:21:47 +00006179 if (direction > 0)
6180 result = stringlib_find_slice(
6181 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6182 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6183 start, end
6184 );
6185 else
6186 result = stringlib_rfind_slice(
6187 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6188 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6189 start, end
6190 );
6191
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006193 Py_DECREF(sub);
6194
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 return result;
6196}
6197
Tim Petersced69f82003-09-16 20:30:58 +00006198static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 PyUnicodeObject *substring,
6201 Py_ssize_t start,
6202 Py_ssize_t end,
6203 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 if (substring->length == 0)
6206 return 1;
6207
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006208 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 end -= substring->length;
6210 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212
6213 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 if (Py_UNICODE_MATCH(self, end, substring))
6215 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 } else {
6217 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 }
6220
6221 return 0;
6222}
6223
Martin v. Löwis18e16552006-02-15 17:27:45 +00006224Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006225 PyObject *substr,
6226 Py_ssize_t start,
6227 Py_ssize_t end,
6228 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006230 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006231
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 str = PyUnicode_FromObject(str);
6233 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 substr = PyUnicode_FromObject(substr);
6236 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 Py_DECREF(str);
6238 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 }
Tim Petersced69f82003-09-16 20:30:58 +00006240
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 (PyUnicodeObject *)substr,
6243 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 Py_DECREF(str);
6245 Py_DECREF(substr);
6246 return result;
6247}
6248
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249/* Apply fixfct filter to the Unicode object self and return a
6250 reference to the modified object */
6251
Tim Petersced69f82003-09-16 20:30:58 +00006252static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255{
6256
6257 PyUnicodeObject *u;
6258
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006259 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006262
6263 Py_UNICODE_COPY(u->str, self->str, self->length);
6264
Tim Peters7a29bd52001-09-12 03:03:31 +00006265 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 /* fixfct should return TRUE if it modified the buffer. If
6267 FALSE, return a reference to the original buffer instead
6268 (to save space, not time) */
6269 Py_INCREF(self);
6270 Py_DECREF(u);
6271 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 }
6273 return (PyObject*) u;
6274}
6275
Tim Petersced69f82003-09-16 20:30:58 +00006276static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277int fixupper(PyUnicodeObject *self)
6278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006279 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 Py_UNICODE *s = self->str;
6281 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006282
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006285
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 ch = Py_UNICODE_TOUPPER(*s);
6287 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 *s = ch;
6290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 s++;
6292 }
6293
6294 return status;
6295}
6296
Tim Petersced69f82003-09-16 20:30:58 +00006297static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298int fixlower(PyUnicodeObject *self)
6299{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006300 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 Py_UNICODE *s = self->str;
6302 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006303
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006306
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 ch = Py_UNICODE_TOLOWER(*s);
6308 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 *s = ch;
6311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 s++;
6313 }
6314
6315 return status;
6316}
6317
Tim Petersced69f82003-09-16 20:30:58 +00006318static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319int fixswapcase(PyUnicodeObject *self)
6320{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006321 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 Py_UNICODE *s = self->str;
6323 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006324
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 while (len-- > 0) {
6326 if (Py_UNICODE_ISUPPER(*s)) {
6327 *s = Py_UNICODE_TOLOWER(*s);
6328 status = 1;
6329 } else if (Py_UNICODE_ISLOWER(*s)) {
6330 *s = Py_UNICODE_TOUPPER(*s);
6331 status = 1;
6332 }
6333 s++;
6334 }
6335
6336 return status;
6337}
6338
Tim Petersced69f82003-09-16 20:30:58 +00006339static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340int fixcapitalize(PyUnicodeObject *self)
6341{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006342 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006343 Py_UNICODE *s = self->str;
6344 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006345
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006346 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006348 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 *s = Py_UNICODE_TOUPPER(*s);
6350 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006352 s++;
6353 while (--len > 0) {
6354 if (Py_UNICODE_ISUPPER(*s)) {
6355 *s = Py_UNICODE_TOLOWER(*s);
6356 status = 1;
6357 }
6358 s++;
6359 }
6360 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361}
6362
6363static
6364int fixtitle(PyUnicodeObject *self)
6365{
6366 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6367 register Py_UNICODE *e;
6368 int previous_is_cased;
6369
6370 /* Shortcut for single character strings */
6371 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6373 if (*p != ch) {
6374 *p = ch;
6375 return 1;
6376 }
6377 else
6378 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 }
Tim Petersced69f82003-09-16 20:30:58 +00006380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 e = p + PyUnicode_GET_SIZE(self);
6382 previous_is_cased = 0;
6383 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006385
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 if (previous_is_cased)
6387 *p = Py_UNICODE_TOLOWER(ch);
6388 else
6389 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006390
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 if (Py_UNICODE_ISLOWER(ch) ||
6392 Py_UNICODE_ISUPPER(ch) ||
6393 Py_UNICODE_ISTITLE(ch))
6394 previous_is_cased = 1;
6395 else
6396 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 }
6398 return 1;
6399}
6400
Tim Peters8ce9f162004-08-27 01:49:32 +00006401PyObject *
6402PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403{
Skip Montanaro6543b452004-09-16 03:28:13 +00006404 const Py_UNICODE blank = ' ';
6405 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006406 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006407 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006408 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6409 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006410 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6411 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006412 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006413 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
Tim Peters05eba1f2004-08-27 21:32:02 +00006415 fseq = PySequence_Fast(seq, "");
6416 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006417 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006418 }
6419
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006420 /* NOTE: the following code can't call back into Python code,
6421 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006422 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006423
Tim Peters05eba1f2004-08-27 21:32:02 +00006424 seqlen = PySequence_Fast_GET_SIZE(fseq);
6425 /* If empty sequence, return u"". */
6426 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006427 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6428 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006429 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006430 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006431 /* If singleton sequence with an exact Unicode, return that. */
6432 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 item = items[0];
6434 if (PyUnicode_CheckExact(item)) {
6435 Py_INCREF(item);
6436 res = (PyUnicodeObject *)item;
6437 goto Done;
6438 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006439 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006440 else {
6441 /* Set up sep and seplen */
6442 if (separator == NULL) {
6443 sep = &blank;
6444 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006445 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006446 else {
6447 if (!PyUnicode_Check(separator)) {
6448 PyErr_Format(PyExc_TypeError,
6449 "separator: expected str instance,"
6450 " %.80s found",
6451 Py_TYPE(separator)->tp_name);
6452 goto onError;
6453 }
6454 sep = PyUnicode_AS_UNICODE(separator);
6455 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006456 }
6457 }
6458
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006459 /* There are at least two things to join, or else we have a subclass
6460 * of str in the sequence.
6461 * Do a pre-pass to figure out the total amount of space we'll
6462 * need (sz), and see whether all argument are strings.
6463 */
6464 sz = 0;
6465 for (i = 0; i < seqlen; i++) {
6466 const Py_ssize_t old_sz = sz;
6467 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 if (!PyUnicode_Check(item)) {
6469 PyErr_Format(PyExc_TypeError,
6470 "sequence item %zd: expected str instance,"
6471 " %.80s found",
6472 i, Py_TYPE(item)->tp_name);
6473 goto onError;
6474 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006475 sz += PyUnicode_GET_SIZE(item);
6476 if (i != 0)
6477 sz += seplen;
6478 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6479 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006481 goto onError;
6482 }
6483 }
Tim Petersced69f82003-09-16 20:30:58 +00006484
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006485 res = _PyUnicode_New(sz);
6486 if (res == NULL)
6487 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006488
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006489 /* Catenate everything. */
6490 res_p = PyUnicode_AS_UNICODE(res);
6491 for (i = 0; i < seqlen; ++i) {
6492 Py_ssize_t itemlen;
6493 item = items[i];
6494 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 /* Copy item, and maybe the separator. */
6496 if (i) {
6497 Py_UNICODE_COPY(res_p, sep, seplen);
6498 res_p += seplen;
6499 }
6500 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6501 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006502 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006503
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006505 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 return (PyObject *)res;
6507
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006509 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006510 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 return NULL;
6512}
6513
Tim Petersced69f82003-09-16 20:30:58 +00006514static
6515PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 Py_ssize_t left,
6517 Py_ssize_t right,
6518 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519{
6520 PyUnicodeObject *u;
6521
6522 if (left < 0)
6523 left = 0;
6524 if (right < 0)
6525 right = 0;
6526
Tim Peters7a29bd52001-09-12 03:03:31 +00006527 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 Py_INCREF(self);
6529 return self;
6530 }
6531
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006532 if (left > PY_SSIZE_T_MAX - self->length ||
6533 right > PY_SSIZE_T_MAX - (left + self->length)) {
6534 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6535 return NULL;
6536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 u = _PyUnicode_New(left + self->length + right);
6538 if (u) {
6539 if (left)
6540 Py_UNICODE_FILL(u->str, fill, left);
6541 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6542 if (right)
6543 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6544 }
6545
6546 return u;
6547}
6548
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006549PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
6553 string = PyUnicode_FromObject(string);
6554 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006557 list = stringlib_splitlines(
6558 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6559 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560
6561 Py_DECREF(string);
6562 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563}
6564
Tim Petersced69f82003-09-16 20:30:58 +00006565static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 PyUnicodeObject *substring,
6568 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006571 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006574 return stringlib_split_whitespace(
6575 (PyObject*) self, self->str, self->length, maxcount
6576 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006578 return stringlib_split(
6579 (PyObject*) self, self->str, self->length,
6580 substring->str, substring->length,
6581 maxcount
6582 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583}
6584
Tim Petersced69f82003-09-16 20:30:58 +00006585static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006586PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 PyUnicodeObject *substring,
6588 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006589{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006590 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006591 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006592
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006593 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006594 return stringlib_rsplit_whitespace(
6595 (PyObject*) self, self->str, self->length, maxcount
6596 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006597
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006598 return stringlib_rsplit(
6599 (PyObject*) self, self->str, self->length,
6600 substring->str, substring->length,
6601 maxcount
6602 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006603}
6604
6605static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 PyUnicodeObject *str1,
6608 PyUnicodeObject *str2,
6609 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610{
6611 PyUnicodeObject *u;
6612
6613 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006615 else if (maxcount == 0 || self->length == 0)
6616 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617
Thomas Wouters477c8d52006-05-27 19:21:47 +00006618 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006619 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006620 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006621 if (str1->length == 0)
6622 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006623 if (str1->length == 1) {
6624 /* replace characters */
6625 Py_UNICODE u1, u2;
6626 if (!findchar(self->str, self->length, str1->str[0]))
6627 goto nothing;
6628 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6629 if (!u)
6630 return NULL;
6631 Py_UNICODE_COPY(u->str, self->str, self->length);
6632 u1 = str1->str[0];
6633 u2 = str2->str[0];
6634 for (i = 0; i < u->length; i++)
6635 if (u->str[i] == u1) {
6636 if (--maxcount < 0)
6637 break;
6638 u->str[i] = u2;
6639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006641 i = stringlib_find(
6642 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006644 if (i < 0)
6645 goto nothing;
6646 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6647 if (!u)
6648 return NULL;
6649 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006650
6651 /* change everything in-place, starting with this one */
6652 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6653 i += str1->length;
6654
6655 while ( --maxcount > 0) {
6656 i = stringlib_find(self->str+i, self->length-i,
6657 str1->str, str1->length,
6658 i);
6659 if (i == -1)
6660 break;
6661 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6662 i += str1->length;
6663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006666
6667 Py_ssize_t n, i, j, e;
6668 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 Py_UNICODE *p;
6670
6671 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006672 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6673 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006674 if (n == 0)
6675 goto nothing;
6676 /* new_size = self->length + n * (str2->length - str1->length)); */
6677 delta = (str2->length - str1->length);
6678 if (delta == 0) {
6679 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006681 product = n * (str2->length - str1->length);
6682 if ((product / (str2->length - str1->length)) != n) {
6683 PyErr_SetString(PyExc_OverflowError,
6684 "replace string is too long");
6685 return NULL;
6686 }
6687 new_size = self->length + product;
6688 if (new_size < 0) {
6689 PyErr_SetString(PyExc_OverflowError,
6690 "replace string is too long");
6691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 }
6693 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006694 u = _PyUnicode_New(new_size);
6695 if (!u)
6696 return NULL;
6697 i = 0;
6698 p = u->str;
6699 e = self->length - str1->length;
6700 if (str1->length > 0) {
6701 while (n-- > 0) {
6702 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006703 j = stringlib_find(self->str+i, self->length-i,
6704 str1->str, str1->length,
6705 i);
6706 if (j == -1)
6707 break;
6708 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006709 /* copy unchanged part [i:j] */
6710 Py_UNICODE_COPY(p, self->str+i, j-i);
6711 p += j - i;
6712 }
6713 /* copy substitution string */
6714 if (str2->length > 0) {
6715 Py_UNICODE_COPY(p, str2->str, str2->length);
6716 p += str2->length;
6717 }
6718 i = j + str1->length;
6719 }
6720 if (i < self->length)
6721 /* copy tail [i:] */
6722 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6723 } else {
6724 /* interleave */
6725 while (n > 0) {
6726 Py_UNICODE_COPY(p, str2->str, str2->length);
6727 p += str2->length;
6728 if (--n <= 0)
6729 break;
6730 *p++ = self->str[i++];
6731 }
6732 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006736
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006738 /* nothing to replace; return original string (when possible) */
6739 if (PyUnicode_CheckExact(self)) {
6740 Py_INCREF(self);
6741 return (PyObject *) self;
6742 }
6743 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744}
6745
6746/* --- Unicode Object Methods --------------------------------------------- */
6747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006748PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750\n\
6751Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006752characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
6754static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006755unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 return fixup(self, fixtitle);
6758}
6759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006760PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762\n\
6763Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006764have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765
6766static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006767unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 return fixup(self, fixcapitalize);
6770}
6771
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self with a NULL
   separator, run fixup(..., fixcapitalize) over every list item, then join
   the list again with PyUnicode_Join(NULL, ...).  Returns NULL if any step
   fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6809
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006810/* Argument converter. Coerces to a single unicode character */
6811
6812static int
6813convert_uc(PyObject *obj, void *addr)
6814{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006815 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6816 PyObject *uniobj;
6817 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006818
Benjamin Peterson14339b62009-01-31 16:36:08 +00006819 uniobj = PyUnicode_FromObject(obj);
6820 if (uniobj == NULL) {
6821 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006823 return 0;
6824 }
6825 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6826 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006828 Py_DECREF(uniobj);
6829 return 0;
6830 }
6831 unistr = PyUnicode_AS_UNICODE(uniobj);
6832 *fillcharloc = unistr[0];
6833 Py_DECREF(uniobj);
6834 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006835}
6836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006837PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006840Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006841done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842
6843static PyObject *
6844unicode_center(PyUnicodeObject *self, PyObject *args)
6845{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006846 Py_ssize_t marg, left;
6847 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006848 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849
Thomas Woutersde017742006-02-16 19:34:37 +00006850 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 return NULL;
6852
Tim Peters7a29bd52001-09-12 03:03:31 +00006853 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 Py_INCREF(self);
6855 return (PyObject*) self;
6856 }
6857
6858 marg = width - self->length;
6859 left = marg / 2 + (marg & width & 1);
6860
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006861 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862}
6863
Marc-André Lemburge5034372000-08-08 08:04:29 +00006864#if 0
6865
6866/* This code should go into some future Unicode collation support
6867 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006868 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006869
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006870/* speedy UTF-16 code point order comparison */
6871/* gleaned from: */
6872/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6873
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006874static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006875{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006876 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006877 0, 0, 0, 0, 0, 0, 0, 0,
6878 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006879 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006880};
6881
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882static int
6883unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6884{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006885 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006886
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 Py_UNICODE *s1 = str1->str;
6888 Py_UNICODE *s2 = str2->str;
6889
6890 len1 = str1->length;
6891 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006892
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006894 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006895
6896 c1 = *s1++;
6897 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006898
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 if (c1 > (1<<11) * 26)
6900 c1 += utf16Fixup[c1>>11];
6901 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006902 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006903 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006904
6905 if (c1 != c2)
6906 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006907
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006908 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 }
6910
6911 return (len1 < len2) ? -1 : (len1 != len2);
6912}
6913
Marc-André Lemburge5034372000-08-08 08:04:29 +00006914#else
6915
6916static int
6917unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6918{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006919 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006920
6921 Py_UNICODE *s1 = str1->str;
6922 Py_UNICODE *s2 = str2->str;
6923
6924 len1 = str1->length;
6925 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006926
Marc-André Lemburge5034372000-08-08 08:04:29 +00006927 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006928 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006929
Fredrik Lundh45714e92001-06-26 16:39:36 +00006930 c1 = *s1++;
6931 c2 = *s2++;
6932
6933 if (c1 != c2)
6934 return (c1 < c2) ? -1 : 1;
6935
Marc-André Lemburge5034372000-08-08 08:04:29 +00006936 len1--; len2--;
6937 }
6938
6939 return (len1 < len2) ? -1 : (len1 != len2);
6940}
6941
6942#endif
6943
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006947 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6948 return unicode_compare((PyUnicodeObject *)left,
6949 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006950 PyErr_Format(PyExc_TypeError,
6951 "Can't compare %.100s and %.100s",
6952 left->ob_type->tp_name,
6953 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 return -1;
6955}
6956
Martin v. Löwis5b222132007-06-10 09:51:05 +00006957int
6958PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6959{
6960 int i;
6961 Py_UNICODE *id;
6962 assert(PyUnicode_Check(uni));
6963 id = PyUnicode_AS_UNICODE(uni);
6964 /* Compare Unicode string and source character set string */
6965 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 if (id[i] != str[i])
6967 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006968 /* This check keeps Python strings that end in '\0' from comparing equal
6969 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006970 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006972 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006974 return 0;
6975}
6976
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006977
Benjamin Peterson29060642009-01-31 22:14:21 +00006978#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006979 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006980
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006981PyObject *PyUnicode_RichCompare(PyObject *left,
6982 PyObject *right,
6983 int op)
6984{
6985 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006986
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006987 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6988 PyObject *v;
6989 if (((PyUnicodeObject *) left)->length !=
6990 ((PyUnicodeObject *) right)->length) {
6991 if (op == Py_EQ) {
6992 Py_INCREF(Py_False);
6993 return Py_False;
6994 }
6995 if (op == Py_NE) {
6996 Py_INCREF(Py_True);
6997 return Py_True;
6998 }
6999 }
7000 if (left == right)
7001 result = 0;
7002 else
7003 result = unicode_compare((PyUnicodeObject *)left,
7004 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007005
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007006 /* Convert the return value to a Boolean */
7007 switch (op) {
7008 case Py_EQ:
7009 v = TEST_COND(result == 0);
7010 break;
7011 case Py_NE:
7012 v = TEST_COND(result != 0);
7013 break;
7014 case Py_LE:
7015 v = TEST_COND(result <= 0);
7016 break;
7017 case Py_GE:
7018 v = TEST_COND(result >= 0);
7019 break;
7020 case Py_LT:
7021 v = TEST_COND(result == -1);
7022 break;
7023 case Py_GT:
7024 v = TEST_COND(result == 1);
7025 break;
7026 default:
7027 PyErr_BadArgument();
7028 return NULL;
7029 }
7030 Py_INCREF(v);
7031 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007032 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007033
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007034 Py_INCREF(Py_NotImplemented);
7035 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007036}
7037
Guido van Rossum403d68b2000-03-13 15:55:09 +00007038int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007040{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007041 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007042 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007043
7044 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007045 sub = PyUnicode_FromObject(element);
7046 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 PyErr_Format(PyExc_TypeError,
7048 "'in <string>' requires string as left operand, not %s",
7049 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007050 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007051 }
7052
Thomas Wouters477c8d52006-05-27 19:21:47 +00007053 str = PyUnicode_FromObject(container);
7054 if (!str) {
7055 Py_DECREF(sub);
7056 return -1;
7057 }
7058
7059 result = stringlib_contains_obj(str, sub);
7060
7061 Py_DECREF(str);
7062 Py_DECREF(sub);
7063
Guido van Rossum403d68b2000-03-13 15:55:09 +00007064 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007065}
7066
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067/* Concat to string or Unicode object giving a new Unicode object. */
7068
7069PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071{
7072 PyUnicodeObject *u = NULL, *v = NULL, *w;
7073
7074 /* Coerce the two arguments */
7075 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7076 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7079 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081
7082 /* Shortcuts */
7083 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 Py_DECREF(v);
7085 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086 }
7087 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 Py_DECREF(u);
7089 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 }
7091
7092 /* Concat the two Unicode strings */
7093 w = _PyUnicode_New(u->length + v->length);
7094 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 Py_UNICODE_COPY(w->str, u->str, u->length);
7097 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7098
7099 Py_DECREF(u);
7100 Py_DECREF(v);
7101 return (PyObject *)w;
7102
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 Py_XDECREF(u);
7105 Py_XDECREF(v);
7106 return NULL;
7107}
7108
Walter Dörwald1ab83302007-05-18 17:15:44 +00007109void
7110PyUnicode_Append(PyObject **pleft, PyObject *right)
7111{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007112 PyObject *new;
7113 if (*pleft == NULL)
7114 return;
7115 if (right == NULL || !PyUnicode_Check(*pleft)) {
7116 Py_DECREF(*pleft);
7117 *pleft = NULL;
7118 return;
7119 }
7120 new = PyUnicode_Concat(*pleft, right);
7121 Py_DECREF(*pleft);
7122 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007123}
7124
7125void
7126PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7127{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007128 PyUnicode_Append(pleft, right);
7129 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007130}
7131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007132PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007135Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007136string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007137interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
7139static PyObject *
7140unicode_count(PyUnicodeObject *self, PyObject *args)
7141{
7142 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007143 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007144 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 PyObject *result;
7146
Guido van Rossumb8872e62000-05-09 14:14:27 +00007147 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 return NULL;
7150
7151 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007152 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007155
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007156 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007157 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007158 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007159 substring->str, substring->length,
7160 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007161 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162
7163 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007164
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 return result;
7166}
7167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007168PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007171Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007172to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007173handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007174a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7175'xmlcharrefreplace' as well as any other name registered with\n\
7176codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177
7178static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007179unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007181 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 char *encoding = NULL;
7183 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007184 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007185
Benjamin Peterson308d6372009-09-18 21:42:35 +00007186 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7187 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007189 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007190 if (v == NULL)
7191 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007192 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007193 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007194 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007195 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007196 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007197 Py_DECREF(v);
7198 return NULL;
7199 }
7200 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007201
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007203 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007204}
7205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007206PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208\n\
7209Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211
7212static PyObject*
7213unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7214{
7215 Py_UNICODE *e;
7216 Py_UNICODE *p;
7217 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007218 Py_UNICODE *qe;
7219 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220 PyUnicodeObject *u;
7221 int tabsize = 8;
7222
7223 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225
Thomas Wouters7e474022000-07-16 12:04:32 +00007226 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007227 i = 0; /* chars up to and including most recent \n or \r */
7228 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7229 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230 for (p = self->str; p < e; p++)
7231 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 if (tabsize > 0) {
7233 incr = tabsize - (j % tabsize); /* cannot overflow */
7234 if (j > PY_SSIZE_T_MAX - incr)
7235 goto overflow1;
7236 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007237 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 if (j > PY_SSIZE_T_MAX - 1)
7241 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 j++;
7243 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 if (i > PY_SSIZE_T_MAX - j)
7245 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007247 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 }
7249 }
7250
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007251 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007252 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007253
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 /* Second pass: create output string and fill it */
7255 u = _PyUnicode_New(i + j);
7256 if (!u)
7257 return NULL;
7258
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007259 j = 0; /* same as in first pass */
7260 q = u->str; /* next output char */
7261 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262
7263 for (p = self->str; p < e; p++)
7264 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007265 if (tabsize > 0) {
7266 i = tabsize - (j % tabsize);
7267 j += i;
7268 while (i--) {
7269 if (q >= qe)
7270 goto overflow2;
7271 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007272 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007274 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007275 else {
7276 if (q >= qe)
7277 goto overflow2;
7278 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007279 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280 if (*p == '\n' || *p == '\r')
7281 j = 0;
7282 }
7283
7284 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007285
7286 overflow2:
7287 Py_DECREF(u);
7288 overflow1:
7289 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291}
7292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007293PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295\n\
7296Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007297such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298arguments start and end are interpreted as in slice notation.\n\
7299\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007300Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301
7302static PyObject *
7303unicode_find(PyUnicodeObject *self, PyObject *args)
7304{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007305 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007306 Py_ssize_t start;
7307 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007308 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309
Christian Heimes9cd17752007-11-18 19:35:23 +00007310 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
Thomas Wouters477c8d52006-05-27 19:21:47 +00007313 result = stringlib_find_slice(
7314 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7315 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7316 start, end
7317 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318
7319 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007320
Christian Heimes217cfd12007-12-02 14:31:20 +00007321 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322}
7323
7324static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007325unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326{
7327 if (index < 0 || index >= self->length) {
7328 PyErr_SetString(PyExc_IndexError, "string index out of range");
7329 return NULL;
7330 }
7331
7332 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7333}
7334
Guido van Rossumc2504932007-09-18 19:42:40 +00007335/* Believe it or not, this produces the same value for ASCII strings
7336 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007338unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339{
Guido van Rossumc2504932007-09-18 19:42:40 +00007340 Py_ssize_t len;
7341 Py_UNICODE *p;
7342 long x;
7343
7344 if (self->hash != -1)
7345 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007346 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007347 p = self->str;
7348 x = *p << 7;
7349 while (--len >= 0)
7350 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007351 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007352 if (x == -1)
7353 x = -2;
7354 self->hash = x;
7355 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356}
7357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007358PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007361Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362
7363static PyObject *
7364unicode_index(PyUnicodeObject *self, PyObject *args)
7365{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007366 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007367 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007368 Py_ssize_t start;
7369 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370
Christian Heimes9cd17752007-11-18 19:35:23 +00007371 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
Thomas Wouters477c8d52006-05-27 19:21:47 +00007374 result = stringlib_find_slice(
7375 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7376 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7377 start, end
7378 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
7380 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007381
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 if (result < 0) {
7383 PyErr_SetString(PyExc_ValueError, "substring not found");
7384 return NULL;
7385 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007386
Christian Heimes217cfd12007-12-02 14:31:20 +00007387 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388}
7389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007390PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007393Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007394at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395
7396static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007397unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398{
7399 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7400 register const Py_UNICODE *e;
7401 int cased;
7402
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 /* Shortcut for single character strings */
7404 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007407 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007408 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007410
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 e = p + PyUnicode_GET_SIZE(self);
7412 cased = 0;
7413 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007415
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7417 return PyBool_FromLong(0);
7418 else if (!cased && Py_UNICODE_ISLOWER(ch))
7419 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007421 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422}
7423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007424PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007427Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007428at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
7430static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007431unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432{
7433 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7434 register const Py_UNICODE *e;
7435 int cased;
7436
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437 /* Shortcut for single character strings */
7438 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007441 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007442 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007444
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 e = p + PyUnicode_GET_SIZE(self);
7446 cased = 0;
7447 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007449
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7451 return PyBool_FromLong(0);
7452 else if (!cased && Py_UNICODE_ISUPPER(ch))
7453 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007455 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456}
7457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007458PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007461Return True if S is a titlecased string and there is at least one\n\
7462character in S, i.e. upper- and titlecase characters may only\n\
7463follow uncased characters and lowercase characters only cased ones.\n\
7464Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465
7466static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007467unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468{
7469 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7470 register const Py_UNICODE *e;
7471 int cased, previous_is_cased;
7472
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473 /* Shortcut for single character strings */
7474 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7476 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007478 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007479 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007481
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 e = p + PyUnicode_GET_SIZE(self);
7483 cased = 0;
7484 previous_is_cased = 0;
7485 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007487
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7489 if (previous_is_cased)
7490 return PyBool_FromLong(0);
7491 previous_is_cased = 1;
7492 cased = 1;
7493 }
7494 else if (Py_UNICODE_ISLOWER(ch)) {
7495 if (!previous_is_cased)
7496 return PyBool_FromLong(0);
7497 previous_is_cased = 1;
7498 cased = 1;
7499 }
7500 else
7501 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007503 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504}
7505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007506PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007509Return True if all characters in S are whitespace\n\
7510and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511
7512static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007513unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514{
7515 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7516 register const Py_UNICODE *e;
7517
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 /* Shortcut for single character strings */
7519 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 Py_UNICODE_ISSPACE(*p))
7521 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007523 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007524 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007526
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 e = p + PyUnicode_GET_SIZE(self);
7528 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 if (!Py_UNICODE_ISSPACE(*p))
7530 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007532 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533}
7534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007535PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007537\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007538Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007539and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007540
7541static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007542unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007543{
7544 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7545 register const Py_UNICODE *e;
7546
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007547 /* Shortcut for single character strings */
7548 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 Py_UNICODE_ISALPHA(*p))
7550 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007551
7552 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007553 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007555
7556 e = p + PyUnicode_GET_SIZE(self);
7557 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 if (!Py_UNICODE_ISALPHA(*p))
7559 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007560 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007561 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007562}
7563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007564PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007566\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007567Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007568and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007569
7570static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007571unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007572{
7573 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7574 register const Py_UNICODE *e;
7575
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007576 /* Shortcut for single character strings */
7577 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 Py_UNICODE_ISALNUM(*p))
7579 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007580
7581 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007582 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007584
7585 e = p + PyUnicode_GET_SIZE(self);
7586 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 if (!Py_UNICODE_ISALNUM(*p))
7588 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007589 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007590 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007591}
7592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007593PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007596Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007597False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598
7599static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007600unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601{
7602 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7603 register const Py_UNICODE *e;
7604
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 /* Shortcut for single character strings */
7606 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 Py_UNICODE_ISDECIMAL(*p))
7608 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007610 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007611 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007613
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 e = p + PyUnicode_GET_SIZE(self);
7615 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 if (!Py_UNICODE_ISDECIMAL(*p))
7617 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007619 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620}
7621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007622PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007625Return True if all characters in S are digits\n\
7626and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
7628static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007629unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630{
7631 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7632 register const Py_UNICODE *e;
7633
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634 /* Shortcut for single character strings */
7635 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 Py_UNICODE_ISDIGIT(*p))
7637 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007639 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007640 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007642
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643 e = p + PyUnicode_GET_SIZE(self);
7644 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 if (!Py_UNICODE_ISDIGIT(*p))
7646 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007648 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649}
7650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007651PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007654Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007655False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656
7657static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007658unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659{
7660 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7661 register const Py_UNICODE *e;
7662
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663 /* Shortcut for single character strings */
7664 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 Py_UNICODE_ISNUMERIC(*p))
7666 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007668 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007669 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007671
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672 e = p + PyUnicode_GET_SIZE(self);
7673 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 if (!Py_UNICODE_ISNUMERIC(*p))
7675 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007677 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678}
7679
Martin v. Löwis47383402007-08-15 07:32:56 +00007680int
7681PyUnicode_IsIdentifier(PyObject *self)
7682{
7683 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7684 register const Py_UNICODE *e;
7685
7686 /* Special case for empty strings */
7687 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007689
7690 /* PEP 3131 says that the first character must be in
7691 XID_Start and subsequent characters in XID_Continue,
7692 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007693 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007694 letters, digits, underscore). However, given the current
7695 definition of XID_Start and XID_Continue, it is sufficient
7696 to check just for these, except that _ must be allowed
7697 as starting an identifier. */
7698 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7699 return 0;
7700
7701 e = p + PyUnicode_GET_SIZE(self);
7702 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 if (!_PyUnicode_IsXidContinue(*p))
7704 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007705 }
7706 return 1;
7707}
7708
7709PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007711\n\
7712Return True if S is a valid identifier according\n\
7713to the language definition.");
7714
7715static PyObject*
7716unicode_isidentifier(PyObject *self)
7717{
7718 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7719}
7720
Georg Brandl559e5d72008-06-11 18:37:52 +00007721PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007723\n\
7724Return True if all characters in S are considered\n\
7725printable in repr() or S is empty, False otherwise.");
7726
7727static PyObject*
7728unicode_isprintable(PyObject *self)
7729{
7730 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7731 register const Py_UNICODE *e;
7732
7733 /* Shortcut for single character strings */
7734 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7735 Py_RETURN_TRUE;
7736 }
7737
7738 e = p + PyUnicode_GET_SIZE(self);
7739 for (; p < e; p++) {
7740 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7741 Py_RETURN_FALSE;
7742 }
7743 }
7744 Py_RETURN_TRUE;
7745}
7746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007747PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007748 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749\n\
7750Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007751iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
7753static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007754unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007756 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757}
7758
Martin v. Löwis18e16552006-02-15 17:27:45 +00007759static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760unicode_length(PyUnicodeObject *self)
7761{
7762 return self->length;
7763}
7764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007765PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007768Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007769done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770
7771static PyObject *
7772unicode_ljust(PyUnicodeObject *self, PyObject *args)
7773{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007774 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007775 Py_UNICODE fillchar = ' ';
7776
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007777 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 return NULL;
7779
Tim Peters7a29bd52001-09-12 03:03:31 +00007780 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781 Py_INCREF(self);
7782 return (PyObject*) self;
7783 }
7784
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007785 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786}
7787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007788PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007791Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
7793static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007794unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796 return fixup(self, fixlower);
7797}
7798
/* Strip modes, passed as the `striptype` argument of the strip helpers
   in this file.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is a PyArg_ParseTuple format: one optional object argument,
   followed by ':' and the method name used in error messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for strip mode i: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7807
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007808/* externally visible for str.strip(unicode) */
7809PyObject *
7810_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7811{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007812 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7813 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7814 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7815 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7816 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007817
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007819
Benjamin Peterson14339b62009-01-31 16:36:08 +00007820 i = 0;
7821 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7823 i++;
7824 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007825 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007826
Benjamin Peterson14339b62009-01-31 16:36:08 +00007827 j = len;
7828 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 do {
7830 j--;
7831 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7832 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007833 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007834
Benjamin Peterson14339b62009-01-31 16:36:08 +00007835 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 Py_INCREF(self);
7837 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007838 }
7839 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007841}
7842
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843
7844static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007845do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007847 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7848 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007849
Benjamin Peterson14339b62009-01-31 16:36:08 +00007850 i = 0;
7851 if (striptype != RIGHTSTRIP) {
7852 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7853 i++;
7854 }
7855 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007856
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857 j = len;
7858 if (striptype != LEFTSTRIP) {
7859 do {
7860 j--;
7861 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7862 j++;
7863 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007864
Benjamin Peterson14339b62009-01-31 16:36:08 +00007865 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7866 Py_INCREF(self);
7867 return (PyObject*)self;
7868 }
7869 else
7870 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871}
7872
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007873
7874static PyObject *
7875do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7876{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007877 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007878
Benjamin Peterson14339b62009-01-31 16:36:08 +00007879 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7880 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007881
Benjamin Peterson14339b62009-01-31 16:36:08 +00007882 if (sep != NULL && sep != Py_None) {
7883 if (PyUnicode_Check(sep))
7884 return _PyUnicode_XStrip(self, striptype, sep);
7885 else {
7886 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 "%s arg must be None or str",
7888 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007889 return NULL;
7890 }
7891 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007892
Benjamin Peterson14339b62009-01-31 16:36:08 +00007893 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007894}
7895
7896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007897PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007899\n\
7900Return a copy of the string S with leading and trailing\n\
7901whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007902If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007903
7904static PyObject *
7905unicode_strip(PyUnicodeObject *self, PyObject *args)
7906{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007907 if (PyTuple_GET_SIZE(args) == 0)
7908 return do_strip(self, BOTHSTRIP); /* Common case */
7909 else
7910 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007911}
7912
7913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007914PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007916\n\
7917Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007918If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007919
7920static PyObject *
7921unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7922{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007923 if (PyTuple_GET_SIZE(args) == 0)
7924 return do_strip(self, LEFTSTRIP); /* Common case */
7925 else
7926 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007927}
7928
7929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007930PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007932\n\
7933Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007934If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007935
7936static PyObject *
7937unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7938{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007939 if (PyTuple_GET_SIZE(args) == 0)
7940 return do_strip(self, RIGHTSTRIP); /* Common case */
7941 else
7942 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007943}
7944
7945
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007947unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948{
7949 PyUnicodeObject *u;
7950 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007951 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007952 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953
Georg Brandl222de0f2009-04-12 12:01:50 +00007954 if (len < 1) {
7955 Py_INCREF(unicode_empty);
7956 return (PyObject *)unicode_empty;
7957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958
Tim Peters7a29bd52001-09-12 03:03:31 +00007959 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 /* no repeat, return original string */
7961 Py_INCREF(str);
7962 return (PyObject*) str;
7963 }
Tim Peters8f422462000-09-09 06:13:41 +00007964
7965 /* ensure # of chars needed doesn't overflow int and # of bytes
7966 * needed doesn't overflow size_t
7967 */
7968 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007969 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007970 PyErr_SetString(PyExc_OverflowError,
7971 "repeated string is too long");
7972 return NULL;
7973 }
7974 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7975 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7976 PyErr_SetString(PyExc_OverflowError,
7977 "repeated string is too long");
7978 return NULL;
7979 }
7980 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 if (!u)
7982 return NULL;
7983
7984 p = u->str;
7985
Georg Brandl222de0f2009-04-12 12:01:50 +00007986 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007987 Py_UNICODE_FILL(p, str->str[0], len);
7988 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007989 Py_ssize_t done = str->length; /* number of characters copied this far */
7990 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007992 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007993 Py_UNICODE_COPY(p+done, p, n);
7994 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 }
7997
7998 return (PyObject*) u;
7999}
8000
8001PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 PyObject *subobj,
8003 PyObject *replobj,
8004 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005{
8006 PyObject *self;
8007 PyObject *str1;
8008 PyObject *str2;
8009 PyObject *result;
8010
8011 self = PyUnicode_FromObject(obj);
8012 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 str1 = PyUnicode_FromObject(subobj);
8015 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 Py_DECREF(self);
8017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 }
8019 str2 = PyUnicode_FromObject(replobj);
8020 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 Py_DECREF(self);
8022 Py_DECREF(str1);
8023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 }
Tim Petersced69f82003-09-16 20:30:58 +00008025 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 (PyUnicodeObject *)str1,
8027 (PyUnicodeObject *)str2,
8028 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 Py_DECREF(self);
8030 Py_DECREF(str1);
8031 Py_DECREF(str2);
8032 return result;
8033}
8034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008035PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008036 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037\n\
8038Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008039old replaced by new. If the optional argument count is\n\
8040given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041
8042static PyObject*
8043unicode_replace(PyUnicodeObject *self, PyObject *args)
8044{
8045 PyUnicodeObject *str1;
8046 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008047 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 PyObject *result;
8049
Martin v. Löwis18e16552006-02-15 17:27:45 +00008050 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 return NULL;
8052 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8053 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008056 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 Py_DECREF(str1);
8058 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060
8061 result = replace(self, str1, str2, maxcount);
8062
8063 Py_DECREF(str1);
8064 Py_DECREF(str2);
8065 return result;
8066}
8067
8068static
8069PyObject *unicode_repr(PyObject *unicode)
8070{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008071 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008072 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008073 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8074 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8075
8076 /* XXX(nnorwitz): rather than over-allocating, it would be
8077 better to choose a different scheme. Perhaps scan the
8078 first N-chars of the string and allocate based on that size.
8079 */
8080 /* Initial allocation is based on the longest-possible unichr
8081 escape.
8082
8083 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8084 unichr, so in this case it's the longest unichr escape. In
8085 narrow (UTF-16) builds this is five chars per source unichr
8086 since there are two unichrs in the surrogate pair, so in narrow
8087 (UTF-16) builds it's not the longest unichr escape.
8088
8089 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8090 so in the narrow (UTF-16) build case it's the longest unichr
8091 escape.
8092 */
8093
Walter Dörwald1ab83302007-05-18 17:15:44 +00008094 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008096#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008098#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008100#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008102 if (repr == NULL)
8103 return NULL;
8104
Walter Dörwald1ab83302007-05-18 17:15:44 +00008105 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008106
8107 /* Add quote */
8108 *p++ = (findchar(s, size, '\'') &&
8109 !findchar(s, size, '"')) ? '"' : '\'';
8110 while (size-- > 0) {
8111 Py_UNICODE ch = *s++;
8112
8113 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008114 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008115 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008116 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008117 continue;
8118 }
8119
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008121 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008122 *p++ = '\\';
8123 *p++ = 't';
8124 }
8125 else if (ch == '\n') {
8126 *p++ = '\\';
8127 *p++ = 'n';
8128 }
8129 else if (ch == '\r') {
8130 *p++ = '\\';
8131 *p++ = 'r';
8132 }
8133
8134 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008135 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008136 *p++ = '\\';
8137 *p++ = 'x';
8138 *p++ = hexdigits[(ch >> 4) & 0x000F];
8139 *p++ = hexdigits[ch & 0x000F];
8140 }
8141
Georg Brandl559e5d72008-06-11 18:37:52 +00008142 /* Copy ASCII characters as-is */
8143 else if (ch < 0x7F) {
8144 *p++ = ch;
8145 }
8146
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008148 else {
8149 Py_UCS4 ucs = ch;
8150
8151#ifndef Py_UNICODE_WIDE
8152 Py_UNICODE ch2 = 0;
8153 /* Get code point from surrogate pair */
8154 if (size > 0) {
8155 ch2 = *s;
8156 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008161 size--;
8162 }
8163 }
8164#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008166 (categories Z* and C* except ASCII space)
8167 */
8168 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8169 /* Map 8-bit characters to '\xhh' */
8170 if (ucs <= 0xff) {
8171 *p++ = '\\';
8172 *p++ = 'x';
8173 *p++ = hexdigits[(ch >> 4) & 0x000F];
8174 *p++ = hexdigits[ch & 0x000F];
8175 }
8176 /* Map 21-bit characters to '\U00xxxxxx' */
8177 else if (ucs >= 0x10000) {
8178 *p++ = '\\';
8179 *p++ = 'U';
8180 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8181 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8182 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8183 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8184 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8185 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8186 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8187 *p++ = hexdigits[ucs & 0x0000000F];
8188 }
8189 /* Map 16-bit characters to '\uxxxx' */
8190 else {
8191 *p++ = '\\';
8192 *p++ = 'u';
8193 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8194 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8195 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8196 *p++ = hexdigits[ucs & 0x000F];
8197 }
8198 }
8199 /* Copy characters as-is */
8200 else {
8201 *p++ = ch;
8202#ifndef Py_UNICODE_WIDE
8203 if (ucs >= 0x10000)
8204 *p++ = ch2;
8205#endif
8206 }
8207 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008208 }
8209 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008210 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008211
8212 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008213 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008214 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215}
8216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008217PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219\n\
8220Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008221such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222arguments start and end are interpreted as in slice notation.\n\
8223\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008224Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225
8226static PyObject *
8227unicode_rfind(PyUnicodeObject *self, PyObject *args)
8228{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008229 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008230 Py_ssize_t start;
8231 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008232 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233
Christian Heimes9cd17752007-11-18 19:35:23 +00008234 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236
Thomas Wouters477c8d52006-05-27 19:21:47 +00008237 result = stringlib_rfind_slice(
8238 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8239 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8240 start, end
8241 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242
8243 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008244
Christian Heimes217cfd12007-12-02 14:31:20 +00008245 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246}
8247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008248PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008251Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252
8253static PyObject *
8254unicode_rindex(PyUnicodeObject *self, PyObject *args)
8255{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008256 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008257 Py_ssize_t start;
8258 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008259 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260
Christian Heimes9cd17752007-11-18 19:35:23 +00008261 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263
Thomas Wouters477c8d52006-05-27 19:21:47 +00008264 result = stringlib_rfind_slice(
8265 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8266 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8267 start, end
8268 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269
8270 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008271
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 if (result < 0) {
8273 PyErr_SetString(PyExc_ValueError, "substring not found");
8274 return NULL;
8275 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008276 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277}
8278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008279PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008282Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008283done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284
8285static PyObject *
8286unicode_rjust(PyUnicodeObject *self, PyObject *args)
8287{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008288 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008289 Py_UNICODE fillchar = ' ';
8290
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008291 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 return NULL;
8293
Tim Peters7a29bd52001-09-12 03:03:31 +00008294 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 Py_INCREF(self);
8296 return (PyObject*) self;
8297 }
8298
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008299 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300}
8301
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 PyObject *sep,
8304 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305{
8306 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008307
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 s = PyUnicode_FromObject(s);
8309 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008310 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 if (sep != NULL) {
8312 sep = PyUnicode_FromObject(sep);
8313 if (sep == NULL) {
8314 Py_DECREF(s);
8315 return NULL;
8316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 }
8318
8319 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8320
8321 Py_DECREF(s);
8322 Py_XDECREF(sep);
8323 return result;
8324}
8325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008326PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328\n\
8329Return a list of the words in S, using sep as the\n\
8330delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008331splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008332whitespace string is a separator and empty strings are\n\
8333removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334
8335static PyObject*
8336unicode_split(PyUnicodeObject *self, PyObject *args)
8337{
8338 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008339 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340
Martin v. Löwis18e16552006-02-15 17:27:45 +00008341 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 return NULL;
8343
8344 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350}
8351
Thomas Wouters477c8d52006-05-27 19:21:47 +00008352PyObject *
8353PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8354{
8355 PyObject* str_obj;
8356 PyObject* sep_obj;
8357 PyObject* out;
8358
8359 str_obj = PyUnicode_FromObject(str_in);
8360 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008362 sep_obj = PyUnicode_FromObject(sep_in);
8363 if (!sep_obj) {
8364 Py_DECREF(str_obj);
8365 return NULL;
8366 }
8367
8368 out = stringlib_partition(
8369 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8370 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8371 );
8372
8373 Py_DECREF(sep_obj);
8374 Py_DECREF(str_obj);
8375
8376 return out;
8377}
8378
8379
8380PyObject *
8381PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8382{
8383 PyObject* str_obj;
8384 PyObject* sep_obj;
8385 PyObject* out;
8386
8387 str_obj = PyUnicode_FromObject(str_in);
8388 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008390 sep_obj = PyUnicode_FromObject(sep_in);
8391 if (!sep_obj) {
8392 Py_DECREF(str_obj);
8393 return NULL;
8394 }
8395
8396 out = stringlib_rpartition(
8397 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8398 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8399 );
8400
8401 Py_DECREF(sep_obj);
8402 Py_DECREF(str_obj);
8403
8404 return out;
8405}
8406
8407PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008409\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008410Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008411the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008412found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008413
8414static PyObject*
8415unicode_partition(PyUnicodeObject *self, PyObject *separator)
8416{
8417 return PyUnicode_Partition((PyObject *)self, separator);
8418}
8419
8420PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008421 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008422\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008423Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008424the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008425separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008426
8427static PyObject*
8428unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8429{
8430 return PyUnicode_RPartition((PyObject *)self, separator);
8431}
8432
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008433PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 PyObject *sep,
8435 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008436{
8437 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008439 s = PyUnicode_FromObject(s);
8440 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 if (sep != NULL) {
8443 sep = PyUnicode_FromObject(sep);
8444 if (sep == NULL) {
8445 Py_DECREF(s);
8446 return NULL;
8447 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008448 }
8449
8450 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8451
8452 Py_DECREF(s);
8453 Py_XDECREF(sep);
8454 return result;
8455}
8456
8457PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008459\n\
8460Return a list of the words in S, using sep as the\n\
8461delimiter string, starting at the end of the string and\n\
8462working to the front. If maxsplit is given, at most maxsplit\n\
8463splits are done. If sep is not specified, any whitespace string\n\
8464is a separator.");
8465
8466static PyObject*
8467unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8468{
8469 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008470 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008471
Martin v. Löwis18e16552006-02-15 17:27:45 +00008472 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008473 return NULL;
8474
8475 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008477 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008479 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008481}
8482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008483PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485\n\
8486Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008487Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008488is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489
8490static PyObject*
8491unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8492{
Guido van Rossum86662912000-04-11 15:38:46 +00008493 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494
Guido van Rossum86662912000-04-11 15:38:46 +00008495 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496 return NULL;
8497
Guido van Rossum86662912000-04-11 15:38:46 +00008498 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499}
8500
8501static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008502PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503{
Walter Dörwald346737f2007-05-31 10:44:43 +00008504 if (PyUnicode_CheckExact(self)) {
8505 Py_INCREF(self);
8506 return self;
8507 } else
8508 /* Subtype -- return genuine unicode string with the same value. */
8509 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8510 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511}
8512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008513PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515\n\
8516Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008517and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518
8519static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008520unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 return fixup(self, fixswapcase);
8523}
8524
Georg Brandlceee0772007-11-27 23:48:05 +00008525PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008527\n\
8528Return a translation table usable for str.translate().\n\
8529If there is only one argument, it must be a dictionary mapping Unicode\n\
8530ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008531Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008532If there are two arguments, they must be strings of equal length, and\n\
8533in the resulting dictionary, each character in x will be mapped to the\n\
8534character at the same position in y. If there is a third argument, it\n\
8535must be a string, whose characters will be mapped to None in the result.");
8536
8537static PyObject*
8538unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8539{
8540 PyObject *x, *y = NULL, *z = NULL;
8541 PyObject *new = NULL, *key, *value;
8542 Py_ssize_t i = 0;
8543 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544
Georg Brandlceee0772007-11-27 23:48:05 +00008545 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8546 return NULL;
8547 new = PyDict_New();
8548 if (!new)
8549 return NULL;
8550 if (y != NULL) {
8551 /* x must be a string too, of equal length */
8552 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8553 if (!PyUnicode_Check(x)) {
8554 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8555 "be a string if there is a second argument");
8556 goto err;
8557 }
8558 if (PyUnicode_GET_SIZE(x) != ylen) {
8559 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8560 "arguments must have equal length");
8561 goto err;
8562 }
8563 /* create entries for translating chars in x to those in y */
8564 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008565 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8566 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008567 if (!key || !value)
8568 goto err;
8569 res = PyDict_SetItem(new, key, value);
8570 Py_DECREF(key);
8571 Py_DECREF(value);
8572 if (res < 0)
8573 goto err;
8574 }
8575 /* create entries for deleting chars in z */
8576 if (z != NULL) {
8577 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008578 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008579 if (!key)
8580 goto err;
8581 res = PyDict_SetItem(new, key, Py_None);
8582 Py_DECREF(key);
8583 if (res < 0)
8584 goto err;
8585 }
8586 }
8587 } else {
8588 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008589 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008590 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8591 "to maketrans it must be a dict");
8592 goto err;
8593 }
8594 /* copy entries into the new dict, converting string keys to int keys */
8595 while (PyDict_Next(x, &i, &key, &value)) {
8596 if (PyUnicode_Check(key)) {
8597 /* convert string keys to integer keys */
8598 PyObject *newkey;
8599 if (PyUnicode_GET_SIZE(key) != 1) {
8600 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8601 "table must be of length 1");
8602 goto err;
8603 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008604 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008605 if (!newkey)
8606 goto err;
8607 res = PyDict_SetItem(new, newkey, value);
8608 Py_DECREF(newkey);
8609 if (res < 0)
8610 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008611 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008612 /* just keep integer keys */
8613 if (PyDict_SetItem(new, key, value) < 0)
8614 goto err;
8615 } else {
8616 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8617 "be strings or integers");
8618 goto err;
8619 }
8620 }
8621 }
8622 return new;
8623 err:
8624 Py_DECREF(new);
8625 return NULL;
8626}
8627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008628PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630\n\
8631Return a copy of the string S, where all characters have been mapped\n\
8632through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008633Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008634Unmapped characters are left untouched. Characters mapped to None\n\
8635are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636
8637static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008638unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
Georg Brandlceee0772007-11-27 23:48:05 +00008640 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641}
8642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008643PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008646Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647
8648static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008649unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 return fixup(self, fixupper);
8652}
8653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008654PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008657Pad a numeric string S with zeros on the left, to fill a field\n\
8658of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659
8660static PyObject *
8661unicode_zfill(PyUnicodeObject *self, PyObject *args)
8662{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008663 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664 PyUnicodeObject *u;
8665
Martin v. Löwis18e16552006-02-15 17:27:45 +00008666 Py_ssize_t width;
8667 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 return NULL;
8669
8670 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008671 if (PyUnicode_CheckExact(self)) {
8672 Py_INCREF(self);
8673 return (PyObject*) self;
8674 }
8675 else
8676 return PyUnicode_FromUnicode(
8677 PyUnicode_AS_UNICODE(self),
8678 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 }
8681
8682 fill = width - self->length;
8683
8684 u = pad(self, fill, 0, '0');
8685
Walter Dörwald068325e2002-04-15 13:36:47 +00008686 if (u == NULL)
8687 return NULL;
8688
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 if (u->str[fill] == '+' || u->str[fill] == '-') {
8690 /* move sign to beginning of string */
8691 u->str[0] = u->str[fill];
8692 u->str[fill] = '0';
8693 }
8694
8695 return (PyObject*) u;
8696}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697
#if 0
/* Debugging aid (compiled out): reports the current value of the
   file-level `numfree` counter as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008706PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008709Return True if S starts with the specified prefix, False otherwise.\n\
8710With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008711With optional end, stop comparing S at that position.\n\
8712prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713
8714static PyObject *
8715unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008718 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008720 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008721 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008722 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008724 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8726 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008727 if (PyTuple_Check(subobj)) {
8728 Py_ssize_t i;
8729 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8730 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008732 if (substring == NULL)
8733 return NULL;
8734 result = tailmatch(self, substring, start, end, -1);
8735 Py_DECREF(substring);
8736 if (result) {
8737 Py_RETURN_TRUE;
8738 }
8739 }
8740 /* nothing matched */
8741 Py_RETURN_FALSE;
8742 }
8743 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008746 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008748 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749}
8750
8751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008752PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008755Return True if S ends with the specified suffix, False otherwise.\n\
8756With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008757With optional end, stop comparing S at that position.\n\
8758suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759
8760static PyObject *
8761unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008764 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008766 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008767 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008768 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008770 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8772 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008773 if (PyTuple_Check(subobj)) {
8774 Py_ssize_t i;
8775 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8776 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008778 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008780 result = tailmatch(self, substring, start, end, +1);
8781 Py_DECREF(substring);
8782 if (result) {
8783 Py_RETURN_TRUE;
8784 }
8785 }
8786 Py_RETURN_FALSE;
8787 }
8788 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008792 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008794 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795}
8796
Eric Smith8c663262007-08-25 02:26:07 +00008797#include "stringlib/string_format.h"
8798
8799PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008801\n\
8802");
8803
Eric Smith4a7d76d2008-05-30 18:10:19 +00008804static PyObject *
8805unicode__format__(PyObject* self, PyObject* args)
8806{
8807 PyObject *format_spec;
8808
8809 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8810 return NULL;
8811
8812 return _PyUnicode_FormatAdvanced(self,
8813 PyUnicode_AS_UNICODE(format_spec),
8814 PyUnicode_GET_SIZE(format_spec));
8815}
8816
Eric Smith8c663262007-08-25 02:26:07 +00008817PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008819\n\
8820");
8821
8822static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008823unicode__sizeof__(PyUnicodeObject *v)
8824{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008825 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8826 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008827}
8828
8829PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008831
8832static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008833unicode_getnewargs(PyUnicodeObject *v)
8834{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008835 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008836}
8837
8838
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839static PyMethodDef unicode_methods[] = {
8840
8841 /* Order is according to common usage: often used methods should
8842 appear first, since lookup is done sequentially. */
8843
Benjamin Peterson308d6372009-09-18 21:42:35 +00008844 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008845 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8846 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008847 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008848 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8849 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8850 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8851 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8852 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8853 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8854 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008855 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008856 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8857 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8858 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008859 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008860 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8861 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8862 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008863 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008864 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008865 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008866 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008867 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8868 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8869 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8870 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8871 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8872 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8873 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8874 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8875 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8876 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8877 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8878 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8879 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8880 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008881 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008882 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008883 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008884 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008885 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008886 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8887 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008888 {"maketrans", (PyCFunction) unicode_maketrans,
8889 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008890 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008891#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008892 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893#endif
8894
8895#if 0
8896 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008897 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898#endif
8899
Benjamin Peterson14339b62009-01-31 16:36:08 +00008900 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901 {NULL, NULL}
8902};
8903
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008904static PyObject *
8905unicode_mod(PyObject *v, PyObject *w)
8906{
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 if (!PyUnicode_Check(v)) {
8908 Py_INCREF(Py_NotImplemented);
8909 return Py_NotImplemented;
8910 }
8911 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008912}
8913
8914static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008915 0, /*nb_add*/
8916 0, /*nb_subtract*/
8917 0, /*nb_multiply*/
8918 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008919};
8920
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008922 (lenfunc) unicode_length, /* sq_length */
8923 PyUnicode_Concat, /* sq_concat */
8924 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8925 (ssizeargfunc) unicode_getitem, /* sq_item */
8926 0, /* sq_slice */
8927 0, /* sq_ass_item */
8928 0, /* sq_ass_slice */
8929 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930};
8931
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008932static PyObject*
8933unicode_subscript(PyUnicodeObject* self, PyObject* item)
8934{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008935 if (PyIndex_Check(item)) {
8936 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008937 if (i == -1 && PyErr_Occurred())
8938 return NULL;
8939 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008940 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008941 return unicode_getitem(self, i);
8942 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008943 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008944 Py_UNICODE* source_buf;
8945 Py_UNICODE* result_buf;
8946 PyObject* result;
8947
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008948 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008950 return NULL;
8951 }
8952
8953 if (slicelength <= 0) {
8954 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008955 } else if (start == 0 && step == 1 && slicelength == self->length &&
8956 PyUnicode_CheckExact(self)) {
8957 Py_INCREF(self);
8958 return (PyObject *)self;
8959 } else if (step == 1) {
8960 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008961 } else {
8962 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008963 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8964 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008965
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 if (result_buf == NULL)
8967 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008968
8969 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8970 result_buf[i] = source_buf[cur];
8971 }
Tim Petersced69f82003-09-16 20:30:58 +00008972
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008973 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008974 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008975 return result;
8976 }
8977 } else {
8978 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8979 return NULL;
8980 }
8981}
8982
8983static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008984 (lenfunc)unicode_length, /* mp_length */
8985 (binaryfunc)unicode_subscript, /* mp_subscript */
8986 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008987};
8988
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990/* Helpers for PyUnicode_Format() */
8991
8992static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008993getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008995 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 (*p_argidx)++;
8998 if (arglen < 0)
8999 return args;
9000 else
9001 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 }
9003 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 return NULL;
9006}
9007
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009008/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009010static PyObject *
9011formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009013 char *p;
9014 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009016
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 x = PyFloat_AsDouble(v);
9018 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009019 return NULL;
9020
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009023
Eric Smith0923d1d2009-04-16 20:16:10 +00009024 p = PyOS_double_to_string(x, type, prec,
9025 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009026 if (p == NULL)
9027 return NULL;
9028 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009029 PyMem_Free(p);
9030 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031}
9032
Tim Peters38fd5b62000-09-21 05:43:11 +00009033static PyObject*
9034formatlong(PyObject *val, int flags, int prec, int type)
9035{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009036 char *buf;
9037 int len;
9038 PyObject *str; /* temporary string object. */
9039 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009040
Benjamin Peterson14339b62009-01-31 16:36:08 +00009041 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9042 if (!str)
9043 return NULL;
9044 result = PyUnicode_FromStringAndSize(buf, len);
9045 Py_DECREF(str);
9046 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009047}
9048
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049static int
9050formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009051 size_t buflen,
9052 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009054 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009055 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 if (PyUnicode_GET_SIZE(v) == 1) {
9057 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9058 buf[1] = '\0';
9059 return 1;
9060 }
9061#ifndef Py_UNICODE_WIDE
9062 if (PyUnicode_GET_SIZE(v) == 2) {
9063 /* Decode a valid surrogate pair */
9064 int c0 = PyUnicode_AS_UNICODE(v)[0];
9065 int c1 = PyUnicode_AS_UNICODE(v)[1];
9066 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9067 0xDC00 <= c1 && c1 <= 0xDFFF) {
9068 buf[0] = c0;
9069 buf[1] = c1;
9070 buf[2] = '\0';
9071 return 2;
9072 }
9073 }
9074#endif
9075 goto onError;
9076 }
9077 else {
9078 /* Integer input truncated to a character */
9079 long x;
9080 x = PyLong_AsLong(v);
9081 if (x == -1 && PyErr_Occurred())
9082 goto onError;
9083
9084 if (x < 0 || x > 0x10ffff) {
9085 PyErr_SetString(PyExc_OverflowError,
9086 "%c arg not in range(0x110000)");
9087 return -1;
9088 }
9089
9090#ifndef Py_UNICODE_WIDE
9091 if (x > 0xffff) {
9092 x -= 0x10000;
9093 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9094 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9095 return 2;
9096 }
9097#endif
9098 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009099 buf[1] = '\0';
9100 return 1;
9101 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009102
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009104 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009106 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107}
9108
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. after sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009113
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116{
9117 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009118 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119 int args_owned = 0;
9120 PyUnicodeObject *result = NULL;
9121 PyObject *dict = NULL;
9122 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009123
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 PyErr_BadInternalCall();
9126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127 }
9128 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009129 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131 fmt = PyUnicode_AS_UNICODE(uformat);
9132 fmtcnt = PyUnicode_GET_SIZE(uformat);
9133
9134 reslen = rescnt = fmtcnt + 100;
9135 result = _PyUnicode_New(reslen);
9136 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138 res = PyUnicode_AS_UNICODE(result);
9139
9140 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 arglen = PyTuple_Size(args);
9142 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143 }
9144 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 arglen = -1;
9146 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009148 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009149 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151
9152 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 if (*fmt != '%') {
9154 if (--rescnt < 0) {
9155 rescnt = fmtcnt + 100;
9156 reslen += rescnt;
9157 if (_PyUnicode_Resize(&result, reslen) < 0)
9158 goto onError;
9159 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9160 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009161 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009162 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009163 }
9164 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 /* Got a format specifier */
9166 int flags = 0;
9167 Py_ssize_t width = -1;
9168 int prec = -1;
9169 Py_UNICODE c = '\0';
9170 Py_UNICODE fill;
9171 int isnumok;
9172 PyObject *v = NULL;
9173 PyObject *temp = NULL;
9174 Py_UNICODE *pbuf;
9175 Py_UNICODE sign;
9176 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009177 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 fmt++;
9180 if (*fmt == '(') {
9181 Py_UNICODE *keystart;
9182 Py_ssize_t keylen;
9183 PyObject *key;
9184 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009185
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 if (dict == NULL) {
9187 PyErr_SetString(PyExc_TypeError,
9188 "format requires a mapping");
9189 goto onError;
9190 }
9191 ++fmt;
9192 --fmtcnt;
9193 keystart = fmt;
9194 /* Skip over balanced parentheses */
9195 while (pcount > 0 && --fmtcnt >= 0) {
9196 if (*fmt == ')')
9197 --pcount;
9198 else if (*fmt == '(')
9199 ++pcount;
9200 fmt++;
9201 }
9202 keylen = fmt - keystart - 1;
9203 if (fmtcnt < 0 || pcount > 0) {
9204 PyErr_SetString(PyExc_ValueError,
9205 "incomplete format key");
9206 goto onError;
9207 }
9208#if 0
9209 /* keys are converted to strings using UTF-8 and
9210 then looked up since Python uses strings to hold
9211 variables names etc. in its namespaces and we
9212 wouldn't want to break common idioms. */
9213 key = PyUnicode_EncodeUTF8(keystart,
9214 keylen,
9215 NULL);
9216#else
9217 key = PyUnicode_FromUnicode(keystart, keylen);
9218#endif
9219 if (key == NULL)
9220 goto onError;
9221 if (args_owned) {
9222 Py_DECREF(args);
9223 args_owned = 0;
9224 }
9225 args = PyObject_GetItem(dict, key);
9226 Py_DECREF(key);
9227 if (args == NULL) {
9228 goto onError;
9229 }
9230 args_owned = 1;
9231 arglen = -1;
9232 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009233 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009234 while (--fmtcnt >= 0) {
9235 switch (c = *fmt++) {
9236 case '-': flags |= F_LJUST; continue;
9237 case '+': flags |= F_SIGN; continue;
9238 case ' ': flags |= F_BLANK; continue;
9239 case '#': flags |= F_ALT; continue;
9240 case '0': flags |= F_ZERO; continue;
9241 }
9242 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009243 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 if (c == '*') {
9245 v = getnextarg(args, arglen, &argidx);
9246 if (v == NULL)
9247 goto onError;
9248 if (!PyLong_Check(v)) {
9249 PyErr_SetString(PyExc_TypeError,
9250 "* wants int");
9251 goto onError;
9252 }
9253 width = PyLong_AsLong(v);
9254 if (width == -1 && PyErr_Occurred())
9255 goto onError;
9256 if (width < 0) {
9257 flags |= F_LJUST;
9258 width = -width;
9259 }
9260 if (--fmtcnt >= 0)
9261 c = *fmt++;
9262 }
9263 else if (c >= '0' && c <= '9') {
9264 width = c - '0';
9265 while (--fmtcnt >= 0) {
9266 c = *fmt++;
9267 if (c < '0' || c > '9')
9268 break;
9269 if ((width*10) / 10 != width) {
9270 PyErr_SetString(PyExc_ValueError,
9271 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009272 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009273 }
9274 width = width*10 + (c - '0');
9275 }
9276 }
9277 if (c == '.') {
9278 prec = 0;
9279 if (--fmtcnt >= 0)
9280 c = *fmt++;
9281 if (c == '*') {
9282 v = getnextarg(args, arglen, &argidx);
9283 if (v == NULL)
9284 goto onError;
9285 if (!PyLong_Check(v)) {
9286 PyErr_SetString(PyExc_TypeError,
9287 "* wants int");
9288 goto onError;
9289 }
9290 prec = PyLong_AsLong(v);
9291 if (prec == -1 && PyErr_Occurred())
9292 goto onError;
9293 if (prec < 0)
9294 prec = 0;
9295 if (--fmtcnt >= 0)
9296 c = *fmt++;
9297 }
9298 else if (c >= '0' && c <= '9') {
9299 prec = c - '0';
9300 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009301 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009302 if (c < '0' || c > '9')
9303 break;
9304 if ((prec*10) / 10 != prec) {
9305 PyErr_SetString(PyExc_ValueError,
9306 "prec too big");
9307 goto onError;
9308 }
9309 prec = prec*10 + (c - '0');
9310 }
9311 }
9312 } /* prec */
9313 if (fmtcnt >= 0) {
9314 if (c == 'h' || c == 'l' || c == 'L') {
9315 if (--fmtcnt >= 0)
9316 c = *fmt++;
9317 }
9318 }
9319 if (fmtcnt < 0) {
9320 PyErr_SetString(PyExc_ValueError,
9321 "incomplete format");
9322 goto onError;
9323 }
9324 if (c != '%') {
9325 v = getnextarg(args, arglen, &argidx);
9326 if (v == NULL)
9327 goto onError;
9328 }
9329 sign = 0;
9330 fill = ' ';
9331 switch (c) {
9332
9333 case '%':
9334 pbuf = formatbuf;
9335 /* presume that buffer length is at least 1 */
9336 pbuf[0] = '%';
9337 len = 1;
9338 break;
9339
9340 case 's':
9341 case 'r':
9342 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009343 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 temp = v;
9345 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009346 }
9347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 if (c == 's')
9349 temp = PyObject_Str(v);
9350 else if (c == 'r')
9351 temp = PyObject_Repr(v);
9352 else
9353 temp = PyObject_ASCII(v);
9354 if (temp == NULL)
9355 goto onError;
9356 if (PyUnicode_Check(temp))
9357 /* nothing to do */;
9358 else {
9359 Py_DECREF(temp);
9360 PyErr_SetString(PyExc_TypeError,
9361 "%s argument has non-string str()");
9362 goto onError;
9363 }
9364 }
9365 pbuf = PyUnicode_AS_UNICODE(temp);
9366 len = PyUnicode_GET_SIZE(temp);
9367 if (prec >= 0 && len > prec)
9368 len = prec;
9369 break;
9370
9371 case 'i':
9372 case 'd':
9373 case 'u':
9374 case 'o':
9375 case 'x':
9376 case 'X':
9377 if (c == 'i')
9378 c = 'd';
9379 isnumok = 0;
9380 if (PyNumber_Check(v)) {
9381 PyObject *iobj=NULL;
9382
9383 if (PyLong_Check(v)) {
9384 iobj = v;
9385 Py_INCREF(iobj);
9386 }
9387 else {
9388 iobj = PyNumber_Long(v);
9389 }
9390 if (iobj!=NULL) {
9391 if (PyLong_Check(iobj)) {
9392 isnumok = 1;
9393 temp = formatlong(iobj, flags, prec, c);
9394 Py_DECREF(iobj);
9395 if (!temp)
9396 goto onError;
9397 pbuf = PyUnicode_AS_UNICODE(temp);
9398 len = PyUnicode_GET_SIZE(temp);
9399 sign = 1;
9400 }
9401 else {
9402 Py_DECREF(iobj);
9403 }
9404 }
9405 }
9406 if (!isnumok) {
9407 PyErr_Format(PyExc_TypeError,
9408 "%%%c format: a number is required, "
9409 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9410 goto onError;
9411 }
9412 if (flags & F_ZERO)
9413 fill = '0';
9414 break;
9415
9416 case 'e':
9417 case 'E':
9418 case 'f':
9419 case 'F':
9420 case 'g':
9421 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009422 temp = formatfloat(v, flags, prec, c);
9423 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009424 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009425 pbuf = PyUnicode_AS_UNICODE(temp);
9426 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 sign = 1;
9428 if (flags & F_ZERO)
9429 fill = '0';
9430 break;
9431
9432 case 'c':
9433 pbuf = formatbuf;
9434 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9435 if (len < 0)
9436 goto onError;
9437 break;
9438
9439 default:
9440 PyErr_Format(PyExc_ValueError,
9441 "unsupported format character '%c' (0x%x) "
9442 "at index %zd",
9443 (31<=c && c<=126) ? (char)c : '?',
9444 (int)c,
9445 (Py_ssize_t)(fmt - 1 -
9446 PyUnicode_AS_UNICODE(uformat)));
9447 goto onError;
9448 }
9449 if (sign) {
9450 if (*pbuf == '-' || *pbuf == '+') {
9451 sign = *pbuf++;
9452 len--;
9453 }
9454 else if (flags & F_SIGN)
9455 sign = '+';
9456 else if (flags & F_BLANK)
9457 sign = ' ';
9458 else
9459 sign = 0;
9460 }
9461 if (width < len)
9462 width = len;
9463 if (rescnt - (sign != 0) < width) {
9464 reslen -= rescnt;
9465 rescnt = width + fmtcnt + 100;
9466 reslen += rescnt;
9467 if (reslen < 0) {
9468 Py_XDECREF(temp);
9469 PyErr_NoMemory();
9470 goto onError;
9471 }
9472 if (_PyUnicode_Resize(&result, reslen) < 0) {
9473 Py_XDECREF(temp);
9474 goto onError;
9475 }
9476 res = PyUnicode_AS_UNICODE(result)
9477 + reslen - rescnt;
9478 }
9479 if (sign) {
9480 if (fill != ' ')
9481 *res++ = sign;
9482 rescnt--;
9483 if (width > len)
9484 width--;
9485 }
9486 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9487 assert(pbuf[0] == '0');
9488 assert(pbuf[1] == c);
9489 if (fill != ' ') {
9490 *res++ = *pbuf++;
9491 *res++ = *pbuf++;
9492 }
9493 rescnt -= 2;
9494 width -= 2;
9495 if (width < 0)
9496 width = 0;
9497 len -= 2;
9498 }
9499 if (width > len && !(flags & F_LJUST)) {
9500 do {
9501 --rescnt;
9502 *res++ = fill;
9503 } while (--width > len);
9504 }
9505 if (fill == ' ') {
9506 if (sign)
9507 *res++ = sign;
9508 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9509 assert(pbuf[0] == '0');
9510 assert(pbuf[1] == c);
9511 *res++ = *pbuf++;
9512 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009513 }
9514 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009515 Py_UNICODE_COPY(res, pbuf, len);
9516 res += len;
9517 rescnt -= len;
9518 while (--width >= len) {
9519 --rescnt;
9520 *res++ = ' ';
9521 }
9522 if (dict && (argidx < arglen) && c != '%') {
9523 PyErr_SetString(PyExc_TypeError,
9524 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009525 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009526 goto onError;
9527 }
9528 Py_XDECREF(temp);
9529 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 } /* until end */
9531 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 PyErr_SetString(PyExc_TypeError,
9533 "not all arguments converted during string formatting");
9534 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 }
9536
Thomas Woutersa96affe2006-03-12 00:29:36 +00009537 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541 }
9542 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 return (PyObject *)result;
9544
Benjamin Peterson29060642009-01-31 22:14:21 +00009545 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546 Py_XDECREF(result);
9547 Py_DECREF(uformat);
9548 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550 }
9551 return NULL;
9552}
9553
Jeremy Hylton938ace62002-07-17 16:30:39 +00009554static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009555unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9556
Tim Peters6d6c1a32001-08-02 04:15:00 +00009557static PyObject *
9558unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9559{
Benjamin Peterson29060642009-01-31 22:14:21 +00009560 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009561 static char *kwlist[] = {"object", "encoding", "errors", 0};
9562 char *encoding = NULL;
9563 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009564
Benjamin Peterson14339b62009-01-31 16:36:08 +00009565 if (type != &PyUnicode_Type)
9566 return unicode_subtype_new(type, args, kwds);
9567 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009568 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009569 return NULL;
9570 if (x == NULL)
9571 return (PyObject *)_PyUnicode_New(0);
9572 if (encoding == NULL && errors == NULL)
9573 return PyObject_Str(x);
9574 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009575 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009576}
9577
Guido van Rossume023fe02001-08-30 03:12:59 +00009578static PyObject *
9579unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9580{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009581 PyUnicodeObject *tmp, *pnew;
9582 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009583
Benjamin Peterson14339b62009-01-31 16:36:08 +00009584 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9585 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9586 if (tmp == NULL)
9587 return NULL;
9588 assert(PyUnicode_Check(tmp));
9589 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9590 if (pnew == NULL) {
9591 Py_DECREF(tmp);
9592 return NULL;
9593 }
9594 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9595 if (pnew->str == NULL) {
9596 _Py_ForgetReference((PyObject *)pnew);
9597 PyObject_Del(pnew);
9598 Py_DECREF(tmp);
9599 return PyErr_NoMemory();
9600 }
9601 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9602 pnew->length = n;
9603 pnew->hash = tmp->hash;
9604 Py_DECREF(tmp);
9605 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009606}
9607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009608PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009609 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009610\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009611Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009612encoding defaults to the current default string encoding.\n\
9613errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009614
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009615static PyObject *unicode_iter(PyObject *seq);
9616
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009618 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009619 "str", /* tp_name */
9620 sizeof(PyUnicodeObject), /* tp_size */
9621 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009623 (destructor)unicode_dealloc, /* tp_dealloc */
9624 0, /* tp_print */
9625 0, /* tp_getattr */
9626 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009627 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009628 unicode_repr, /* tp_repr */
9629 &unicode_as_number, /* tp_as_number */
9630 &unicode_as_sequence, /* tp_as_sequence */
9631 &unicode_as_mapping, /* tp_as_mapping */
9632 (hashfunc) unicode_hash, /* tp_hash*/
9633 0, /* tp_call*/
9634 (reprfunc) unicode_str, /* tp_str */
9635 PyObject_GenericGetAttr, /* tp_getattro */
9636 0, /* tp_setattro */
9637 0, /* tp_as_buffer */
9638 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009639 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009640 unicode_doc, /* tp_doc */
9641 0, /* tp_traverse */
9642 0, /* tp_clear */
9643 PyUnicode_RichCompare, /* tp_richcompare */
9644 0, /* tp_weaklistoffset */
9645 unicode_iter, /* tp_iter */
9646 0, /* tp_iternext */
9647 unicode_methods, /* tp_methods */
9648 0, /* tp_members */
9649 0, /* tp_getset */
9650 &PyBaseObject_Type, /* tp_base */
9651 0, /* tp_dict */
9652 0, /* tp_descr_get */
9653 0, /* tp_descr_set */
9654 0, /* tp_dictoffset */
9655 0, /* tp_init */
9656 0, /* tp_alloc */
9657 unicode_new, /* tp_new */
9658 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659};
9660
9661/* Initialize the Unicode implementation */
9662
Thomas Wouters78890102000-07-22 19:25:51 +00009663void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009665 int i;
9666
Thomas Wouters477c8d52006-05-27 19:21:47 +00009667 /* XXX - move this array to unicodectype.c ? */
9668 Py_UNICODE linebreak[] = {
9669 0x000A, /* LINE FEED */
9670 0x000D, /* CARRIAGE RETURN */
9671 0x001C, /* FILE SEPARATOR */
9672 0x001D, /* GROUP SEPARATOR */
9673 0x001E, /* RECORD SEPARATOR */
9674 0x0085, /* NEXT LINE */
9675 0x2028, /* LINE SEPARATOR */
9676 0x2029, /* PARAGRAPH SEPARATOR */
9677 };
9678
Fred Drakee4315f52000-05-09 19:53:39 +00009679 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009680 free_list = NULL;
9681 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009683 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009684 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009685
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009686 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009687 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009688 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009689 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009690
9691 /* initialize the linebreak bloom filter */
9692 bloom_linebreak = make_bloom_mask(
9693 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9694 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009695
9696 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697}
9698
9699/* Finalize the Unicode implementation */
9700
Christian Heimesa156e092008-02-16 07:38:31 +00009701int
9702PyUnicode_ClearFreeList(void)
9703{
9704 int freelist_size = numfree;
9705 PyUnicodeObject *u;
9706
9707 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009708 PyUnicodeObject *v = u;
9709 u = *(PyUnicodeObject **)u;
9710 if (v->str)
9711 PyObject_DEL(v->str);
9712 Py_XDECREF(v->defenc);
9713 PyObject_Del(v);
9714 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009715 }
9716 free_list = NULL;
9717 assert(numfree == 0);
9718 return freelist_size;
9719}
9720
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721void
Thomas Wouters78890102000-07-22 19:25:51 +00009722_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009724 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009726 Py_XDECREF(unicode_empty);
9727 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009728
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009729 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009730 if (unicode_latin1[i]) {
9731 Py_DECREF(unicode_latin1[i]);
9732 unicode_latin1[i] = NULL;
9733 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009734 }
Christian Heimesa156e092008-02-16 07:38:31 +00009735 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009737
Walter Dörwald16807132007-05-25 13:52:07 +00009738void
9739PyUnicode_InternInPlace(PyObject **p)
9740{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009741 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9742 PyObject *t;
9743 if (s == NULL || !PyUnicode_Check(s))
9744 Py_FatalError(
9745 "PyUnicode_InternInPlace: unicode strings only please!");
9746 /* If it's a subclass, we don't really know what putting
9747 it in the interned dict might do. */
9748 if (!PyUnicode_CheckExact(s))
9749 return;
9750 if (PyUnicode_CHECK_INTERNED(s))
9751 return;
9752 if (interned == NULL) {
9753 interned = PyDict_New();
9754 if (interned == NULL) {
9755 PyErr_Clear(); /* Don't leave an exception */
9756 return;
9757 }
9758 }
9759 /* It might be that the GetItem call fails even
9760 though the key is present in the dictionary,
9761 namely when this happens during a stack overflow. */
9762 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009764 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009765
Benjamin Peterson29060642009-01-31 22:14:21 +00009766 if (t) {
9767 Py_INCREF(t);
9768 Py_DECREF(*p);
9769 *p = t;
9770 return;
9771 }
Walter Dörwald16807132007-05-25 13:52:07 +00009772
Benjamin Peterson14339b62009-01-31 16:36:08 +00009773 PyThreadState_GET()->recursion_critical = 1;
9774 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9775 PyErr_Clear();
9776 PyThreadState_GET()->recursion_critical = 0;
9777 return;
9778 }
9779 PyThreadState_GET()->recursion_critical = 0;
9780 /* The two references in interned are not counted by refcnt.
9781 The deallocator will take care of this */
9782 Py_REFCNT(s) -= 2;
9783 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009784}
9785
9786void
9787PyUnicode_InternImmortal(PyObject **p)
9788{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009789 PyUnicode_InternInPlace(p);
9790 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9791 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9792 Py_INCREF(*p);
9793 }
Walter Dörwald16807132007-05-25 13:52:07 +00009794}
9795
9796PyObject *
9797PyUnicode_InternFromString(const char *cp)
9798{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009799 PyObject *s = PyUnicode_FromString(cp);
9800 if (s == NULL)
9801 return NULL;
9802 PyUnicode_InternInPlace(&s);
9803 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009804}
9805
9806void _Py_ReleaseInternedUnicodeStrings(void)
9807{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009808 PyObject *keys;
9809 PyUnicodeObject *s;
9810 Py_ssize_t i, n;
9811 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009812
Benjamin Peterson14339b62009-01-31 16:36:08 +00009813 if (interned == NULL || !PyDict_Check(interned))
9814 return;
9815 keys = PyDict_Keys(interned);
9816 if (keys == NULL || !PyList_Check(keys)) {
9817 PyErr_Clear();
9818 return;
9819 }
Walter Dörwald16807132007-05-25 13:52:07 +00009820
Benjamin Peterson14339b62009-01-31 16:36:08 +00009821 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9822 detector, interned unicode strings are not forcibly deallocated;
9823 rather, we give them their stolen references back, and then clear
9824 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009825
Benjamin Peterson14339b62009-01-31 16:36:08 +00009826 n = PyList_GET_SIZE(keys);
9827 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009828 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009829 for (i = 0; i < n; i++) {
9830 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9831 switch (s->state) {
9832 case SSTATE_NOT_INTERNED:
9833 /* XXX Shouldn't happen */
9834 break;
9835 case SSTATE_INTERNED_IMMORTAL:
9836 Py_REFCNT(s) += 1;
9837 immortal_size += s->length;
9838 break;
9839 case SSTATE_INTERNED_MORTAL:
9840 Py_REFCNT(s) += 2;
9841 mortal_size += s->length;
9842 break;
9843 default:
9844 Py_FatalError("Inconsistent interned string state.");
9845 }
9846 s->state = SSTATE_NOT_INTERNED;
9847 }
9848 fprintf(stderr, "total size of all interned strings: "
9849 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9850 "mortal/immortal\n", mortal_size, immortal_size);
9851 Py_DECREF(keys);
9852 PyDict_Clear(interned);
9853 Py_DECREF(interned);
9854 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009855}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009856
9857
9858/********************* Unicode Iterator **************************/
9859
9860typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 PyObject_HEAD
9862 Py_ssize_t it_index;
9863 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009864} unicodeiterobject;
9865
9866static void
9867unicodeiter_dealloc(unicodeiterobject *it)
9868{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009869 _PyObject_GC_UNTRACK(it);
9870 Py_XDECREF(it->it_seq);
9871 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009872}
9873
9874static int
9875unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9876{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009877 Py_VISIT(it->it_seq);
9878 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009879}
9880
9881static PyObject *
9882unicodeiter_next(unicodeiterobject *it)
9883{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009884 PyUnicodeObject *seq;
9885 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009886
Benjamin Peterson14339b62009-01-31 16:36:08 +00009887 assert(it != NULL);
9888 seq = it->it_seq;
9889 if (seq == NULL)
9890 return NULL;
9891 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009892
Benjamin Peterson14339b62009-01-31 16:36:08 +00009893 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9894 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009895 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 if (item != NULL)
9897 ++it->it_index;
9898 return item;
9899 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009900
Benjamin Peterson14339b62009-01-31 16:36:08 +00009901 Py_DECREF(seq);
9902 it->it_seq = NULL;
9903 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009904}
9905
9906static PyObject *
9907unicodeiter_len(unicodeiterobject *it)
9908{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009909 Py_ssize_t len = 0;
9910 if (it->it_seq)
9911 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9912 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009913}
9914
9915PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9916
9917static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009918 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009919 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009920 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009921};
9922
9923PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009924 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9925 "str_iterator", /* tp_name */
9926 sizeof(unicodeiterobject), /* tp_basicsize */
9927 0, /* tp_itemsize */
9928 /* methods */
9929 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9930 0, /* tp_print */
9931 0, /* tp_getattr */
9932 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009933 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009934 0, /* tp_repr */
9935 0, /* tp_as_number */
9936 0, /* tp_as_sequence */
9937 0, /* tp_as_mapping */
9938 0, /* tp_hash */
9939 0, /* tp_call */
9940 0, /* tp_str */
9941 PyObject_GenericGetAttr, /* tp_getattro */
9942 0, /* tp_setattro */
9943 0, /* tp_as_buffer */
9944 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9945 0, /* tp_doc */
9946 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9947 0, /* tp_clear */
9948 0, /* tp_richcompare */
9949 0, /* tp_weaklistoffset */
9950 PyObject_SelfIter, /* tp_iter */
9951 (iternextfunc)unicodeiter_next, /* tp_iternext */
9952 unicodeiter_methods, /* tp_methods */
9953 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009954};
9955
9956static PyObject *
9957unicode_iter(PyObject *seq)
9958{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009959 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009960
Benjamin Peterson14339b62009-01-31 16:36:08 +00009961 if (!PyUnicode_Check(seq)) {
9962 PyErr_BadInternalCall();
9963 return NULL;
9964 }
9965 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9966 if (it == NULL)
9967 return NULL;
9968 it->it_index = 0;
9969 Py_INCREF(seq);
9970 it->it_seq = (PyUnicodeObject *)seq;
9971 _PyObject_GC_TRACK(it);
9972 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009973}
9974
Martin v. Löwis5b222132007-06-10 09:51:05 +00009975size_t
9976Py_UNICODE_strlen(const Py_UNICODE *u)
9977{
9978 int res = 0;
9979 while(*u++)
9980 res++;
9981 return res;
9982}
9983
9984Py_UNICODE*
9985Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9986{
9987 Py_UNICODE *u = s1;
9988 while ((*u++ = *s2++));
9989 return s1;
9990}
9991
9992Py_UNICODE*
9993Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9994{
9995 Py_UNICODE *u = s1;
9996 while ((*u++ = *s2++))
9997 if (n-- == 0)
9998 break;
9999 return s1;
10000}
10001
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010002Py_UNICODE*
10003Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10004{
10005 Py_UNICODE *u1 = s1;
10006 u1 += Py_UNICODE_strlen(u1);
10007 Py_UNICODE_strcpy(u1, s2);
10008 return s1;
10009}
10010
Martin v. Löwis5b222132007-06-10 09:51:05 +000010011int
10012Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10013{
10014 while (*s1 && *s2 && *s1 == *s2)
10015 s1++, s2++;
10016 if (*s1 && *s2)
10017 return (*s1 < *s2) ? -1 : +1;
10018 if (*s1)
10019 return 1;
10020 if (*s2)
10021 return -1;
10022 return 0;
10023}
10024
Victor Stinneref8d95c2010-08-16 22:03:11 +000010025int
10026Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10027{
10028 register Py_UNICODE u1, u2;
10029 for (; n != 0; n--) {
10030 u1 = *s1;
10031 u2 = *s2;
10032 if (u1 != u2)
10033 return (u1 < u2) ? -1 : +1;
10034 if (u1 == '\0')
10035 return 0;
10036 s1++;
10037 s2++;
10038 }
10039 return 0;
10040}
10041
Martin v. Löwis5b222132007-06-10 09:51:05 +000010042Py_UNICODE*
10043Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10044{
10045 const Py_UNICODE *p;
10046 for (p = s; *p; p++)
10047 if (*p == c)
10048 return (Py_UNICODE*)p;
10049 return NULL;
10050}
10051
Victor Stinner331ea922010-08-10 16:37:20 +000010052Py_UNICODE*
10053Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10054{
10055 const Py_UNICODE *p;
10056 p = s + Py_UNICODE_strlen(s);
10057 while (p != s) {
10058 p--;
10059 if (*p == c)
10060 return (Py_UNICODE*)p;
10061 }
10062 return NULL;
10063}
10064
Victor Stinner71133ff2010-09-01 23:43:53 +000010065Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010066PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010067{
10068 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10069 Py_UNICODE *copy;
10070 Py_ssize_t size;
10071
10072 /* Ensure we won't overflow the size. */
10073 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10074 PyErr_NoMemory();
10075 return NULL;
10076 }
10077 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10078 size *= sizeof(Py_UNICODE);
10079 copy = PyMem_Malloc(size);
10080 if (copy == NULL) {
10081 PyErr_NoMemory();
10082 return NULL;
10083 }
10084 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10085 return copy;
10086}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010088#ifdef __cplusplus
10089}
10090#endif