blob: 527e2198966a591e918d1c99c8c09e365d18dc07 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Fast detection of the most frequent whitespace characters.
   Entry c is 1 when ASCII code point c is whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
147
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149 PyObject **errorHandler,const char *encoding, const char *reason,
150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
Victor Stinner31be90b2010-04-22 19:38:16 +0000153static void raise_encode_exception(PyObject **exceptionObject,
154 const char *encoding,
155 const Py_UNICODE *unicode, Py_ssize_t size,
156 Py_ssize_t startpos, Py_ssize_t endpos,
157 const char *reason);
158
/* Fast detection of line break characters in the ASCII range.
   Entry c is 1 when ASCII code point c is a line break, 0 otherwise.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrouf068f942010-01-13 14:19:12 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
Antoine Pitrouf068f942010-01-13 14:19:12 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
Benjamin Peterson29060642009-01-31 22:14:21 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252}
253
/* Membership test for chr in set[0..setlen-1]: the bloom mask rejects
   most non-members cheaply, unicode_member() then confirms a hit.
   The expansion is parenthesized as a whole so that it behaves as one
   operand (for example under '!' or next to '||').  chr may be
   evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000278 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000309 Ux0000 terminated; some code (e.g. new_identifier)
310 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000313 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314
315*/
316
317static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319{
320 register PyUnicodeObject *unicode;
321
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 if (length == 0 && unicode_empty != NULL) {
324 Py_INCREF(unicode_empty);
325 return unicode_empty;
326 }
327
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000328 /* Ensure we won't overflow the size. */
329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330 return (PyUnicodeObject *)PyErr_NoMemory();
331 }
332
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000334 if (free_list) {
335 unicode = free_list;
336 free_list = *(PyUnicodeObject **)unicode;
337 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000338 if (unicode->str) {
339 /* Keep-Alive optimization: we only upsize the buffer,
340 never downsize it. */
341 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000342 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000343 PyObject_DEL(unicode->str);
344 unicode->str = NULL;
345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000346 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000350 }
351 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 if (unicode == NULL)
357 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 }
361
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 PyErr_NoMemory();
364 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000365 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000366 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000367 * the caller fails before initializing str -- unicode_resize()
368 * reads str[0], and the Keep-Alive optimization can keep memory
369 * allocated for str alive across a call to unicode_dealloc(unicode).
370 * We don't want unicode_resize to read uninitialized memory in
371 * that case.
372 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000373 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000377 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000378 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380
Benjamin Peterson29060642009-01-31 22:14:21 +0000381 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000382 /* XXX UNREF/NEWREF interface should be more symmetrical */
383 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000385 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389static
Guido van Rossum9475a232001-10-05 20:51:39 +0000390void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391{
Walter Dörwald16807132007-05-25 13:52:07 +0000392 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_NOT_INTERNED:
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_MORTAL:
397 /* revive dead object temporarily for DelItem */
398 Py_REFCNT(unicode) = 3;
399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400 Py_FatalError(
401 "deletion of interned string failed");
402 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 case SSTATE_INTERNED_IMMORTAL:
405 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406
Benjamin Peterson29060642009-01-31 22:14:21 +0000407 default:
408 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000409 }
410
Guido van Rossum604ddf82001-12-06 20:03:56 +0000411 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000413 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415 PyObject_DEL(unicode->str);
416 unicode->str = NULL;
417 unicode->length = 0;
418 }
419 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000420 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 }
422 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000423 *(PyUnicodeObject **)unicode = free_list;
424 free_list = unicode;
425 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426 }
427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyObject_DEL(unicode->str);
429 Py_XDECREF(unicode->defenc);
430 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432}
433
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436{
437 register PyUnicodeObject *v;
438
439 /* Argument checks */
440 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 PyErr_BadInternalCall();
442 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
449
450 /* Resizing unicode_empty and single character objects is not
451 possible since these are being shared. We simply return a fresh
452 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000453 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000454 (v == unicode_empty || v->length == 1)) {
455 PyUnicodeObject *w = _PyUnicode_New(length);
456 if (w == NULL)
457 return -1;
458 Py_UNICODE_COPY(w->str, v->str,
459 length < v->length ? length : v->length);
460 Py_DECREF(*unicode);
461 *unicode = w;
462 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 }
464
465 /* Note that we don't have to modify *unicode for unshared Unicode
466 objects, since we can modify them in-place. */
467 return unicode_resize(v, length);
468}
469
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477{
478 PyUnicodeObject *unicode;
479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects. */
482 if (u != NULL) {
483
Benjamin Peterson29060642009-01-31 22:14:21 +0000484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000489
490 /* Single character Unicode objects in the Latin-1 range are
491 shared when using this constructor */
492 if (size == 1 && *u < 256) {
493 unicode = unicode_latin1[*u];
494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = *u;
499 unicode_latin1[*u] = unicode;
500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 /* Copy the Unicode data into the new object */
511 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000512 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 return (PyObject *)unicode;
515}
516
Walter Dörwaldd2034312007-05-18 16:29:38 +0000517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518{
519 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000520
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 if (size < 0) {
522 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 return NULL;
525 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000526
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000528 some optimizations which share commonly used objects.
529 Also, this means the input must be UTF-8, so fall back to the
530 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (u != NULL) {
532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533 /* Optimization for empty strings */
534 if (size == 0 && unicode_empty != NULL) {
535 Py_INCREF(unicode_empty);
536 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000537 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000538
539 /* Single characters are shared when using this constructor.
540 Restrict to ASCII, since the input must be UTF-8. */
541 if (size == 1 && Py_CHARMASK(*u) < 128) {
542 unicode = unicode_latin1[Py_CHARMASK(*u)];
543 if (!unicode) {
544 unicode = _PyUnicode_New(1);
545 if (!unicode)
546 return NULL;
547 unicode->str[0] = Py_CHARMASK(*u);
548 unicode_latin1[Py_CHARMASK(*u)] = unicode;
549 }
550 Py_INCREF(unicode);
551 return (PyObject *)unicode;
552 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000553
554 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 }
556
Walter Dörwald55507312007-05-18 13:12:10 +0000557 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 if (!unicode)
559 return NULL;
560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 return (PyObject *)unicode;
562}
563
Walter Dörwaldd2034312007-05-18 16:29:38 +0000564PyObject *PyUnicode_FromString(const char *u)
565{
566 size_t size = strlen(u);
567 if (size > PY_SSIZE_T_MAX) {
568 PyErr_SetString(PyExc_OverflowError, "input too long");
569 return NULL;
570 }
571
572 return PyUnicode_FromStringAndSize(u, size);
573}
574
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575#ifdef HAVE_WCHAR_H
576
Mark Dickinson081dfee2009-03-18 14:47:41 +0000577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584 to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
588{
589 PyUnicodeObject *unicode;
590 register Py_ssize_t i;
591 Py_ssize_t alloc;
592 const wchar_t *orig_w;
593
594 if (w == NULL) {
595 if (size == 0)
596 return PyUnicode_FromStringAndSize(NULL, 0);
597 PyErr_BadInternalCall();
598 return NULL;
599 }
600
601 if (size == -1) {
602 size = wcslen(w);
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000643 if (size == 0)
644 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_BadInternalCall();
646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 }
648
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == -1) {
650 size = wcslen(w);
651 }
652
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 unicode = _PyUnicode_New(size);
654 if (!unicode)
655 return NULL;
656
657 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000660#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000662 register Py_UNICODE *u;
663 register Py_ssize_t i;
664 u = PyUnicode_AS_UNICODE(unicode);
665 for (i = size; i > 0; i--)
666 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668#endif
669
670 return (PyObject *)unicode;
671}
672
Mark Dickinson081dfee2009-03-18 14:47:41 +0000673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
Walter Dörwald346737f2007-05-31 10:44:43 +0000677static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000680{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000681 *fmt++ = '%';
682 if (width) {
683 if (zeropad)
684 *fmt++ = '0';
685 fmt += sprintf(fmt, "%d", width);
686 }
687 if (precision)
688 fmt += sprintf(fmt, ".%d", precision);
689 if (longflag)
690 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000691 else if (longlongflag) {
692 /* longlongflag should only ever be nonzero on machines with
693 HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695 char *f = PY_FORMAT_LONG_LONG;
696 while (*f)
697 *fmt++ = *f++;
698#else
699 /* we shouldn't ever get here */
700 assert(0);
701 *fmt++ = 'l';
702#endif
703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000704 else if (size_tflag) {
705 char *f = PY_FORMAT_SIZE_T;
706 while (*f)
707 *fmt++ = *f++;
708 }
709 *fmt++ = c;
710 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000711}
712
Walter Dörwaldd2034312007-05-18 16:29:38 +0000713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
714
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000715/* size of fixed-size buffer for formatting single arguments */
716#define ITEM_BUFFER_LEN 21
717/* maximum number of characters required for output of %ld. 21 characters
718 allows for 64-bit integers (in decimal) and an optional sign. */
719#define MAX_LONG_CHARS 21
720/* maximum number of characters required for output of %lld.
721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000728 va_list count;
729 Py_ssize_t callcount = 0;
730 PyObject **callresults = NULL;
731 PyObject **callresult = NULL;
732 Py_ssize_t n = 0;
733 int width = 0;
734 int precision = 0;
735 int zeropad;
736 const char* f;
737 Py_UNICODE *s;
738 PyObject *string;
739 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000740 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000741 /* use abuffer instead of buffer, if we need more space
742 * (which can happen if there's a format specifier with width). */
743 char *abuffer = NULL;
744 char *realbuffer;
745 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000749 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000767 else if (128 <= (unsigned char)*f) {
768 PyErr_Format(PyExc_ValueError,
769 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000770 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000771 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000772 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 }
775 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000776 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000777 if (callcount) {
778 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
779 if (!callresults) {
780 PyErr_NoMemory();
781 return NULL;
782 }
783 callresult = callresults;
784 }
785 /* step 3: figure out how large a buffer we need */
786 for (f = format; *f; f++) {
787 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000788#ifdef HAVE_LONG_LONG
789 int longlongflag = 0;
790#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 const char* p = f;
792 width = 0;
793 while (ISDIGIT((unsigned)*f))
794 width = (width*10) + *f++ - '0';
795 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
796 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
799 * they don't affect the amount of space we reserve.
800 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000801 if (*f == 'l') {
802 if (f[1] == 'd' || f[1] == 'u') {
803 ++f;
804 }
805#ifdef HAVE_LONG_LONG
806 else if (f[1] == 'l' &&
807 (f[2] == 'd' || f[2] == 'u')) {
808 longlongflag = 1;
809 f += 2;
810 }
811#endif
812 }
813 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000814 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
Benjamin Peterson14339b62009-01-31 16:36:08 +0000817 switch (*f) {
818 case 'c':
819 (void)va_arg(count, int);
820 /* fall through... */
821 case '%':
822 n++;
823 break;
824 case 'd': case 'u': case 'i': case 'x':
825 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000826#ifdef HAVE_LONG_LONG
827 if (longlongflag) {
828 if (width < MAX_LONG_LONG_CHARS)
829 width = MAX_LONG_LONG_CHARS;
830 }
831 else
832#endif
833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
834 including sign. Decimal takes the most space. This
835 isn't enough for octal. If a width is specified we
836 need more (which we allocate later). */
837 if (width < MAX_LONG_CHARS)
838 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000839 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000840 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 if (abuffersize < width)
842 abuffersize = width;
843 break;
844 case 's':
845 {
846 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000847 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000848 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
849 if (!str)
850 goto fail;
851 n += PyUnicode_GET_SIZE(str);
852 /* Remember the str and switch to the next slot */
853 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000854 break;
855 }
856 case 'U':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 assert(obj && PyUnicode_Check(obj));
860 n += PyUnicode_GET_SIZE(obj);
861 break;
862 }
863 case 'V':
864 {
865 PyObject *obj = va_arg(count, PyObject *);
866 const char *str = va_arg(count, const char *);
867 assert(obj || str);
868 assert(!obj || PyUnicode_Check(obj));
869 if (obj)
870 n += PyUnicode_GET_SIZE(obj);
871 else
872 n += strlen(str);
873 break;
874 }
875 case 'S':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 PyObject *str;
879 assert(obj);
880 str = PyObject_Str(obj);
881 if (!str)
882 goto fail;
883 n += PyUnicode_GET_SIZE(str);
884 /* Remember the str and switch to the next slot */
885 *callresult++ = str;
886 break;
887 }
888 case 'R':
889 {
890 PyObject *obj = va_arg(count, PyObject *);
891 PyObject *repr;
892 assert(obj);
893 repr = PyObject_Repr(obj);
894 if (!repr)
895 goto fail;
896 n += PyUnicode_GET_SIZE(repr);
897 /* Remember the repr and switch to the next slot */
898 *callresult++ = repr;
899 break;
900 }
901 case 'A':
902 {
903 PyObject *obj = va_arg(count, PyObject *);
904 PyObject *ascii;
905 assert(obj);
906 ascii = PyObject_ASCII(obj);
907 if (!ascii)
908 goto fail;
909 n += PyUnicode_GET_SIZE(ascii);
910 /* Remember the repr and switch to the next slot */
911 *callresult++ = ascii;
912 break;
913 }
914 case 'p':
915 (void) va_arg(count, int);
916 /* maximum 64-bit pointer representation:
917 * 0xffffffffffffffff
918 * so 19 characters is enough.
919 * XXX I count 18 -- what's the extra for?
920 */
921 n += 19;
922 break;
923 default:
924 /* if we stumble upon an unknown
925 formatting code, copy the rest of
926 the format string to the output
927 string. (we cannot just skip the
928 code, since there's no way to know
929 what's in the argument list) */
930 n += strlen(p);
931 goto expand;
932 }
933 } else
934 n++;
935 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000937 if (abuffersize > ITEM_BUFFER_LEN) {
938 /* add 1 for sprintf's trailing null byte */
939 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000940 if (!abuffer) {
941 PyErr_NoMemory();
942 goto fail;
943 }
944 realbuffer = abuffer;
945 }
946 else
947 realbuffer = buffer;
948 /* step 4: fill the buffer */
949 /* Since we've analyzed how much space we need for the worst case,
950 we don't have to resize the string.
951 There can be no errors beyond this point. */
952 string = PyUnicode_FromUnicode(NULL, n);
953 if (!string)
954 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000955
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 s = PyUnicode_AS_UNICODE(string);
957 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000958
Benjamin Peterson14339b62009-01-31 16:36:08 +0000959 for (f = format; *f; f++) {
960 if (*f == '%') {
961 const char* p = f++;
962 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000963 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 int size_tflag = 0;
965 zeropad = (*f == '0');
966 /* parse the width.precision part */
967 width = 0;
968 while (ISDIGIT((unsigned)*f))
969 width = (width*10) + *f++ - '0';
970 precision = 0;
971 if (*f == '.') {
972 f++;
973 while (ISDIGIT((unsigned)*f))
974 precision = (precision*10) + *f++ - '0';
975 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000976 /* Handle %ld, %lu, %lld and %llu. */
977 if (*f == 'l') {
978 if (f[1] == 'd' || f[1] == 'u') {
979 longflag = 1;
980 ++f;
981 }
982#ifdef HAVE_LONG_LONG
983 else if (f[1] == 'l' &&
984 (f[2] == 'd' || f[2] == 'u')) {
985 longlongflag = 1;
986 f += 2;
987 }
988#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 }
990 /* handle the size_t flag. */
991 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
992 size_tflag = 1;
993 ++f;
994 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 switch (*f) {
997 case 'c':
998 *s++ = va_arg(vargs, int);
999 break;
1000 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1002 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001003 if (longflag)
1004 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001005#ifdef HAVE_LONG_LONG
1006 else if (longlongflag)
1007 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1008#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001009 else if (size_tflag)
1010 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1011 else
1012 sprintf(realbuffer, fmt, va_arg(vargs, int));
1013 appendstring(realbuffer);
1014 break;
1015 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001016 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1017 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 if (longflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001020#ifdef HAVE_LONG_LONG
1021 else if (longlongflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs,
1023 unsigned PY_LONG_LONG));
1024#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001025 else if (size_tflag)
1026 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1027 else
1028 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1029 appendstring(realbuffer);
1030 break;
1031 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 sprintf(realbuffer, fmt, va_arg(vargs, int));
1034 appendstring(realbuffer);
1035 break;
1036 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001037 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 's':
1042 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001043 /* unused, since we already have the result */
1044 (void) va_arg(vargs, char *);
1045 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1046 PyUnicode_GET_SIZE(*callresult));
1047 s += PyUnicode_GET_SIZE(*callresult);
1048 /* We're done with the unicode()/repr() => forget it */
1049 Py_DECREF(*callresult);
1050 /* switch to next unicode()/repr() result */
1051 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001052 break;
1053 }
1054 case 'U':
1055 {
1056 PyObject *obj = va_arg(vargs, PyObject *);
1057 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1058 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1059 s += size;
1060 break;
1061 }
1062 case 'V':
1063 {
1064 PyObject *obj = va_arg(vargs, PyObject *);
1065 const char *str = va_arg(vargs, const char *);
1066 if (obj) {
1067 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1068 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1069 s += size;
1070 } else {
1071 appendstring(str);
1072 }
1073 break;
1074 }
1075 case 'S':
1076 case 'R':
1077 {
1078 Py_UNICODE *ucopy;
1079 Py_ssize_t usize;
1080 Py_ssize_t upos;
1081 /* unused, since we already have the result */
1082 (void) va_arg(vargs, PyObject *);
1083 ucopy = PyUnicode_AS_UNICODE(*callresult);
1084 usize = PyUnicode_GET_SIZE(*callresult);
1085 for (upos = 0; upos<usize;)
1086 *s++ = ucopy[upos++];
1087 /* We're done with the unicode()/repr() => forget it */
1088 Py_DECREF(*callresult);
1089 /* switch to next unicode()/repr() result */
1090 ++callresult;
1091 break;
1092 }
1093 case 'p':
1094 sprintf(buffer, "%p", va_arg(vargs, void*));
1095 /* %p is ill-defined: ensure leading 0x. */
1096 if (buffer[1] == 'X')
1097 buffer[1] = 'x';
1098 else if (buffer[1] != 'x') {
1099 memmove(buffer+2, buffer, strlen(buffer)+1);
1100 buffer[0] = '0';
1101 buffer[1] = 'x';
1102 }
1103 appendstring(buffer);
1104 break;
1105 case '%':
1106 *s++ = '%';
1107 break;
1108 default:
1109 appendstring(p);
1110 goto end;
1111 }
Victor Stinner1205f272010-09-11 00:54:47 +00001112 }
Victor Stinner1205f272010-09-11 00:54:47 +00001113 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001114 *s++ = *f;
1115 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 if (callresults)
1119 PyObject_Free(callresults);
1120 if (abuffer)
1121 PyObject_Free(abuffer);
1122 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1123 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 if (callresults) {
1126 PyObject **callresult2 = callresults;
1127 while (callresult2 < callresult) {
1128 Py_DECREF(*callresult2);
1129 ++callresult2;
1130 }
1131 PyObject_Free(callresults);
1132 }
1133 if (abuffer)
1134 PyObject_Free(abuffer);
1135 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001136}
1137
1138#undef appendstring
1139
1140PyObject *
1141PyUnicode_FromFormat(const char *format, ...)
1142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 PyObject* ret;
1144 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001145
1146#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001147 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001150#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 ret = PyUnicode_FromFormatV(format, vargs);
1152 va_end(vargs);
1153 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001154}
1155
Victor Stinner137c34c2010-09-29 10:25:54 +00001156static void
1157unicode_aswidechar(PyUnicodeObject *unicode,
1158 wchar_t *w,
1159 Py_ssize_t size)
1160{
1161#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1162 memcpy(w, unicode->str, size * sizeof(wchar_t));
1163#else
1164 register Py_UNICODE *u;
1165 register Py_ssize_t i;
1166 u = PyUnicode_AS_UNICODE(unicode);
1167 for (i = size; i > 0; i--)
1168 *w++ = *u++;
1169#endif
1170}
1171
1172Py_ssize_t
1173PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1174 wchar_t *w,
1175 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176{
1177 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 PyErr_BadInternalCall();
1179 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001181
1182 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001185
Victor Stinner137c34c2010-09-29 10:25:54 +00001186 unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001188 if (size > PyUnicode_GET_SIZE(unicode))
1189 return PyUnicode_GET_SIZE(unicode);
1190 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001191 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192}
1193
Victor Stinner137c34c2010-09-29 10:25:54 +00001194wchar_t*
1195PyUnicode_AsWideCharString(PyUnicodeObject *unicode,
1196 Py_ssize_t *size)
1197{
1198 wchar_t* buffer;
1199 Py_ssize_t buflen;
1200
1201 if (unicode == NULL) {
1202 PyErr_BadInternalCall();
1203 return NULL;
1204 }
1205
1206 if ((PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) < PyUnicode_GET_SIZE(unicode)) {
1207 PyErr_NoMemory();
1208 return NULL;
1209 }
1210
1211 buflen = PyUnicode_GET_SIZE(unicode) + 1; /* copy L'\0' */
1212 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1213 if (buffer == NULL) {
1214 PyErr_NoMemory();
1215 return NULL;
1216 }
1217 unicode_aswidechar(unicode, buffer, buflen);
1218 return buffer;
1219}
1220
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221#endif
1222
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001223PyObject *PyUnicode_FromOrdinal(int ordinal)
1224{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001225 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001226
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001227 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001228 PyErr_SetString(PyExc_ValueError,
1229 "chr() arg not in range(0x110000)");
1230 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001231 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001232
1233#ifndef Py_UNICODE_WIDE
1234 if (ordinal > 0xffff) {
1235 ordinal -= 0x10000;
1236 s[0] = 0xD800 | (ordinal >> 10);
1237 s[1] = 0xDC00 | (ordinal & 0x3FF);
1238 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001239 }
1240#endif
1241
Hye-Shik Chang40574832004-04-06 07:24:51 +00001242 s[0] = (Py_UNICODE)ordinal;
1243 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001244}
1245
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246PyObject *PyUnicode_FromObject(register PyObject *obj)
1247{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001248 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001249 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001250 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001251 Py_INCREF(obj);
1252 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001253 }
1254 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001255 /* For a Unicode subtype that's not a Unicode object,
1256 return a true Unicode object with the same data. */
1257 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1258 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001259 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001260 PyErr_Format(PyExc_TypeError,
1261 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001262 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001263 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001264}
1265
1266PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001267 const char *encoding,
1268 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001269{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001270 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001271 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001272
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 PyErr_BadInternalCall();
1275 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001277
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001278 /* Decoding bytes objects is the most common case and should be fast */
1279 if (PyBytes_Check(obj)) {
1280 if (PyBytes_GET_SIZE(obj) == 0) {
1281 Py_INCREF(unicode_empty);
1282 v = (PyObject *) unicode_empty;
1283 }
1284 else {
1285 v = PyUnicode_Decode(
1286 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1287 encoding, errors);
1288 }
1289 return v;
1290 }
1291
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001292 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001293 PyErr_SetString(PyExc_TypeError,
1294 "decoding str is not supported");
1295 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001296 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001297
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001298 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1299 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1300 PyErr_Format(PyExc_TypeError,
1301 "coercing to str: need bytes, bytearray "
1302 "or buffer-like object, %.80s found",
1303 Py_TYPE(obj)->tp_name);
1304 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001305 }
Tim Petersced69f82003-09-16 20:30:58 +00001306
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001307 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001308 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001309 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 }
Tim Petersced69f82003-09-16 20:30:58 +00001311 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001312 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001313
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001314 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001315 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316}
1317
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   become lower case and '_' becomes '-', so that e.g. UTF_8 is
   recognised.  `lower_len` is the size of the `lower` buffer.
   Return 1 on success, 0 if the encoding name is longer than
   lower_len-1 characters (the buffer content is then unspecified). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot is reserved for the terminating NUL */
    char *limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        if (dst == limit)
            return 0;
        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1350
1351PyObject *PyUnicode_Decode(const char *s,
1352 Py_ssize_t size,
1353 const char *encoding,
1354 const char *errors)
1355{
1356 PyObject *buffer = NULL, *unicode;
1357 Py_buffer info;
1358 char lower[11]; /* Enough for any encoding shortcut */
1359
1360 if (encoding == NULL)
1361 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001362
1363 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001364 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1365 if (strcmp(lower, "utf-8") == 0)
1366 return PyUnicode_DecodeUTF8(s, size, errors);
1367 else if ((strcmp(lower, "latin-1") == 0) ||
1368 (strcmp(lower, "iso-8859-1") == 0))
1369 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001370#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001371 else if (strcmp(lower, "mbcs") == 0)
1372 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001373#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001374 else if (strcmp(lower, "ascii") == 0)
1375 return PyUnicode_DecodeASCII(s, size, errors);
1376 else if (strcmp(lower, "utf-16") == 0)
1377 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1378 else if (strcmp(lower, "utf-32") == 0)
1379 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381
1382 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001383 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001384 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001385 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001386 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 if (buffer == NULL)
1388 goto onError;
1389 unicode = PyCodec_Decode(buffer, encoding, errors);
1390 if (unicode == NULL)
1391 goto onError;
1392 if (!PyUnicode_Check(unicode)) {
1393 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001394 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001395 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 Py_DECREF(unicode);
1397 goto onError;
1398 }
1399 Py_DECREF(buffer);
1400 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001401
Benjamin Peterson29060642009-01-31 22:14:21 +00001402 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 Py_XDECREF(buffer);
1404 return NULL;
1405}
1406
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001407PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1408 const char *encoding,
1409 const char *errors)
1410{
1411 PyObject *v;
1412
1413 if (!PyUnicode_Check(unicode)) {
1414 PyErr_BadArgument();
1415 goto onError;
1416 }
1417
1418 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001419 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001420
1421 /* Decode via the codec registry */
1422 v = PyCodec_Decode(unicode, encoding, errors);
1423 if (v == NULL)
1424 goto onError;
1425 return v;
1426
Benjamin Peterson29060642009-01-31 22:14:21 +00001427 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001428 return NULL;
1429}
1430
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001431PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1432 const char *encoding,
1433 const char *errors)
1434{
1435 PyObject *v;
1436
1437 if (!PyUnicode_Check(unicode)) {
1438 PyErr_BadArgument();
1439 goto onError;
1440 }
1441
1442 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001443 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001444
1445 /* Decode via the codec registry */
1446 v = PyCodec_Decode(unicode, encoding, errors);
1447 if (v == NULL)
1448 goto onError;
1449 if (!PyUnicode_Check(v)) {
1450 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001451 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001452 Py_TYPE(v)->tp_name);
1453 Py_DECREF(v);
1454 goto onError;
1455 }
1456 return v;
1457
Benjamin Peterson29060642009-01-31 22:14:21 +00001458 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001459 return NULL;
1460}
1461
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001463 Py_ssize_t size,
1464 const char *encoding,
1465 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466{
1467 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001468
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 unicode = PyUnicode_FromUnicode(s, size);
1470 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1473 Py_DECREF(unicode);
1474 return v;
1475}
1476
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001477PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1478 const char *encoding,
1479 const char *errors)
1480{
1481 PyObject *v;
1482
1483 if (!PyUnicode_Check(unicode)) {
1484 PyErr_BadArgument();
1485 goto onError;
1486 }
1487
1488 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001489 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001490
1491 /* Encode via the codec registry */
1492 v = PyCodec_Encode(unicode, encoding, errors);
1493 if (v == NULL)
1494 goto onError;
1495 return v;
1496
Benjamin Peterson29060642009-01-31 22:14:21 +00001497 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001498 return NULL;
1499}
1500
Victor Stinnerae6265f2010-05-15 16:27:27 +00001501PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1502{
Victor Stinner313a1202010-06-11 23:56:51 +00001503 if (Py_FileSystemDefaultEncoding) {
1504#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1505 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1506 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1507 PyUnicode_GET_SIZE(unicode),
1508 NULL);
1509#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001510 return PyUnicode_AsEncodedString(unicode,
1511 Py_FileSystemDefaultEncoding,
1512 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001513 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001514 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Victor Stinner3119ed72010-08-18 22:26:50 +00001515 PyUnicode_GET_SIZE(unicode),
1516 "surrogateescape");
Victor Stinnerae6265f2010-05-15 16:27:27 +00001517}
1518
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1520 const char *encoding,
1521 const char *errors)
1522{
1523 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001524 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001525
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 if (!PyUnicode_Check(unicode)) {
1527 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 }
Fred Drakee4315f52000-05-09 19:53:39 +00001530
Tim Petersced69f82003-09-16 20:30:58 +00001531 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001532 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001533
1534 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001535 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1536 if (strcmp(lower, "utf-8") == 0)
1537 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1538 PyUnicode_GET_SIZE(unicode),
1539 errors);
1540 else if ((strcmp(lower, "latin-1") == 0) ||
1541 (strcmp(lower, "iso-8859-1") == 0))
1542 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1543 PyUnicode_GET_SIZE(unicode),
1544 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001545#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001546 else if (strcmp(lower, "mbcs") == 0)
1547 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1548 PyUnicode_GET_SIZE(unicode),
1549 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001550#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001551 else if (strcmp(lower, "ascii") == 0)
1552 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1553 PyUnicode_GET_SIZE(unicode),
1554 errors);
1555 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001556 /* During bootstrap, we may need to find the encodings
1557 package, to load the file system encoding, and require the
1558 file system encoding in order to load the encodings
1559 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001560
Victor Stinner59e62db2010-05-15 13:14:32 +00001561 Break out of this dependency by assuming that the path to
1562 the encodings module is ASCII-only. XXX could try wcstombs
1563 instead, if the file system encoding is the locale's
1564 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001565 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001566 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1567 !PyThreadState_GET()->interp->codecs_initialized)
1568 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1569 PyUnicode_GET_SIZE(unicode),
1570 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571
1572 /* Encode via the codec registry */
1573 v = PyCodec_Encode(unicode, encoding, errors);
1574 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001575 return NULL;
1576
1577 /* The normal path */
1578 if (PyBytes_Check(v))
1579 return v;
1580
1581 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001582 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001583 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001584 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001585
1586 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1587 "encoder %s returned bytearray instead of bytes",
1588 encoding);
1589 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001590 Py_DECREF(v);
1591 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001592 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001593
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001594 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1595 Py_DECREF(v);
1596 return b;
1597 }
1598
1599 PyErr_Format(PyExc_TypeError,
1600 "encoder did not return a bytes object (type=%.400s)",
1601 Py_TYPE(v)->tp_name);
1602 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001603 return NULL;
1604}
1605
1606PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1607 const char *encoding,
1608 const char *errors)
1609{
1610 PyObject *v;
1611
1612 if (!PyUnicode_Check(unicode)) {
1613 PyErr_BadArgument();
1614 goto onError;
1615 }
1616
1617 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001618 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001619
1620 /* Encode via the codec registry */
1621 v = PyCodec_Encode(unicode, encoding, errors);
1622 if (v == NULL)
1623 goto onError;
1624 if (!PyUnicode_Check(v)) {
1625 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001626 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001627 Py_TYPE(v)->tp_name);
1628 Py_DECREF(v);
1629 goto onError;
1630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001632
Benjamin Peterson29060642009-01-31 22:14:21 +00001633 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001634 return NULL;
1635}
1636
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001637PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001638 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001639{
1640 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001641 if (v)
1642 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001643 if (errors != NULL)
1644 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001645 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001646 PyUnicode_GET_SIZE(unicode),
1647 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001648 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001649 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001650 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001651 return v;
1652}
1653
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001654PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001655PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001656 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001657 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1658}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001659
Christian Heimes5894ba72007-11-04 11:43:14 +00001660PyObject*
1661PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1662{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001663 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1664 can be undefined. If it is case, decode using UTF-8. The following assumes
1665 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1666 bootstrapping process where the codecs aren't ready yet.
1667 */
1668 if (Py_FileSystemDefaultEncoding) {
1669#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001670 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001671 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001672 }
1673#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001674 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001675 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001676 }
1677#endif
1678 return PyUnicode_Decode(s, size,
1679 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001680 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001681 }
1682 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001683 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001684 }
1685}
1686
Martin v. Löwis011e8422009-05-05 04:43:17 +00001687
1688int
1689PyUnicode_FSConverter(PyObject* arg, void* addr)
1690{
1691 PyObject *output = NULL;
1692 Py_ssize_t size;
1693 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001694 if (arg == NULL) {
1695 Py_DECREF(*(PyObject**)addr);
1696 return 1;
1697 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001698 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001699 output = arg;
1700 Py_INCREF(output);
1701 }
1702 else {
1703 arg = PyUnicode_FromObject(arg);
1704 if (!arg)
1705 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001706 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001707 Py_DECREF(arg);
1708 if (!output)
1709 return 0;
1710 if (!PyBytes_Check(output)) {
1711 Py_DECREF(output);
1712 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1713 return 0;
1714 }
1715 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001716 size = PyBytes_GET_SIZE(output);
1717 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001718 if (size != strlen(data)) {
1719 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1720 Py_DECREF(output);
1721 return 0;
1722 }
1723 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001724 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001725}
1726
1727
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001728int
1729PyUnicode_FSDecoder(PyObject* arg, void* addr)
1730{
1731 PyObject *output = NULL;
1732 Py_ssize_t size;
1733 void *data;
1734 if (arg == NULL) {
1735 Py_DECREF(*(PyObject**)addr);
1736 return 1;
1737 }
1738 if (PyUnicode_Check(arg)) {
1739 output = arg;
1740 Py_INCREF(output);
1741 }
1742 else {
1743 arg = PyBytes_FromObject(arg);
1744 if (!arg)
1745 return 0;
1746 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1747 PyBytes_GET_SIZE(arg));
1748 Py_DECREF(arg);
1749 if (!output)
1750 return 0;
1751 if (!PyUnicode_Check(output)) {
1752 Py_DECREF(output);
1753 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1754 return 0;
1755 }
1756 }
1757 size = PyUnicode_GET_SIZE(output);
1758 data = PyUnicode_AS_UNICODE(output);
1759 if (size != Py_UNICODE_strlen(data)) {
1760 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1761 Py_DECREF(output);
1762 return 0;
1763 }
1764 *(PyObject**)addr = output;
1765 return Py_CLEANUP_SUPPORTED;
1766}
1767
1768
Martin v. Löwis5b222132007-06-10 09:51:05 +00001769char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001770_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001771{
Christian Heimesf3863112007-11-22 07:46:41 +00001772 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001773 if (!PyUnicode_Check(unicode)) {
1774 PyErr_BadArgument();
1775 return NULL;
1776 }
Christian Heimesf3863112007-11-22 07:46:41 +00001777 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1778 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001779 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001780 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001781 *psize = PyBytes_GET_SIZE(bytes);
1782 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001783}
1784
1785char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001786_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001787{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001788 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001789}
1790
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1792{
1793 if (!PyUnicode_Check(unicode)) {
1794 PyErr_BadArgument();
1795 goto onError;
1796 }
1797 return PyUnicode_AS_UNICODE(unicode);
1798
Benjamin Peterson29060642009-01-31 22:14:21 +00001799 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 return NULL;
1801}
1802
Martin v. Löwis18e16552006-02-15 17:27:45 +00001803Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804{
1805 if (!PyUnicode_Check(unicode)) {
1806 PyErr_BadArgument();
1807 goto onError;
1808 }
1809 return PyUnicode_GET_SIZE(unicode);
1810
Benjamin Peterson29060642009-01-31 22:14:21 +00001811 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812 return -1;
1813}
1814
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1819
Victor Stinner554f3f02010-06-16 23:33:54 +00001820/* create or adjust a UnicodeDecodeError */
1821static void
1822make_decode_exception(PyObject **exceptionObject,
1823 const char *encoding,
1824 const char *input, Py_ssize_t length,
1825 Py_ssize_t startpos, Py_ssize_t endpos,
1826 const char *reason)
1827{
1828 if (*exceptionObject == NULL) {
1829 *exceptionObject = PyUnicodeDecodeError_Create(
1830 encoding, input, length, startpos, endpos, reason);
1831 }
1832 else {
1833 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1834 goto onError;
1835 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1836 goto onError;
1837 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1838 goto onError;
1839 }
1840 return;
1841
1842onError:
1843 Py_DECREF(*exceptionObject);
1844 *exceptionObject = NULL;
1845}
1846
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001847/* error handling callback helper:
1848 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001849 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001850 and adjust various state variables.
1851 return 0 on success, -1 on error
1852*/
1853
1854static
1855int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 const char *encoding, const char *reason,
1857 const char **input, const char **inend, Py_ssize_t *startinpos,
1858 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1859 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001860{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001861 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001862
1863 PyObject *restuple = NULL;
1864 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001865 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001866 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001867 Py_ssize_t requiredsize;
1868 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001869 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001870 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001871 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001872 int res = -1;
1873
1874 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001875 *errorHandler = PyCodec_LookupError(errors);
1876 if (*errorHandler == NULL)
1877 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001878 }
1879
Victor Stinner554f3f02010-06-16 23:33:54 +00001880 make_decode_exception(exceptionObject,
1881 encoding,
1882 *input, *inend - *input,
1883 *startinpos, *endinpos,
1884 reason);
1885 if (*exceptionObject == NULL)
1886 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001887
1888 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1889 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001890 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001891 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001892 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001893 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001894 }
1895 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001896 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001897
1898 /* Copy back the bytes variables, which might have been modified by the
1899 callback */
1900 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1901 if (!inputobj)
1902 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001903 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001905 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001906 *input = PyBytes_AS_STRING(inputobj);
1907 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001908 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001909 /* we can DECREF safely, as the exception has another reference,
1910 so the object won't go away. */
1911 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001912
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001913 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001914 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001915 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001916 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1917 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001918 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001919
1920 /* need more space? (at least enough for what we
1921 have+the replacement+the rest of the string (starting
1922 at the new input position), so we won't have to check space
1923 when there are no errors in the rest of the string) */
1924 repptr = PyUnicode_AS_UNICODE(repunicode);
1925 repsize = PyUnicode_GET_SIZE(repunicode);
1926 requiredsize = *outpos + repsize + insize-newpos;
1927 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001928 if (requiredsize<2*outsize)
1929 requiredsize = 2*outsize;
1930 if (_PyUnicode_Resize(output, requiredsize) < 0)
1931 goto onError;
1932 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001933 }
1934 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001935 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001936 Py_UNICODE_COPY(*outptr, repptr, repsize);
1937 *outptr += repsize;
1938 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001940 /* we made it! */
1941 res = 0;
1942
Benjamin Peterson29060642009-01-31 22:14:21 +00001943 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001944 Py_XDECREF(restuple);
1945 return res;
1946}
1947
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001948/* --- UTF-7 Codec -------------------------------------------------------- */
1949
Antoine Pitrou244651a2009-05-04 18:56:13 +00001950/* See RFC2152 for details. We encode conservatively and decode liberally. */
1951
1952/* Three simple macros defining base-64. */
1953
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated several times. */

#define IS_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ||     \
     ('a' <= (c) && (c) <= 'z') ||     \
     ('0' <= (c) && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
1961
/* Map a base-64 character to its 6-bit value: A-Z -> 0..25,
   a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else -> 63.
   Only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)
1969
/* Base-64 character for the bottom 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])
1974
/* DECODE_DIRECT: true if a byte found in a UTF-7 string outside a shift
 * sequence decodes as itself.  Decoding is permissive: every value up to
 * 127 qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    (!((c) > 127 || (c) == '+'))
1982
/* Classification of the 128 ASCII codes for the UTF-7 encoder (see
 * RFC2152).  Each entry is one of:
 *   0 : "Set D"       alphanumeric and '(),-./:?
 *   1 : "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel  bs  ht  nl  vt  np  cr  so  si */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can  em sub esc  fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2, 1, 1, 1, 1, 1, 1, 0,  0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0    1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @    A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P    Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `    a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p    q   r   s   t   u   v   w   x   y   z   {   |   }   ~ del */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 3, 3,
};
2016
/* ENCODE_DIRECT: true if character c should be encoded as itself.
 *
 * c must be in the range 1..127.  Set D characters (category 0) are
 * always direct; Set O characters (category 1) are direct only when
 * directO is true, and whitespace (category 2) only when directWS is
 * true.  RFC2152 makes it clear that these choices vary between
 * applications, so they are left to the caller.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions passed as arguments keep their meaning.  c is
 * evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002028
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002029PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002030 Py_ssize_t size,
2031 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002032{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002033 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2034}
2035
Antoine Pitrou244651a2009-05-04 18:56:13 +00002036/* The decoder. The only state we preserve is our read position,
2037 * i.e. how many characters we have consumed. So if we end in the
2038 * middle of a shift sequence we have to back off the read position
2039 * and the output to the beginning of the sequence, otherwise we lose
2040 * all the shift state (seen bits, number of bits seen, high
2041 * surrogate). */
2042
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002043PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002044 Py_ssize_t size,
2045 const char *errors,
2046 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002047{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002049 Py_ssize_t startinpos;
2050 Py_ssize_t endinpos;
2051 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002052 const char *e;
2053 PyUnicodeObject *unicode;
2054 Py_UNICODE *p;
2055 const char *errmsg = "";
2056 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002057 Py_UNICODE *shiftOutStart;
2058 unsigned int base64bits = 0;
2059 unsigned long base64buffer = 0;
2060 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 PyObject *errorHandler = NULL;
2062 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002063
2064 unicode = _PyUnicode_New(size);
2065 if (!unicode)
2066 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002067 if (size == 0) {
2068 if (consumed)
2069 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002070 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002071 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002072
2073 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002074 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002075 e = s + size;
2076
2077 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002078 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002079 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002080 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002081
Antoine Pitrou244651a2009-05-04 18:56:13 +00002082 if (inShift) { /* in a base-64 section */
2083 if (IS_BASE64(ch)) { /* consume a base-64 character */
2084 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2085 base64bits += 6;
2086 s++;
2087 if (base64bits >= 16) {
2088 /* we have enough bits for a UTF-16 value */
2089 Py_UNICODE outCh = (Py_UNICODE)
2090 (base64buffer >> (base64bits-16));
2091 base64bits -= 16;
2092 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2093 if (surrogate) {
2094 /* expecting a second surrogate */
2095 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2096#ifdef Py_UNICODE_WIDE
2097 *p++ = (((surrogate & 0x3FF)<<10)
2098 | (outCh & 0x3FF)) + 0x10000;
2099#else
2100 *p++ = surrogate;
2101 *p++ = outCh;
2102#endif
2103 surrogate = 0;
2104 }
2105 else {
2106 surrogate = 0;
2107 errmsg = "second surrogate missing";
2108 goto utf7Error;
2109 }
2110 }
2111 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2112 /* first surrogate */
2113 surrogate = outCh;
2114 }
2115 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2116 errmsg = "unexpected second surrogate";
2117 goto utf7Error;
2118 }
2119 else {
2120 *p++ = outCh;
2121 }
2122 }
2123 }
2124 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002125 inShift = 0;
2126 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002127 if (surrogate) {
2128 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002129 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002130 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002131 if (base64bits > 0) { /* left-over bits */
2132 if (base64bits >= 6) {
2133 /* We've seen at least one base-64 character */
2134 errmsg = "partial character in shift sequence";
2135 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002136 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002137 else {
2138 /* Some bits remain; they should be zero */
2139 if (base64buffer != 0) {
2140 errmsg = "non-zero padding bits in shift sequence";
2141 goto utf7Error;
2142 }
2143 }
2144 }
2145 if (ch != '-') {
2146 /* '-' is absorbed; other terminating
2147 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002148 *p++ = ch;
2149 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002150 }
2151 }
2152 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002153 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002154 s++; /* consume '+' */
2155 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002156 s++;
2157 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002158 }
2159 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002160 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002161 shiftOutStart = p;
2162 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163 }
2164 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002165 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002166 *p++ = ch;
2167 s++;
2168 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002169 else {
2170 startinpos = s-starts;
2171 s++;
2172 errmsg = "unexpected special character";
2173 goto utf7Error;
2174 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002175 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002176utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002177 outpos = p-PyUnicode_AS_UNICODE(unicode);
2178 endinpos = s-starts;
2179 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002180 errors, &errorHandler,
2181 "utf7", errmsg,
2182 &starts, &e, &startinpos, &endinpos, &exc, &s,
2183 &unicode, &outpos, &p))
2184 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002185 }
2186
Antoine Pitrou244651a2009-05-04 18:56:13 +00002187 /* end of string */
2188
2189 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2190 /* if we're in an inconsistent state, that's an error */
2191 if (surrogate ||
2192 (base64bits >= 6) ||
2193 (base64bits > 0 && base64buffer != 0)) {
2194 outpos = p-PyUnicode_AS_UNICODE(unicode);
2195 endinpos = size;
2196 if (unicode_decode_call_errorhandler(
2197 errors, &errorHandler,
2198 "utf7", "unterminated shift sequence",
2199 &starts, &e, &startinpos, &endinpos, &exc, &s,
2200 &unicode, &outpos, &p))
2201 goto onError;
2202 if (s < e)
2203 goto restart;
2204 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002205 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002206
2207 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002208 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002209 if (inShift) {
2210 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002211 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002212 }
2213 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002214 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002215 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002216 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002217
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002218 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002219 goto onError;
2220
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002221 Py_XDECREF(errorHandler);
2222 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002223 return (PyObject *)unicode;
2224
Benjamin Peterson29060642009-01-31 22:14:21 +00002225 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002226 Py_XDECREF(errorHandler);
2227 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002228 Py_DECREF(unicode);
2229 return NULL;
2230}
2231
2232
2233PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002234 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002235 int base64SetO,
2236 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002237 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002238{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002239 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002240 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002241 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002242 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002243 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002244 unsigned int base64bits = 0;
2245 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002246 char * out;
2247 char * start;
2248
2249 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002250 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002251
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002252 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002253 return PyErr_NoMemory();
2254
Antoine Pitrou244651a2009-05-04 18:56:13 +00002255 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002256 if (v == NULL)
2257 return NULL;
2258
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002259 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002260 for (;i < size; ++i) {
2261 Py_UNICODE ch = s[i];
2262
Antoine Pitrou244651a2009-05-04 18:56:13 +00002263 if (inShift) {
2264 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2265 /* shifting out */
2266 if (base64bits) { /* output remaining bits */
2267 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2268 base64buffer = 0;
2269 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002270 }
2271 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002272 /* Characters not in the BASE64 set implicitly unshift the sequence
2273 so no '-' is required, except if the character is itself a '-' */
2274 if (IS_BASE64(ch) || ch == '-') {
2275 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002276 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002277 *out++ = (char) ch;
2278 }
2279 else {
2280 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002281 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002282 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002283 else { /* not in a shift sequence */
2284 if (ch == '+') {
2285 *out++ = '+';
2286 *out++ = '-';
2287 }
2288 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2289 *out++ = (char) ch;
2290 }
2291 else {
2292 *out++ = '+';
2293 inShift = 1;
2294 goto encode_char;
2295 }
2296 }
2297 continue;
2298encode_char:
2299#ifdef Py_UNICODE_WIDE
2300 if (ch >= 0x10000) {
2301 /* code first surrogate */
2302 base64bits += 16;
2303 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2304 while (base64bits >= 6) {
2305 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2306 base64bits -= 6;
2307 }
2308 /* prepare second surrogate */
2309 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2310 }
2311#endif
2312 base64bits += 16;
2313 base64buffer = (base64buffer << 16) | ch;
2314 while (base64bits >= 6) {
2315 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2316 base64bits -= 6;
2317 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002318 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002319 if (base64bits)
2320 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2321 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002322 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002323 if (_PyBytes_Resize(&v, out - start) < 0)
2324 return NULL;
2325 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002326}
2327
Antoine Pitrou244651a2009-05-04 18:56:13 +00002328#undef IS_BASE64
2329#undef FROM_BASE64
2330#undef TO_BASE64
2331#undef DECODE_DIRECT
2332#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002333
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334/* --- UTF-8 Codec -------------------------------------------------------- */
2335
/* Map a UTF-8 encoded prefix (lead) byte to the total length in bytes of the
   sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the always-overlong leads C0-C1, and F5-FF.  See RFC 3629 for
   details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2357
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002359 Py_ssize_t size,
2360 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361{
Walter Dörwald69652032004-09-07 20:24:22 +00002362 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2363}
2364
Antoine Pitrouab868312009-01-10 15:40:25 +00002365/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2366#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2367
2368/* Mask to quickly check whether a C 'long' contains a
2369 non-ASCII, UTF8-encoded char. */
2370#if (SIZEOF_LONG == 8)
2371# define ASCII_CHAR_MASK 0x8080808080808080L
2372#elif (SIZEOF_LONG == 4)
2373# define ASCII_CHAR_MASK 0x80808080L
2374#else
2375# error C 'long' size should be either 4 or 8!
2376#endif
2377
Walter Dörwald69652032004-09-07 20:24:22 +00002378PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002379 Py_ssize_t size,
2380 const char *errors,
2381 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002382{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002383 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002385 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002386 Py_ssize_t startinpos;
2387 Py_ssize_t endinpos;
2388 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002389 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390 PyUnicodeObject *unicode;
2391 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002392 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002393 PyObject *errorHandler = NULL;
2394 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395
2396 /* Note: size will always be longer than the resulting Unicode
2397 character count */
2398 unicode = _PyUnicode_New(size);
2399 if (!unicode)
2400 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002401 if (size == 0) {
2402 if (consumed)
2403 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002404 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406
2407 /* Unpack UTF-8 encoded data */
2408 p = unicode->str;
2409 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002410 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411
2412 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002413 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414
2415 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002416 /* Fast path for runs of ASCII characters. Given that common UTF-8
2417 input will consist of an overwhelming majority of ASCII
2418 characters, we try to optimize for this case by checking
2419 as many characters as a C 'long' can contain.
2420 First, check if we can do an aligned read, as most CPUs have
2421 a penalty for unaligned reads.
2422 */
2423 if (!((size_t) s & LONG_PTR_MASK)) {
2424 /* Help register allocation */
2425 register const char *_s = s;
2426 register Py_UNICODE *_p = p;
2427 while (_s < aligned_end) {
2428 /* Read a whole long at a time (either 4 or 8 bytes),
2429 and do a fast unrolled copy if it only contains ASCII
2430 characters. */
2431 unsigned long data = *(unsigned long *) _s;
2432 if (data & ASCII_CHAR_MASK)
2433 break;
2434 _p[0] = (unsigned char) _s[0];
2435 _p[1] = (unsigned char) _s[1];
2436 _p[2] = (unsigned char) _s[2];
2437 _p[3] = (unsigned char) _s[3];
2438#if (SIZEOF_LONG == 8)
2439 _p[4] = (unsigned char) _s[4];
2440 _p[5] = (unsigned char) _s[5];
2441 _p[6] = (unsigned char) _s[6];
2442 _p[7] = (unsigned char) _s[7];
2443#endif
2444 _s += SIZEOF_LONG;
2445 _p += SIZEOF_LONG;
2446 }
2447 s = _s;
2448 p = _p;
2449 if (s == e)
2450 break;
2451 ch = (unsigned char)*s;
2452 }
2453 }
2454
2455 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002456 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457 s++;
2458 continue;
2459 }
2460
2461 n = utf8_code_length[ch];
2462
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002463 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002464 if (consumed)
2465 break;
2466 else {
2467 errmsg = "unexpected end of data";
2468 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002469 endinpos = startinpos+1;
2470 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2471 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002472 goto utf8Error;
2473 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475
2476 switch (n) {
2477
2478 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002479 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 startinpos = s-starts;
2481 endinpos = startinpos+1;
2482 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483
2484 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002485 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 startinpos = s-starts;
2487 endinpos = startinpos+1;
2488 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489
2490 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002491 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002492 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002493 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002494 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002495 goto utf8Error;
2496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002498 assert ((ch > 0x007F) && (ch <= 0x07FF));
2499 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 break;
2501
2502 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002503 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2504 will result in surrogates in range d800-dfff. Surrogates are
2505 not valid UTF-8 so they are rejected.
2506 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2507 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002508 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002509 (s[2] & 0xc0) != 0x80 ||
2510 ((unsigned char)s[0] == 0xE0 &&
2511 (unsigned char)s[1] < 0xA0) ||
2512 ((unsigned char)s[0] == 0xED &&
2513 (unsigned char)s[1] > 0x9F)) {
2514 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002515 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002516 endinpos = startinpos + 1;
2517
2518 /* if s[1] first two bits are 1 and 0, then the invalid
2519 continuation byte is s[2], so increment endinpos by 1,
2520 if not, s[1] is invalid and endinpos doesn't need to
2521 be incremented. */
2522 if ((s[1] & 0xC0) == 0x80)
2523 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002524 goto utf8Error;
2525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002527 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2528 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002529 break;
2530
2531 case 4:
2532 if ((s[1] & 0xc0) != 0x80 ||
2533 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002534 (s[3] & 0xc0) != 0x80 ||
2535 ((unsigned char)s[0] == 0xF0 &&
2536 (unsigned char)s[1] < 0x90) ||
2537 ((unsigned char)s[0] == 0xF4 &&
2538 (unsigned char)s[1] > 0x8F)) {
2539 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002540 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002541 endinpos = startinpos + 1;
2542 if ((s[1] & 0xC0) == 0x80) {
2543 endinpos++;
2544 if ((s[2] & 0xC0) == 0x80)
2545 endinpos++;
2546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002547 goto utf8Error;
2548 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002549 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002550 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2551 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2552
Fredrik Lundh8f455852001-06-27 18:59:43 +00002553#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002554 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002555#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002556 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002557
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002558 /* translate from 10000..10FFFF to 0..FFFF */
2559 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002560
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002561 /* high surrogate = top 10 bits added to D800 */
2562 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002563
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002564 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002565 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002566#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 }
2569 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002570 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002571
Benjamin Peterson29060642009-01-31 22:14:21 +00002572 utf8Error:
2573 outpos = p-PyUnicode_AS_UNICODE(unicode);
2574 if (unicode_decode_call_errorhandler(
2575 errors, &errorHandler,
2576 "utf8", errmsg,
2577 &starts, &e, &startinpos, &endinpos, &exc, &s,
2578 &unicode, &outpos, &p))
2579 goto onError;
2580 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 }
Walter Dörwald69652032004-09-07 20:24:22 +00002582 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002583 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584
2585 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002586 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 goto onError;
2588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002589 Py_XDECREF(errorHandler);
2590 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 return (PyObject *)unicode;
2592
Benjamin Peterson29060642009-01-31 22:14:21 +00002593 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002594 Py_XDECREF(errorHandler);
2595 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 Py_DECREF(unicode);
2597 return NULL;
2598}
2599
Antoine Pitrouab868312009-01-10 15:40:25 +00002600#undef ASCII_CHAR_MASK
2601
2602
Tim Peters602f7402002-04-27 18:03:26 +00002603/* Allocation strategy: if the string is short, convert into a stack buffer
2604 and allocate exactly as much space needed at the end. Else allocate the
2605 maximum possible needed (4 result bytes per Unicode character), and return
2606 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002607*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002608PyObject *
2609PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002610 Py_ssize_t size,
2611 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612{
Tim Peters602f7402002-04-27 18:03:26 +00002613#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002614
Guido van Rossum98297ee2007-11-06 21:34:58 +00002615 Py_ssize_t i; /* index into s of next input byte */
2616 PyObject *result; /* result string object */
2617 char *p; /* next free byte in output buffer */
2618 Py_ssize_t nallocated; /* number of result bytes allocated */
2619 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002620 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002621 PyObject *errorHandler = NULL;
2622 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002623
Tim Peters602f7402002-04-27 18:03:26 +00002624 assert(s != NULL);
2625 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626
Tim Peters602f7402002-04-27 18:03:26 +00002627 if (size <= MAX_SHORT_UNICHARS) {
2628 /* Write into the stack buffer; nallocated can't overflow.
2629 * At the end, we'll allocate exactly as much heap space as it
2630 * turns out we need.
2631 */
2632 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002633 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002634 p = stackbuf;
2635 }
2636 else {
2637 /* Overallocate on the heap, and give the excess back at the end. */
2638 nallocated = size * 4;
2639 if (nallocated / 4 != size) /* overflow! */
2640 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002641 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002642 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002643 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002644 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002645 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002646
Tim Peters602f7402002-04-27 18:03:26 +00002647 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002648 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002649
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002650 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002651 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002653
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002655 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002656 *p++ = (char)(0xc0 | (ch >> 6));
2657 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002658 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002659#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002660 /* Special case: check for high and low surrogate */
2661 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2662 Py_UCS4 ch2 = s[i];
2663 /* Combine the two surrogates to form a UCS4 value */
2664 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2665 i++;
2666
2667 /* Encode UCS4 Unicode ordinals */
2668 *p++ = (char)(0xf0 | (ch >> 18));
2669 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002670 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2671 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002672 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002673#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002674 Py_ssize_t newpos;
2675 PyObject *rep;
2676 Py_ssize_t repsize, k;
2677 rep = unicode_encode_call_errorhandler
2678 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2679 s, size, &exc, i-1, i, &newpos);
2680 if (!rep)
2681 goto error;
2682
2683 if (PyBytes_Check(rep))
2684 repsize = PyBytes_GET_SIZE(rep);
2685 else
2686 repsize = PyUnicode_GET_SIZE(rep);
2687
2688 if (repsize > 4) {
2689 Py_ssize_t offset;
2690
2691 if (result == NULL)
2692 offset = p - stackbuf;
2693 else
2694 offset = p - PyBytes_AS_STRING(result);
2695
2696 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2697 /* integer overflow */
2698 PyErr_NoMemory();
2699 goto error;
2700 }
2701 nallocated += repsize - 4;
2702 if (result != NULL) {
2703 if (_PyBytes_Resize(&result, nallocated) < 0)
2704 goto error;
2705 } else {
2706 result = PyBytes_FromStringAndSize(NULL, nallocated);
2707 if (result == NULL)
2708 goto error;
2709 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2710 }
2711 p = PyBytes_AS_STRING(result) + offset;
2712 }
2713
2714 if (PyBytes_Check(rep)) {
2715 char *prep = PyBytes_AS_STRING(rep);
2716 for(k = repsize; k > 0; k--)
2717 *p++ = *prep++;
2718 } else /* rep is unicode */ {
2719 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2720 Py_UNICODE c;
2721
2722 for(k=0; k<repsize; k++) {
2723 c = prep[k];
2724 if (0x80 <= c) {
2725 raise_encode_exception(&exc, "utf-8", s, size,
2726 i-1, i, "surrogates not allowed");
2727 goto error;
2728 }
2729 *p++ = (char)prep[k];
2730 }
2731 }
2732 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002733#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002734 }
Victor Stinner445a6232010-04-22 20:01:57 +00002735#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002736 } else if (ch < 0x10000) {
2737 *p++ = (char)(0xe0 | (ch >> 12));
2738 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2739 *p++ = (char)(0x80 | (ch & 0x3f));
2740 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002741 /* Encode UCS4 Unicode ordinals */
2742 *p++ = (char)(0xf0 | (ch >> 18));
2743 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2744 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2745 *p++ = (char)(0x80 | (ch & 0x3f));
2746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002748
Guido van Rossum98297ee2007-11-06 21:34:58 +00002749 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002750 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002751 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002752 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002753 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002754 }
2755 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002756 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002757 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002758 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002759 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002760 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002761 Py_XDECREF(errorHandler);
2762 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002763 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002764 error:
2765 Py_XDECREF(errorHandler);
2766 Py_XDECREF(exc);
2767 Py_XDECREF(result);
2768 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002769
Tim Peters602f7402002-04-27 18:03:26 +00002770#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771}
2772
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2774{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 if (!PyUnicode_Check(unicode)) {
2776 PyErr_BadArgument();
2777 return NULL;
2778 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002779 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002780 PyUnicode_GET_SIZE(unicode),
2781 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782}
2783
Walter Dörwald41980ca2007-08-16 21:55:45 +00002784/* --- UTF-32 Codec ------------------------------------------------------- */
2785
2786PyObject *
2787PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002788 Py_ssize_t size,
2789 const char *errors,
2790 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002791{
2792 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2793}
2794
2795PyObject *
2796PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 Py_ssize_t size,
2798 const char *errors,
2799 int *byteorder,
2800 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002801{
2802 const char *starts = s;
2803 Py_ssize_t startinpos;
2804 Py_ssize_t endinpos;
2805 Py_ssize_t outpos;
2806 PyUnicodeObject *unicode;
2807 Py_UNICODE *p;
2808#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002809 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002810 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002811#else
2812 const int pairs = 0;
2813#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002814 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002815 int bo = 0; /* assume native ordering by default */
2816 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002817 /* Offsets from q for retrieving bytes in the right order. */
2818#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2819 int iorder[] = {0, 1, 2, 3};
2820#else
2821 int iorder[] = {3, 2, 1, 0};
2822#endif
2823 PyObject *errorHandler = NULL;
2824 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002825
Walter Dörwald41980ca2007-08-16 21:55:45 +00002826 q = (unsigned char *)s;
2827 e = q + size;
2828
2829 if (byteorder)
2830 bo = *byteorder;
2831
2832 /* Check for BOM marks (U+FEFF) in the input and adjust current
2833 byte order setting accordingly. In native mode, the leading BOM
2834 mark is skipped, in all other modes, it is copied to the output
2835 stream as-is (giving a ZWNBSP character). */
2836 if (bo == 0) {
2837 if (size >= 4) {
2838 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002840#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 if (bom == 0x0000FEFF) {
2842 q += 4;
2843 bo = -1;
2844 }
2845 else if (bom == 0xFFFE0000) {
2846 q += 4;
2847 bo = 1;
2848 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002849#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002850 if (bom == 0x0000FEFF) {
2851 q += 4;
2852 bo = 1;
2853 }
2854 else if (bom == 0xFFFE0000) {
2855 q += 4;
2856 bo = -1;
2857 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002858#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002860 }
2861
2862 if (bo == -1) {
2863 /* force LE */
2864 iorder[0] = 0;
2865 iorder[1] = 1;
2866 iorder[2] = 2;
2867 iorder[3] = 3;
2868 }
2869 else if (bo == 1) {
2870 /* force BE */
2871 iorder[0] = 3;
2872 iorder[1] = 2;
2873 iorder[2] = 1;
2874 iorder[3] = 0;
2875 }
2876
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002877 /* On narrow builds we split characters outside the BMP into two
2878 codepoints => count how much extra space we need. */
2879#ifndef Py_UNICODE_WIDE
2880 for (qq = q; qq < e; qq += 4)
2881 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2882 pairs++;
2883#endif
2884
2885 /* This might be one to much, because of a BOM */
2886 unicode = _PyUnicode_New((size+3)/4+pairs);
2887 if (!unicode)
2888 return NULL;
2889 if (size == 0)
2890 return (PyObject *)unicode;
2891
2892 /* Unpack UTF-32 encoded data */
2893 p = unicode->str;
2894
Walter Dörwald41980ca2007-08-16 21:55:45 +00002895 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002896 Py_UCS4 ch;
2897 /* remaining bytes at the end? (size should be divisible by 4) */
2898 if (e-q<4) {
2899 if (consumed)
2900 break;
2901 errmsg = "truncated data";
2902 startinpos = ((const char *)q)-starts;
2903 endinpos = ((const char *)e)-starts;
2904 goto utf32Error;
2905 /* The remaining input chars are ignored if the callback
2906 chooses to skip the input */
2907 }
2908 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2909 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002910
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 if (ch >= 0x110000)
2912 {
2913 errmsg = "codepoint not in range(0x110000)";
2914 startinpos = ((const char *)q)-starts;
2915 endinpos = startinpos+4;
2916 goto utf32Error;
2917 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002918#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 if (ch >= 0x10000)
2920 {
2921 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2922 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2923 }
2924 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002925#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002926 *p++ = ch;
2927 q += 4;
2928 continue;
2929 utf32Error:
2930 outpos = p-PyUnicode_AS_UNICODE(unicode);
2931 if (unicode_decode_call_errorhandler(
2932 errors, &errorHandler,
2933 "utf32", errmsg,
2934 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2935 &unicode, &outpos, &p))
2936 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002937 }
2938
2939 if (byteorder)
2940 *byteorder = bo;
2941
2942 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002943 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002944
2945 /* Adjust length */
2946 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2947 goto onError;
2948
2949 Py_XDECREF(errorHandler);
2950 Py_XDECREF(exc);
2951 return (PyObject *)unicode;
2952
Benjamin Peterson29060642009-01-31 22:14:21 +00002953 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002954 Py_DECREF(unicode);
2955 Py_XDECREF(errorHandler);
2956 Py_XDECREF(exc);
2957 return NULL;
2958}
2959
2960PyObject *
2961PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002962 Py_ssize_t size,
2963 const char *errors,
2964 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002965{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002966 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002967 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002968 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002969#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002970 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002971#else
2972 const int pairs = 0;
2973#endif
2974 /* Offsets from p for storing byte pairs in the right order. */
2975#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2976 int iorder[] = {0, 1, 2, 3};
2977#else
2978 int iorder[] = {3, 2, 1, 0};
2979#endif
2980
Benjamin Peterson29060642009-01-31 22:14:21 +00002981#define STORECHAR(CH) \
2982 do { \
2983 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2984 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2985 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2986 p[iorder[0]] = (CH) & 0xff; \
2987 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002988 } while(0)
2989
2990 /* In narrow builds we can output surrogate pairs as one codepoint,
2991 so we need less space. */
2992#ifndef Py_UNICODE_WIDE
2993 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002994 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2995 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2996 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002997#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002998 nsize = (size - pairs + (byteorder == 0));
2999 bytesize = nsize * 4;
3000 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003001 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003002 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003003 if (v == NULL)
3004 return NULL;
3005
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003006 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003007 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003008 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003009 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003010 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003011
3012 if (byteorder == -1) {
3013 /* force LE */
3014 iorder[0] = 0;
3015 iorder[1] = 1;
3016 iorder[2] = 2;
3017 iorder[3] = 3;
3018 }
3019 else if (byteorder == 1) {
3020 /* force BE */
3021 iorder[0] = 3;
3022 iorder[1] = 2;
3023 iorder[2] = 1;
3024 iorder[3] = 0;
3025 }
3026
3027 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003029#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3031 Py_UCS4 ch2 = *s;
3032 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3033 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3034 s++;
3035 size--;
3036 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003037 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003038#endif
3039 STORECHAR(ch);
3040 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003041
3042 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003043 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003044#undef STORECHAR
3045}
3046
3047PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3048{
3049 if (!PyUnicode_Check(unicode)) {
3050 PyErr_BadArgument();
3051 return NULL;
3052 }
3053 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003054 PyUnicode_GET_SIZE(unicode),
3055 NULL,
3056 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003057}
3058
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059/* --- UTF-16 Codec ------------------------------------------------------- */
3060
Tim Peters772747b2001-08-09 22:21:55 +00003061PyObject *
3062PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003063 Py_ssize_t size,
3064 const char *errors,
3065 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066{
Walter Dörwald69652032004-09-07 20:24:22 +00003067 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3068}
3069
Antoine Pitrouab868312009-01-10 15:40:25 +00003070/* Two masks for fast checking of whether a C 'long' may contain
3071 UTF16-encoded surrogate characters. This is an efficient heuristic,
3072 assuming that non-surrogate characters with a code point >= 0x8000 are
3073 rare in most input.
3074 FAST_CHAR_MASK is used when the input is in native byte ordering,
3075 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003076*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003077#if (SIZEOF_LONG == 8)
3078# define FAST_CHAR_MASK 0x8000800080008000L
3079# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3080#elif (SIZEOF_LONG == 4)
3081# define FAST_CHAR_MASK 0x80008000L
3082# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3083#else
3084# error C 'long' size should be either 4 or 8!
3085#endif
3086
Walter Dörwald69652032004-09-07 20:24:22 +00003087PyObject *
3088PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 Py_ssize_t size,
3090 const char *errors,
3091 int *byteorder,
3092 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003093{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003095 Py_ssize_t startinpos;
3096 Py_ssize_t endinpos;
3097 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098 PyUnicodeObject *unicode;
3099 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003100 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003101 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003102 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003103 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003104 /* Offsets from q for retrieving byte pairs in the right order. */
3105#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3106 int ihi = 1, ilo = 0;
3107#else
3108 int ihi = 0, ilo = 1;
3109#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003110 PyObject *errorHandler = NULL;
3111 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112
3113 /* Note: size will always be longer than the resulting Unicode
3114 character count */
3115 unicode = _PyUnicode_New(size);
3116 if (!unicode)
3117 return NULL;
3118 if (size == 0)
3119 return (PyObject *)unicode;
3120
3121 /* Unpack UTF-16 encoded data */
3122 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003123 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003124 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
3126 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003127 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003129 /* Check for BOM marks (U+FEFF) in the input and adjust current
3130 byte order setting accordingly. In native mode, the leading BOM
3131 mark is skipped, in all other modes, it is copied to the output
3132 stream as-is (giving a ZWNBSP character). */
3133 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003134 if (size >= 2) {
3135 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003136#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003137 if (bom == 0xFEFF) {
3138 q += 2;
3139 bo = -1;
3140 }
3141 else if (bom == 0xFFFE) {
3142 q += 2;
3143 bo = 1;
3144 }
Tim Petersced69f82003-09-16 20:30:58 +00003145#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 if (bom == 0xFEFF) {
3147 q += 2;
3148 bo = 1;
3149 }
3150 else if (bom == 0xFFFE) {
3151 q += 2;
3152 bo = -1;
3153 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003154#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157
Tim Peters772747b2001-08-09 22:21:55 +00003158 if (bo == -1) {
3159 /* force LE */
3160 ihi = 1;
3161 ilo = 0;
3162 }
3163 else if (bo == 1) {
3164 /* force BE */
3165 ihi = 0;
3166 ilo = 1;
3167 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003168#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3169 native_ordering = ilo < ihi;
3170#else
3171 native_ordering = ilo > ihi;
3172#endif
Tim Peters772747b2001-08-09 22:21:55 +00003173
Antoine Pitrouab868312009-01-10 15:40:25 +00003174 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003175 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003177 /* First check for possible aligned read of a C 'long'. Unaligned
3178 reads are more expensive, better to defer to another iteration. */
3179 if (!((size_t) q & LONG_PTR_MASK)) {
3180 /* Fast path for runs of non-surrogate chars. */
3181 register const unsigned char *_q = q;
3182 Py_UNICODE *_p = p;
3183 if (native_ordering) {
3184 /* Native ordering is simple: as long as the input cannot
3185 possibly contain a surrogate char, do an unrolled copy
3186 of several 16-bit code points to the target object.
3187 The non-surrogate check is done on several input bytes
3188 at a time (as many as a C 'long' can contain). */
3189 while (_q < aligned_end) {
3190 unsigned long data = * (unsigned long *) _q;
3191 if (data & FAST_CHAR_MASK)
3192 break;
3193 _p[0] = ((unsigned short *) _q)[0];
3194 _p[1] = ((unsigned short *) _q)[1];
3195#if (SIZEOF_LONG == 8)
3196 _p[2] = ((unsigned short *) _q)[2];
3197 _p[3] = ((unsigned short *) _q)[3];
3198#endif
3199 _q += SIZEOF_LONG;
3200 _p += SIZEOF_LONG / 2;
3201 }
3202 }
3203 else {
3204 /* Byteswapped ordering is similar, but we must decompose
3205 the copy bytewise, and take care of zero'ing out the
3206 upper bytes if the target object is in 32-bit units
3207 (that is, in UCS-4 builds). */
3208 while (_q < aligned_end) {
3209 unsigned long data = * (unsigned long *) _q;
3210 if (data & SWAPPED_FAST_CHAR_MASK)
3211 break;
3212 /* Zero upper bytes in UCS-4 builds */
3213#if (Py_UNICODE_SIZE > 2)
3214 _p[0] = 0;
3215 _p[1] = 0;
3216#if (SIZEOF_LONG == 8)
3217 _p[2] = 0;
3218 _p[3] = 0;
3219#endif
3220#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003221 /* Issue #4916; UCS-4 builds on big endian machines must
3222 fill the two last bytes of each 4-byte unit. */
3223#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3224# define OFF 2
3225#else
3226# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003227#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003228 ((unsigned char *) _p)[OFF + 1] = _q[0];
3229 ((unsigned char *) _p)[OFF + 0] = _q[1];
3230 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3231 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3232#if (SIZEOF_LONG == 8)
3233 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3234 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3235 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3236 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3237#endif
3238#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003239 _q += SIZEOF_LONG;
3240 _p += SIZEOF_LONG / 2;
3241 }
3242 }
3243 p = _p;
3244 q = _q;
3245 if (q >= e)
3246 break;
3247 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249
Benjamin Peterson14339b62009-01-31 16:36:08 +00003250 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003251
3252 if (ch < 0xD800 || ch > 0xDFFF) {
3253 *p++ = ch;
3254 continue;
3255 }
3256
3257 /* UTF-16 code pair: */
3258 if (q > e) {
3259 errmsg = "unexpected end of data";
3260 startinpos = (((const char *)q) - 2) - starts;
3261 endinpos = ((const char *)e) + 1 - starts;
3262 goto utf16Error;
3263 }
3264 if (0xD800 <= ch && ch <= 0xDBFF) {
3265 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3266 q += 2;
3267 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003268#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003269 *p++ = ch;
3270 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003271#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003272 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003273#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 continue;
3275 }
3276 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003277 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 startinpos = (((const char *)q)-4)-starts;
3279 endinpos = startinpos+2;
3280 goto utf16Error;
3281 }
3282
Benjamin Peterson14339b62009-01-31 16:36:08 +00003283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003284 errmsg = "illegal encoding";
3285 startinpos = (((const char *)q)-2)-starts;
3286 endinpos = startinpos+2;
3287 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003288
Benjamin Peterson29060642009-01-31 22:14:21 +00003289 utf16Error:
3290 outpos = p - PyUnicode_AS_UNICODE(unicode);
3291 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003292 errors,
3293 &errorHandler,
3294 "utf16", errmsg,
3295 &starts,
3296 (const char **)&e,
3297 &startinpos,
3298 &endinpos,
3299 &exc,
3300 (const char **)&q,
3301 &unicode,
3302 &outpos,
3303 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003304 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003306 /* remaining byte at the end? (size should be even) */
3307 if (e == q) {
3308 if (!consumed) {
3309 errmsg = "truncated data";
3310 startinpos = ((const char *)q) - starts;
3311 endinpos = ((const char *)e) + 1 - starts;
3312 outpos = p - PyUnicode_AS_UNICODE(unicode);
3313 if (unicode_decode_call_errorhandler(
3314 errors,
3315 &errorHandler,
3316 "utf16", errmsg,
3317 &starts,
3318 (const char **)&e,
3319 &startinpos,
3320 &endinpos,
3321 &exc,
3322 (const char **)&q,
3323 &unicode,
3324 &outpos,
3325 &p))
3326 goto onError;
3327 /* The remaining input chars are ignored if the callback
3328 chooses to skip the input */
3329 }
3330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331
3332 if (byteorder)
3333 *byteorder = bo;
3334
Walter Dörwald69652032004-09-07 20:24:22 +00003335 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003337
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003339 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 goto onError;
3341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 Py_XDECREF(errorHandler);
3343 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 return (PyObject *)unicode;
3345
Benjamin Peterson29060642009-01-31 22:14:21 +00003346 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003348 Py_XDECREF(errorHandler);
3349 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 return NULL;
3351}
3352
Antoine Pitrouab868312009-01-10 15:40:25 +00003353#undef FAST_CHAR_MASK
3354#undef SWAPPED_FAST_CHAR_MASK
3355
Tim Peters772747b2001-08-09 22:21:55 +00003356PyObject *
3357PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003358 Py_ssize_t size,
3359 const char *errors,
3360 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003362 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003363 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003364 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003365#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003366 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003367#else
3368 const int pairs = 0;
3369#endif
Tim Peters772747b2001-08-09 22:21:55 +00003370 /* Offsets from p for storing byte pairs in the right order. */
3371#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3372 int ihi = 1, ilo = 0;
3373#else
3374 int ihi = 0, ilo = 1;
3375#endif
3376
Benjamin Peterson29060642009-01-31 22:14:21 +00003377#define STORECHAR(CH) \
3378 do { \
3379 p[ihi] = ((CH) >> 8) & 0xff; \
3380 p[ilo] = (CH) & 0xff; \
3381 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003382 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003384#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003385 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003386 if (s[i] >= 0x10000)
3387 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003388#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003389 /* 2 * (size + pairs + (byteorder == 0)) */
3390 if (size > PY_SSIZE_T_MAX ||
3391 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003392 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003393 nsize = size + pairs + (byteorder == 0);
3394 bytesize = nsize * 2;
3395 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003397 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003398 if (v == NULL)
3399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003401 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003403 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003404 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003405 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003406
3407 if (byteorder == -1) {
3408 /* force LE */
3409 ihi = 1;
3410 ilo = 0;
3411 }
3412 else if (byteorder == 1) {
3413 /* force BE */
3414 ihi = 0;
3415 ilo = 1;
3416 }
3417
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003418 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003419 Py_UNICODE ch = *s++;
3420 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003421#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003422 if (ch >= 0x10000) {
3423 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3424 ch = 0xD800 | ((ch-0x10000) >> 10);
3425 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003426#endif
Tim Peters772747b2001-08-09 22:21:55 +00003427 STORECHAR(ch);
3428 if (ch2)
3429 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003430 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003431
3432 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003433 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003434#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435}
3436
3437PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3438{
3439 if (!PyUnicode_Check(unicode)) {
3440 PyErr_BadArgument();
3441 return NULL;
3442 }
3443 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003444 PyUnicode_GET_SIZE(unicode),
3445 NULL,
3446 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447}
3448
3449/* --- Unicode Escape Codec ----------------------------------------------- */
3450
Fredrik Lundh06d12682001-01-24 07:59:11 +00003451static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003452
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003454 Py_ssize_t size,
3455 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003458 Py_ssize_t startinpos;
3459 Py_ssize_t endinpos;
3460 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003465 char* message;
3466 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 PyObject *errorHandler = NULL;
3468 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003469
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 /* Escaped strings will always be longer than the resulting
3471 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 length after conversion to the true value.
3473 (but if the error callback returns a long replacement string
3474 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 v = _PyUnicode_New(size);
3476 if (v == NULL)
3477 goto onError;
3478 if (size == 0)
3479 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003483
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 while (s < end) {
3485 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003486 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488
3489 /* Non-escape characters are interpreted as Unicode ordinals */
3490 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003491 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 continue;
3493 }
3494
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496 /* \ - Escapes */
3497 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003498 c = *s++;
3499 if (s > end)
3500 c = '\0'; /* Invalid after \ */
3501 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 case '\n': break;
3505 case '\\': *p++ = '\\'; break;
3506 case '\'': *p++ = '\''; break;
3507 case '\"': *p++ = '\"'; break;
3508 case 'b': *p++ = '\b'; break;
3509 case 'f': *p++ = '\014'; break; /* FF */
3510 case 't': *p++ = '\t'; break;
3511 case 'n': *p++ = '\n'; break;
3512 case 'r': *p++ = '\r'; break;
3513 case 'v': *p++ = '\013'; break; /* VT */
3514 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3515
Benjamin Peterson29060642009-01-31 22:14:21 +00003516 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517 case '0': case '1': case '2': case '3':
3518 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003519 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003520 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003521 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003522 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003523 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003525 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 break;
3527
Benjamin Peterson29060642009-01-31 22:14:21 +00003528 /* hex escapes */
3529 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003531 digits = 2;
3532 message = "truncated \\xXX escape";
3533 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003537 digits = 4;
3538 message = "truncated \\uXXXX escape";
3539 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540
Benjamin Peterson29060642009-01-31 22:14:21 +00003541 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003542 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003543 digits = 8;
3544 message = "truncated \\UXXXXXXXX escape";
3545 hexescape:
3546 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 outpos = p-PyUnicode_AS_UNICODE(v);
3548 if (s+digits>end) {
3549 endinpos = size;
3550 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003551 errors, &errorHandler,
3552 "unicodeescape", "end of string in escape sequence",
3553 &starts, &end, &startinpos, &endinpos, &exc, &s,
3554 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 goto onError;
3556 goto nextByte;
3557 }
3558 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003559 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003560 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003561 endinpos = (s+i+1)-starts;
3562 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003563 errors, &errorHandler,
3564 "unicodeescape", message,
3565 &starts, &end, &startinpos, &endinpos, &exc, &s,
3566 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003567 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003569 }
3570 chr = (chr<<4) & ~0xF;
3571 if (c >= '0' && c <= '9')
3572 chr += c - '0';
3573 else if (c >= 'a' && c <= 'f')
3574 chr += 10 + c - 'a';
3575 else
3576 chr += 10 + c - 'A';
3577 }
3578 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003579 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 /* _decoding_error will have already written into the
3581 target buffer. */
3582 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003583 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003584 /* when we get here, chr is a 32-bit unicode character */
3585 if (chr <= 0xffff)
3586 /* UCS-2 character */
3587 *p++ = (Py_UNICODE) chr;
3588 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003589 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003590 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003591#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003592 *p++ = chr;
3593#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003594 chr -= 0x10000L;
3595 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003596 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003597#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003598 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599 endinpos = s-starts;
3600 outpos = p-PyUnicode_AS_UNICODE(v);
3601 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 errors, &errorHandler,
3603 "unicodeescape", "illegal Unicode character",
3604 &starts, &end, &startinpos, &endinpos, &exc, &s,
3605 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003606 goto onError;
3607 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003608 break;
3609
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003611 case 'N':
3612 message = "malformed \\N character escape";
3613 if (ucnhash_CAPI == NULL) {
3614 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003615 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003616 if (ucnhash_CAPI == NULL)
3617 goto ucnhashError;
3618 }
3619 if (*s == '{') {
3620 const char *start = s+1;
3621 /* look for the closing brace */
3622 while (*s != '}' && s < end)
3623 s++;
3624 if (s > start && s < end && *s == '}') {
3625 /* found a name. look it up in the unicode database */
3626 message = "unknown Unicode character name";
3627 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003628 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003629 goto store;
3630 }
3631 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 endinpos = s-starts;
3633 outpos = p-PyUnicode_AS_UNICODE(v);
3634 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 errors, &errorHandler,
3636 "unicodeescape", message,
3637 &starts, &end, &startinpos, &endinpos, &exc, &s,
3638 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003639 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003640 break;
3641
3642 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003643 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 message = "\\ at end of string";
3645 s--;
3646 endinpos = s-starts;
3647 outpos = p-PyUnicode_AS_UNICODE(v);
3648 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 errors, &errorHandler,
3650 "unicodeescape", message,
3651 &starts, &end, &startinpos, &endinpos, &exc, &s,
3652 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003653 goto onError;
3654 }
3655 else {
3656 *p++ = '\\';
3657 *p++ = (unsigned char)s[-1];
3658 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003659 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003664 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003666 Py_XDECREF(errorHandler);
3667 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003669
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003671 PyErr_SetString(
3672 PyExc_UnicodeError,
3673 "\\N escapes not supported (can't load unicodedata module)"
3674 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003675 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 Py_XDECREF(errorHandler);
3677 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003678 return NULL;
3679
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 Py_XDECREF(errorHandler);
3683 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 return NULL;
3685}
3686
3687/* Return a Unicode-Escape string version of the Unicode object.
3688
3689 If quotes is true, the string is enclosed in u"" or u'' quotes as
3690 appropriate.
3691
3692*/
3693
Thomas Wouters477c8d52006-05-27 19:21:47 +00003694Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 Py_ssize_t size,
3696 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003697{
3698 /* like wcschr, but doesn't stop at NULL characters */
3699
3700 while (size-- > 0) {
3701 if (*s == ch)
3702 return s;
3703 s++;
3704 }
3705
3706 return NULL;
3707}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003708
Walter Dörwald79e913e2007-05-12 11:08:06 +00003709static const char *hexdigits = "0123456789abcdef";
3710
3711PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003714 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003717#ifdef Py_UNICODE_WIDE
3718 const Py_ssize_t expandsize = 10;
3719#else
3720 const Py_ssize_t expandsize = 6;
3721#endif
3722
Thomas Wouters89f507f2006-12-13 04:49:30 +00003723 /* XXX(nnorwitz): rather than over-allocating, it would be
3724 better to choose a different scheme. Perhaps scan the
3725 first N-chars of the string and allocate based on that size.
3726 */
3727 /* Initial allocation is based on the longest-possible unichr
3728 escape.
3729
3730 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3731 unichr, so in this case it's the longest unichr escape. In
3732 narrow (UTF-16) builds this is five chars per source unichr
3733 since there are two unichrs in the surrogate pair, so in narrow
3734 (UTF-16) builds it's not the longest unichr escape.
3735
3736 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3737 so in the narrow (UTF-16) build case it's the longest unichr
3738 escape.
3739 */
3740
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003741 if (size == 0)
3742 return PyBytes_FromStringAndSize(NULL, 0);
3743
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003744 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003746
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003747 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003748 2
3749 + expandsize*size
3750 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 if (repr == NULL)
3752 return NULL;
3753
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003754 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 while (size-- > 0) {
3757 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003758
Walter Dörwald79e913e2007-05-12 11:08:06 +00003759 /* Escape backslashes */
3760 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 *p++ = '\\';
3762 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003763 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003764 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003765
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003766#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003767 /* Map 21-bit characters to '\U00xxxxxx' */
3768 else if (ch >= 0x10000) {
3769 *p++ = '\\';
3770 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003771 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3772 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3773 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3774 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3775 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3776 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3777 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3778 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003779 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003780 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003781#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3783 else if (ch >= 0xD800 && ch < 0xDC00) {
3784 Py_UNICODE ch2;
3785 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003786
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 ch2 = *s++;
3788 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003789 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3791 *p++ = '\\';
3792 *p++ = 'U';
3793 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3794 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3795 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3796 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3797 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3798 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3799 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3800 *p++ = hexdigits[ucs & 0x0000000F];
3801 continue;
3802 }
3803 /* Fall through: isolated surrogates are copied as-is */
3804 s--;
3805 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003806 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003807#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003808
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003810 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 *p++ = '\\';
3812 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003813 *p++ = hexdigits[(ch >> 12) & 0x000F];
3814 *p++ = hexdigits[(ch >> 8) & 0x000F];
3815 *p++ = hexdigits[(ch >> 4) & 0x000F];
3816 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003818
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003819 /* Map special whitespace to '\t', \n', '\r' */
3820 else if (ch == '\t') {
3821 *p++ = '\\';
3822 *p++ = 't';
3823 }
3824 else if (ch == '\n') {
3825 *p++ = '\\';
3826 *p++ = 'n';
3827 }
3828 else if (ch == '\r') {
3829 *p++ = '\\';
3830 *p++ = 'r';
3831 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003832
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003833 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003834 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003836 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003837 *p++ = hexdigits[(ch >> 4) & 0x000F];
3838 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003839 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003840
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 /* Copy everything else as-is */
3842 else
3843 *p++ = (char) ch;
3844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003846 assert(p - PyBytes_AS_STRING(repr) > 0);
3847 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3848 return NULL;
3849 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850}
3851
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003852PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003854 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 if (!PyUnicode_Check(unicode)) {
3856 PyErr_BadArgument();
3857 return NULL;
3858 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003859 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3860 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003861 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862}
3863
3864/* --- Raw Unicode Escape Codec ------------------------------------------- */
3865
3866PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 Py_ssize_t size,
3868 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003871 Py_ssize_t startinpos;
3872 Py_ssize_t endinpos;
3873 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 const char *end;
3877 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003878 PyObject *errorHandler = NULL;
3879 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003880
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 /* Escaped strings will always be longer than the resulting
3882 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 length after conversion to the true value. (But decoding error
3884 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 v = _PyUnicode_New(size);
3886 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003887 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003889 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 end = s + size;
3892 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 unsigned char c;
3894 Py_UCS4 x;
3895 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003896 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003897
Benjamin Peterson29060642009-01-31 22:14:21 +00003898 /* Non-escape characters are interpreted as Unicode ordinals */
3899 if (*s != '\\') {
3900 *p++ = (unsigned char)*s++;
3901 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003902 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003903 startinpos = s-starts;
3904
3905 /* \u-escapes are only interpreted iff the number of leading
3906 backslashes if odd */
3907 bs = s;
3908 for (;s < end;) {
3909 if (*s != '\\')
3910 break;
3911 *p++ = (unsigned char)*s++;
3912 }
3913 if (((s - bs) & 1) == 0 ||
3914 s >= end ||
3915 (*s != 'u' && *s != 'U')) {
3916 continue;
3917 }
3918 p--;
3919 count = *s=='u' ? 4 : 8;
3920 s++;
3921
3922 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3923 outpos = p-PyUnicode_AS_UNICODE(v);
3924 for (x = 0, i = 0; i < count; ++i, ++s) {
3925 c = (unsigned char)*s;
3926 if (!ISXDIGIT(c)) {
3927 endinpos = s-starts;
3928 if (unicode_decode_call_errorhandler(
3929 errors, &errorHandler,
3930 "rawunicodeescape", "truncated \\uXXXX",
3931 &starts, &end, &startinpos, &endinpos, &exc, &s,
3932 &v, &outpos, &p))
3933 goto onError;
3934 goto nextByte;
3935 }
3936 x = (x<<4) & ~0xF;
3937 if (c >= '0' && c <= '9')
3938 x += c - '0';
3939 else if (c >= 'a' && c <= 'f')
3940 x += 10 + c - 'a';
3941 else
3942 x += 10 + c - 'A';
3943 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003944 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003945 /* UCS-2 character */
3946 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003947 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003948 /* UCS-4 character. Either store directly, or as
3949 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003950#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003951 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003952#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003953 x -= 0x10000L;
3954 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3955 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003956#endif
3957 } else {
3958 endinpos = s-starts;
3959 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003960 if (unicode_decode_call_errorhandler(
3961 errors, &errorHandler,
3962 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 &starts, &end, &startinpos, &endinpos, &exc, &s,
3964 &v, &outpos, &p))
3965 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003966 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003967 nextByte:
3968 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003970 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 Py_XDECREF(errorHandler);
3973 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003975
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978 Py_XDECREF(errorHandler);
3979 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 return NULL;
3981}
3982
3983PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003984 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003986 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 char *p;
3988 char *q;
3989
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003990#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003991 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003992#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003993 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003994#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003995
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003996 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003998
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003999 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 if (repr == NULL)
4001 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004002 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004003 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004005 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 while (size-- > 0) {
4007 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004008#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004009 /* Map 32-bit characters to '\Uxxxxxxxx' */
4010 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004011 *p++ = '\\';
4012 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004013 *p++ = hexdigits[(ch >> 28) & 0xf];
4014 *p++ = hexdigits[(ch >> 24) & 0xf];
4015 *p++ = hexdigits[(ch >> 20) & 0xf];
4016 *p++ = hexdigits[(ch >> 16) & 0xf];
4017 *p++ = hexdigits[(ch >> 12) & 0xf];
4018 *p++ = hexdigits[(ch >> 8) & 0xf];
4019 *p++ = hexdigits[(ch >> 4) & 0xf];
4020 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004021 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004022 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004023#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004024 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4025 if (ch >= 0xD800 && ch < 0xDC00) {
4026 Py_UNICODE ch2;
4027 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004028
Benjamin Peterson29060642009-01-31 22:14:21 +00004029 ch2 = *s++;
4030 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004031 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004032 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4033 *p++ = '\\';
4034 *p++ = 'U';
4035 *p++ = hexdigits[(ucs >> 28) & 0xf];
4036 *p++ = hexdigits[(ucs >> 24) & 0xf];
4037 *p++ = hexdigits[(ucs >> 20) & 0xf];
4038 *p++ = hexdigits[(ucs >> 16) & 0xf];
4039 *p++ = hexdigits[(ucs >> 12) & 0xf];
4040 *p++ = hexdigits[(ucs >> 8) & 0xf];
4041 *p++ = hexdigits[(ucs >> 4) & 0xf];
4042 *p++ = hexdigits[ucs & 0xf];
4043 continue;
4044 }
4045 /* Fall through: isolated surrogates are copied as-is */
4046 s--;
4047 size++;
4048 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004049#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 /* Map 16-bit characters to '\uxxxx' */
4051 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 *p++ = '\\';
4053 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004054 *p++ = hexdigits[(ch >> 12) & 0xf];
4055 *p++ = hexdigits[(ch >> 8) & 0xf];
4056 *p++ = hexdigits[(ch >> 4) & 0xf];
4057 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 /* Copy everything else as-is */
4060 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 *p++ = (char) ch;
4062 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004063 size = p - q;
4064
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004065 assert(size > 0);
4066 if (_PyBytes_Resize(&repr, size) < 0)
4067 return NULL;
4068 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069}
4070
4071PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4072{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004073 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004075 PyErr_BadArgument();
4076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004078 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4079 PyUnicode_GET_SIZE(unicode));
4080
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004081 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082}
4083
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004084/* --- Unicode Internal Codec ------------------------------------------- */
4085
4086PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004087 Py_ssize_t size,
4088 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004089{
4090 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004091 Py_ssize_t startinpos;
4092 Py_ssize_t endinpos;
4093 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004094 PyUnicodeObject *v;
4095 Py_UNICODE *p;
4096 const char *end;
4097 const char *reason;
4098 PyObject *errorHandler = NULL;
4099 PyObject *exc = NULL;
4100
Neal Norwitzd43069c2006-01-08 01:12:10 +00004101#ifdef Py_UNICODE_WIDE
4102 Py_UNICODE unimax = PyUnicode_GetMax();
4103#endif
4104
Thomas Wouters89f507f2006-12-13 04:49:30 +00004105 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004106 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4107 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004109 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004111 p = PyUnicode_AS_UNICODE(v);
4112 end = s + size;
4113
4114 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004115 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004116 /* We have to sanity check the raw data, otherwise doom looms for
4117 some malformed UCS-4 data. */
4118 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004119#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004120 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004121#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004122 end-s < Py_UNICODE_SIZE
4123 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004125 startinpos = s - starts;
4126 if (end-s < Py_UNICODE_SIZE) {
4127 endinpos = end-starts;
4128 reason = "truncated input";
4129 }
4130 else {
4131 endinpos = s - starts + Py_UNICODE_SIZE;
4132 reason = "illegal code point (> 0x10FFFF)";
4133 }
4134 outpos = p - PyUnicode_AS_UNICODE(v);
4135 if (unicode_decode_call_errorhandler(
4136 errors, &errorHandler,
4137 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004138 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004139 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004140 goto onError;
4141 }
4142 }
4143 else {
4144 p++;
4145 s += Py_UNICODE_SIZE;
4146 }
4147 }
4148
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004149 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004150 goto onError;
4151 Py_XDECREF(errorHandler);
4152 Py_XDECREF(exc);
4153 return (PyObject *)v;
4154
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004156 Py_XDECREF(v);
4157 Py_XDECREF(errorHandler);
4158 Py_XDECREF(exc);
4159 return NULL;
4160}
4161
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162/* --- Latin-1 Codec ------------------------------------------------------ */
4163
4164PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004165 Py_ssize_t size,
4166 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167{
4168 PyUnicodeObject *v;
4169 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004170 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004173 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 Py_UNICODE r = *(unsigned char*)s;
4175 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004176 }
4177
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 v = _PyUnicode_New(size);
4179 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004184 e = s + size;
4185 /* Unrolling the copy makes it much faster by reducing the looping
4186 overhead. This is similar to what many memcpy() implementations do. */
4187 unrolled_end = e - 4;
4188 while (s < unrolled_end) {
4189 p[0] = (unsigned char) s[0];
4190 p[1] = (unsigned char) s[1];
4191 p[2] = (unsigned char) s[2];
4192 p[3] = (unsigned char) s[3];
4193 s += 4;
4194 p += 4;
4195 }
4196 while (s < e)
4197 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004199
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201 Py_XDECREF(v);
4202 return NULL;
4203}
4204
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205/* create or adjust a UnicodeEncodeError */
4206static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 const char *encoding,
4208 const Py_UNICODE *unicode, Py_ssize_t size,
4209 Py_ssize_t startpos, Py_ssize_t endpos,
4210 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 *exceptionObject = PyUnicodeEncodeError_Create(
4214 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215 }
4216 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4218 goto onError;
4219 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4220 goto onError;
4221 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4222 goto onError;
4223 return;
4224 onError:
4225 Py_DECREF(*exceptionObject);
4226 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 }
4228}
4229
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230/* raises a UnicodeEncodeError */
4231static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004232 const char *encoding,
4233 const Py_UNICODE *unicode, Py_ssize_t size,
4234 Py_ssize_t startpos, Py_ssize_t endpos,
4235 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236{
4237 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241}
4242
4243/* error handling callback helper:
4244 build arguments, call the callback and check the arguments,
4245 put the result into newpos and return the replacement string, which
4246 has to be freed by the caller */
4247static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004248 PyObject **errorHandler,
4249 const char *encoding, const char *reason,
4250 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4251 Py_ssize_t startpos, Py_ssize_t endpos,
4252 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004253{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004254 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255
4256 PyObject *restuple;
4257 PyObject *resunicode;
4258
4259 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 }
4264
4265 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269
4270 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004273 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004275 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 Py_DECREF(restuple);
4277 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004279 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 &resunicode, newpos)) {
4281 Py_DECREF(restuple);
4282 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004283 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004284 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4285 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4286 Py_DECREF(restuple);
4287 return NULL;
4288 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004291 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4293 Py_DECREF(restuple);
4294 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004295 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 Py_INCREF(resunicode);
4297 Py_DECREF(restuple);
4298 return resunicode;
4299}
4300
4301static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 Py_ssize_t size,
4303 const char *errors,
4304 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305{
4306 /* output object */
4307 PyObject *res;
4308 /* pointers to the beginning and end+1 of input */
4309 const Py_UNICODE *startp = p;
4310 const Py_UNICODE *endp = p + size;
4311 /* pointer to the beginning of the unencodable characters */
4312 /* const Py_UNICODE *badp = NULL; */
4313 /* pointer into the output */
4314 char *str;
4315 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004316 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004317 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4318 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 PyObject *errorHandler = NULL;
4320 PyObject *exc = NULL;
4321 /* the following variable is used for caching string comparisons
4322 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4323 int known_errorHandler = -1;
4324
4325 /* allocate enough for a simple encoding without
4326 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004327 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004328 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004329 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004331 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004332 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 ressize = size;
4334
4335 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004337
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 /* can we encode this? */
4339 if (c<limit) {
4340 /* no overflow check, because we know that the space is enough */
4341 *str++ = (char)c;
4342 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 else {
4345 Py_ssize_t unicodepos = p-startp;
4346 Py_ssize_t requiredsize;
4347 PyObject *repunicode;
4348 Py_ssize_t repsize;
4349 Py_ssize_t newpos;
4350 Py_ssize_t respos;
4351 Py_UNICODE *uni2;
4352 /* startpos for collecting unencodable chars */
4353 const Py_UNICODE *collstart = p;
4354 const Py_UNICODE *collend = p;
4355 /* find all unecodable characters */
4356 while ((collend < endp) && ((*collend)>=limit))
4357 ++collend;
4358 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4359 if (known_errorHandler==-1) {
4360 if ((errors==NULL) || (!strcmp(errors, "strict")))
4361 known_errorHandler = 1;
4362 else if (!strcmp(errors, "replace"))
4363 known_errorHandler = 2;
4364 else if (!strcmp(errors, "ignore"))
4365 known_errorHandler = 3;
4366 else if (!strcmp(errors, "xmlcharrefreplace"))
4367 known_errorHandler = 4;
4368 else
4369 known_errorHandler = 0;
4370 }
4371 switch (known_errorHandler) {
4372 case 1: /* strict */
4373 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4374 goto onError;
4375 case 2: /* replace */
4376 while (collstart++<collend)
4377 *str++ = '?'; /* fall through */
4378 case 3: /* ignore */
4379 p = collend;
4380 break;
4381 case 4: /* xmlcharrefreplace */
4382 respos = str - PyBytes_AS_STRING(res);
4383 /* determine replacement size (temporarily (mis)uses p) */
4384 for (p = collstart, repsize = 0; p < collend; ++p) {
4385 if (*p<10)
4386 repsize += 2+1+1;
4387 else if (*p<100)
4388 repsize += 2+2+1;
4389 else if (*p<1000)
4390 repsize += 2+3+1;
4391 else if (*p<10000)
4392 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004393#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 else
4395 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004396#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 else if (*p<100000)
4398 repsize += 2+5+1;
4399 else if (*p<1000000)
4400 repsize += 2+6+1;
4401 else
4402 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004403#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 }
4405 requiredsize = respos+repsize+(endp-collend);
4406 if (requiredsize > ressize) {
4407 if (requiredsize<2*ressize)
4408 requiredsize = 2*ressize;
4409 if (_PyBytes_Resize(&res, requiredsize))
4410 goto onError;
4411 str = PyBytes_AS_STRING(res) + respos;
4412 ressize = requiredsize;
4413 }
4414 /* generate replacement (temporarily (mis)uses p) */
4415 for (p = collstart; p < collend; ++p) {
4416 str += sprintf(str, "&#%d;", (int)*p);
4417 }
4418 p = collend;
4419 break;
4420 default:
4421 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4422 encoding, reason, startp, size, &exc,
4423 collstart-startp, collend-startp, &newpos);
4424 if (repunicode == NULL)
4425 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004426 if (PyBytes_Check(repunicode)) {
4427 /* Directly copy bytes result to output. */
4428 repsize = PyBytes_Size(repunicode);
4429 if (repsize > 1) {
4430 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004431 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004432 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4433 Py_DECREF(repunicode);
4434 goto onError;
4435 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004436 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004437 ressize += repsize-1;
4438 }
4439 memcpy(str, PyBytes_AsString(repunicode), repsize);
4440 str += repsize;
4441 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004442 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004443 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004444 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 /* need more space? (at least enough for what we
4446 have+the replacement+the rest of the string, so
4447 we won't have to check space for encodable characters) */
4448 respos = str - PyBytes_AS_STRING(res);
4449 repsize = PyUnicode_GET_SIZE(repunicode);
4450 requiredsize = respos+repsize+(endp-collend);
4451 if (requiredsize > ressize) {
4452 if (requiredsize<2*ressize)
4453 requiredsize = 2*ressize;
4454 if (_PyBytes_Resize(&res, requiredsize)) {
4455 Py_DECREF(repunicode);
4456 goto onError;
4457 }
4458 str = PyBytes_AS_STRING(res) + respos;
4459 ressize = requiredsize;
4460 }
4461 /* check if there is anything unencodable in the replacement
4462 and copy it to the output */
4463 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4464 c = *uni2;
4465 if (c >= limit) {
4466 raise_encode_exception(&exc, encoding, startp, size,
4467 unicodepos, unicodepos+1, reason);
4468 Py_DECREF(repunicode);
4469 goto onError;
4470 }
4471 *str = (char)c;
4472 }
4473 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004474 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004475 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004476 }
4477 }
    /* Resize if we allocated too much */
4479 size = str - PyBytes_AS_STRING(res);
4480 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004481 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004482 if (_PyBytes_Resize(&res, size) < 0)
4483 goto onError;
4484 }
4485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 Py_XDECREF(errorHandler);
4487 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004488 return res;
4489
4490 onError:
4491 Py_XDECREF(res);
4492 Py_XDECREF(errorHandler);
4493 Py_XDECREF(exc);
4494 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495}
4496
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 Py_ssize_t size,
4499 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004501 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502}
4503
4504PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4505{
4506 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 PyErr_BadArgument();
4508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509 }
4510 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 PyUnicode_GET_SIZE(unicode),
4512 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513}
4514
4515/* --- 7-bit ASCII Codec -------------------------------------------------- */
4516
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 Py_ssize_t size,
4519 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522 PyUnicodeObject *v;
4523 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004524 Py_ssize_t startinpos;
4525 Py_ssize_t endinpos;
4526 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 const char *e;
4528 PyObject *errorHandler = NULL;
4529 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004530
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004532 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 Py_UNICODE r = *(unsigned char*)s;
4534 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004535 }
Tim Petersced69f82003-09-16 20:30:58 +00004536
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 v = _PyUnicode_New(size);
4538 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004543 e = s + size;
4544 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 register unsigned char c = (unsigned char)*s;
4546 if (c < 128) {
4547 *p++ = c;
4548 ++s;
4549 }
4550 else {
4551 startinpos = s-starts;
4552 endinpos = startinpos + 1;
4553 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4554 if (unicode_decode_call_errorhandler(
4555 errors, &errorHandler,
4556 "ascii", "ordinal not in range(128)",
4557 &starts, &e, &startinpos, &endinpos, &exc, &s,
4558 &v, &outpos, &p))
4559 goto onError;
4560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004562 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4564 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 Py_XDECREF(errorHandler);
4566 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004568
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 Py_XDECREF(errorHandler);
4572 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573 return NULL;
4574}
4575
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 Py_ssize_t size,
4578 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581}
4582
4583PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4584{
4585 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 PyErr_BadArgument();
4587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 }
4589 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 PyUnicode_GET_SIZE(unicode),
4591 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592}
4593
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004594#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004595
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004596/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004597
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004598#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004599#define NEED_RETRY
4600#endif
4601
4602/* XXX This code is limited to "true" double-byte encodings, as
4603 a) it assumes an incomplete character consists of a single byte, and
4604 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004606
/* Return 1 if s[offset] should be treated as a DBCS lead byte, else 0.
   The byte itself must test as a lead byte; in addition the character
   before it (as located by CharPrev) must either not exist, not start
   with a lead byte, or begin exactly two bytes earlier. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4617
4618/*
4619 * Decode MBCS string into unicode object. If 'final' is set, converts
4620 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4621 */
4622static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 const char *s, /* MBCS string */
4624 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004625 int final,
4626 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004627{
4628 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004629 Py_ssize_t n;
4630 DWORD usize;
4631 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004632
4633 assert(size >= 0);
4634
Victor Stinner554f3f02010-06-16 23:33:54 +00004635 /* check and handle 'errors' arg */
4636 if (errors==NULL || strcmp(errors, "strict")==0)
4637 flags = MB_ERR_INVALID_CHARS;
4638 else if (strcmp(errors, "ignore")==0)
4639 flags = 0;
4640 else {
4641 PyErr_Format(PyExc_ValueError,
4642 "mbcs encoding does not support errors='%s'",
4643 errors);
4644 return -1;
4645 }
4646
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004647 /* Skip trailing lead-byte unless 'final' is set */
4648 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004649 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004650
4651 /* First get the size of the result */
4652 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004653 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4654 if (usize==0)
4655 goto mbcs_decode_error;
4656 } else
4657 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004658
4659 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 /* Create unicode object */
4661 *v = _PyUnicode_New(usize);
4662 if (*v == NULL)
4663 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004664 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004665 }
4666 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 /* Extend unicode object */
4668 n = PyUnicode_GET_SIZE(*v);
4669 if (_PyUnicode_Resize(v, n + usize) < 0)
4670 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004671 }
4672
4673 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004674 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004676 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4677 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004679 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004680 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004681
4682mbcs_decode_error:
4683 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4684 we raise a UnicodeDecodeError - else it is a 'generic'
4685 windows error
4686 */
4687 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4688 /* Ideally, we should get reason from FormatMessage - this
4689 is the Windows 2000 English version of the message
4690 */
4691 PyObject *exc = NULL;
4692 const char *reason = "No mapping for the Unicode character exists "
4693 "in the target multi-byte code page.";
4694 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4695 if (exc != NULL) {
4696 PyCodec_StrictErrors(exc);
4697 Py_DECREF(exc);
4698 }
4699 } else {
4700 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4701 }
4702 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004703}
4704
4705PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 Py_ssize_t size,
4707 const char *errors,
4708 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004709{
4710 PyUnicodeObject *v = NULL;
4711 int done;
4712
4713 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004714 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004715
4716#ifdef NEED_RETRY
4717 retry:
4718 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004719 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004720 else
4721#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004722 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004723
4724 if (done < 0) {
4725 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004726 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004727 }
4728
4729 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004731
4732#ifdef NEED_RETRY
4733 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 s += done;
4735 size -= done;
4736 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004737 }
4738#endif
4739
4740 return (PyObject *)v;
4741}
4742
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004743PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004744 Py_ssize_t size,
4745 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004746{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004747 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4748}
4749
4750/*
4751 * Convert unicode into string object (MBCS).
4752 * Returns 0 if succeed, -1 otherwise.
4753 */
4754static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004755 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004756 int size, /* size of unicode */
4757 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004758{
Victor Stinner554f3f02010-06-16 23:33:54 +00004759 BOOL usedDefaultChar = FALSE;
4760 BOOL *pusedDefaultChar;
4761 int mbcssize;
4762 Py_ssize_t n;
4763 PyObject *exc = NULL;
4764 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004765
4766 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004767
Victor Stinner554f3f02010-06-16 23:33:54 +00004768 /* check and handle 'errors' arg */
4769 if (errors==NULL || strcmp(errors, "strict")==0) {
4770 flags = WC_NO_BEST_FIT_CHARS;
4771 pusedDefaultChar = &usedDefaultChar;
4772 } else if (strcmp(errors, "replace")==0) {
4773 flags = 0;
4774 pusedDefaultChar = NULL;
4775 } else {
4776 PyErr_Format(PyExc_ValueError,
4777 "mbcs encoding does not support errors='%s'",
4778 errors);
4779 return -1;
4780 }
4781
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004782 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004783 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004784 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4785 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 if (mbcssize == 0) {
4787 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4788 return -1;
4789 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004790 /* If we used a default char, then we failed! */
4791 if (pusedDefaultChar && *pusedDefaultChar)
4792 goto mbcs_encode_error;
4793 } else {
4794 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004795 }
4796
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004797 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 /* Create string object */
4799 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4800 if (*repr == NULL)
4801 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004802 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004803 }
4804 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 /* Extend string object */
4806 n = PyBytes_Size(*repr);
4807 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4808 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004809 }
4810
4811 /* Do the conversion */
4812 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004814 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4815 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4817 return -1;
4818 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004819 if (pusedDefaultChar && *pusedDefaultChar)
4820 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004821 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004822 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004823
4824mbcs_encode_error:
4825 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4826 Py_XDECREF(exc);
4827 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004828}
4829
4830PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 Py_ssize_t size,
4832 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004833{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004834 PyObject *repr = NULL;
4835 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004836
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004837#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004839 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004840 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004841 else
4842#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004843 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004844
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004845 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 Py_XDECREF(repr);
4847 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004848 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004849
4850#ifdef NEED_RETRY
4851 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 p += INT_MAX;
4853 size -= INT_MAX;
4854 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004855 }
4856#endif
4857
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004858 return repr;
4859}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004860
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004861PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4862{
4863 if (!PyUnicode_Check(unicode)) {
4864 PyErr_BadArgument();
4865 return NULL;
4866 }
4867 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 PyUnicode_GET_SIZE(unicode),
4869 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004870}
4871
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004872#undef NEED_RETRY
4873
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004874#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004875
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876/* --- Character Mapping Codec -------------------------------------------- */
4877
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004879 Py_ssize_t size,
4880 PyObject *mapping,
4881 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004884 Py_ssize_t startinpos;
4885 Py_ssize_t endinpos;
4886 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 PyUnicodeObject *v;
4889 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004890 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 PyObject *errorHandler = NULL;
4892 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004893 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004894 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004895
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 /* Default to Latin-1 */
4897 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899
4900 v = _PyUnicode_New(size);
4901 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004906 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004907 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 mapstring = PyUnicode_AS_UNICODE(mapping);
4909 maplen = PyUnicode_GET_SIZE(mapping);
4910 while (s < e) {
4911 unsigned char ch = *s;
4912 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 if (ch < maplen)
4915 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 if (x == 0xfffe) {
4918 /* undefined mapping */
4919 outpos = p-PyUnicode_AS_UNICODE(v);
4920 startinpos = s-starts;
4921 endinpos = startinpos+1;
4922 if (unicode_decode_call_errorhandler(
4923 errors, &errorHandler,
4924 "charmap", "character maps to <undefined>",
4925 &starts, &e, &startinpos, &endinpos, &exc, &s,
4926 &v, &outpos, &p)) {
4927 goto onError;
4928 }
4929 continue;
4930 }
4931 *p++ = x;
4932 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004933 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004934 }
4935 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 while (s < e) {
4937 unsigned char ch = *s;
4938 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004939
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4941 w = PyLong_FromLong((long)ch);
4942 if (w == NULL)
4943 goto onError;
4944 x = PyObject_GetItem(mapping, w);
4945 Py_DECREF(w);
4946 if (x == NULL) {
4947 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4948 /* No mapping found means: mapping is undefined. */
4949 PyErr_Clear();
4950 x = Py_None;
4951 Py_INCREF(x);
4952 } else
4953 goto onError;
4954 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004955
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 /* Apply mapping */
4957 if (PyLong_Check(x)) {
4958 long value = PyLong_AS_LONG(x);
4959 if (value < 0 || value > 65535) {
4960 PyErr_SetString(PyExc_TypeError,
4961 "character mapping must be in range(65536)");
4962 Py_DECREF(x);
4963 goto onError;
4964 }
4965 *p++ = (Py_UNICODE)value;
4966 }
4967 else if (x == Py_None) {
4968 /* undefined mapping */
4969 outpos = p-PyUnicode_AS_UNICODE(v);
4970 startinpos = s-starts;
4971 endinpos = startinpos+1;
4972 if (unicode_decode_call_errorhandler(
4973 errors, &errorHandler,
4974 "charmap", "character maps to <undefined>",
4975 &starts, &e, &startinpos, &endinpos, &exc, &s,
4976 &v, &outpos, &p)) {
4977 Py_DECREF(x);
4978 goto onError;
4979 }
4980 Py_DECREF(x);
4981 continue;
4982 }
4983 else if (PyUnicode_Check(x)) {
4984 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004985
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 if (targetsize == 1)
4987 /* 1-1 mapping */
4988 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004989
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 else if (targetsize > 1) {
4991 /* 1-n mapping */
4992 if (targetsize > extrachars) {
4993 /* resize first */
4994 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4995 Py_ssize_t needed = (targetsize - extrachars) + \
4996 (targetsize << 2);
4997 extrachars += needed;
4998 /* XXX overflow detection missing */
4999 if (_PyUnicode_Resize(&v,
5000 PyUnicode_GET_SIZE(v) + needed) < 0) {
5001 Py_DECREF(x);
5002 goto onError;
5003 }
5004 p = PyUnicode_AS_UNICODE(v) + oldpos;
5005 }
5006 Py_UNICODE_COPY(p,
5007 PyUnicode_AS_UNICODE(x),
5008 targetsize);
5009 p += targetsize;
5010 extrachars -= targetsize;
5011 }
5012 /* 1-0 mapping: skip the character */
5013 }
5014 else {
5015 /* wrong return value */
5016 PyErr_SetString(PyExc_TypeError,
5017 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005018 Py_DECREF(x);
5019 goto onError;
5020 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 Py_DECREF(x);
5022 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024 }
5025 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5027 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 Py_XDECREF(errorHandler);
5029 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005031
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005033 Py_XDECREF(errorHandler);
5034 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 Py_XDECREF(v);
5036 return NULL;
5037}
5038
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005039/* Charmap encoding: the lookup table */
5040
5041struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 PyObject_HEAD
5043 unsigned char level1[32];
5044 int count2, count3;
5045 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005046};
5047
5048static PyObject*
5049encoding_map_size(PyObject *obj, PyObject* args)
5050{
5051 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005052 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005054}
5055
5056static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005057 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 PyDoc_STR("Return the size (in bytes) of this object") },
5059 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005060};
5061
5062static void
5063encoding_map_dealloc(PyObject* o)
5064{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005065 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005066}
5067
5068static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005069 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 "EncodingMap", /*tp_name*/
5071 sizeof(struct encoding_map), /*tp_basicsize*/
5072 0, /*tp_itemsize*/
5073 /* methods */
5074 encoding_map_dealloc, /*tp_dealloc*/
5075 0, /*tp_print*/
5076 0, /*tp_getattr*/
5077 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005078 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 0, /*tp_repr*/
5080 0, /*tp_as_number*/
5081 0, /*tp_as_sequence*/
5082 0, /*tp_as_mapping*/
5083 0, /*tp_hash*/
5084 0, /*tp_call*/
5085 0, /*tp_str*/
5086 0, /*tp_getattro*/
5087 0, /*tp_setattro*/
5088 0, /*tp_as_buffer*/
5089 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5090 0, /*tp_doc*/
5091 0, /*tp_traverse*/
5092 0, /*tp_clear*/
5093 0, /*tp_richcompare*/
5094 0, /*tp_weaklistoffset*/
5095 0, /*tp_iter*/
5096 0, /*tp_iternext*/
5097 encoding_map_methods, /*tp_methods*/
5098 0, /*tp_members*/
5099 0, /*tp_getset*/
5100 0, /*tp_base*/
5101 0, /*tp_dict*/
5102 0, /*tp_descr_get*/
5103 0, /*tp_descr_set*/
5104 0, /*tp_dictoffset*/
5105 0, /*tp_init*/
5106 0, /*tp_alloc*/
5107 0, /*tp_new*/
5108 0, /*tp_free*/
5109 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005110};
5111
5112PyObject*
5113PyUnicode_BuildEncodingMap(PyObject* string)
5114{
5115 Py_UNICODE *decode;
5116 PyObject *result;
5117 struct encoding_map *mresult;
5118 int i;
5119 int need_dict = 0;
5120 unsigned char level1[32];
5121 unsigned char level2[512];
5122 unsigned char *mlevel1, *mlevel2, *mlevel3;
5123 int count2 = 0, count3 = 0;
5124
5125 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5126 PyErr_BadArgument();
5127 return NULL;
5128 }
5129 decode = PyUnicode_AS_UNICODE(string);
5130 memset(level1, 0xFF, sizeof level1);
5131 memset(level2, 0xFF, sizeof level2);
5132
5133 /* If there isn't a one-to-one mapping of NULL to \0,
5134 or if there are non-BMP characters, we need to use
5135 a mapping dictionary. */
5136 if (decode[0] != 0)
5137 need_dict = 1;
5138 for (i = 1; i < 256; i++) {
5139 int l1, l2;
5140 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005141#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005142 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005143#endif
5144 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005145 need_dict = 1;
5146 break;
5147 }
5148 if (decode[i] == 0xFFFE)
5149 /* unmapped character */
5150 continue;
5151 l1 = decode[i] >> 11;
5152 l2 = decode[i] >> 7;
5153 if (level1[l1] == 0xFF)
5154 level1[l1] = count2++;
5155 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005156 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005157 }
5158
5159 if (count2 >= 0xFF || count3 >= 0xFF)
5160 need_dict = 1;
5161
5162 if (need_dict) {
5163 PyObject *result = PyDict_New();
5164 PyObject *key, *value;
5165 if (!result)
5166 return NULL;
5167 for (i = 0; i < 256; i++) {
5168 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005169 key = PyLong_FromLong(decode[i]);
5170 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005171 if (!key || !value)
5172 goto failed1;
5173 if (PyDict_SetItem(result, key, value) == -1)
5174 goto failed1;
5175 Py_DECREF(key);
5176 Py_DECREF(value);
5177 }
5178 return result;
5179 failed1:
5180 Py_XDECREF(key);
5181 Py_XDECREF(value);
5182 Py_DECREF(result);
5183 return NULL;
5184 }
5185
5186 /* Create a three-level trie */
5187 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5188 16*count2 + 128*count3 - 1);
5189 if (!result)
5190 return PyErr_NoMemory();
5191 PyObject_Init(result, &EncodingMapType);
5192 mresult = (struct encoding_map*)result;
5193 mresult->count2 = count2;
5194 mresult->count3 = count3;
5195 mlevel1 = mresult->level1;
5196 mlevel2 = mresult->level23;
5197 mlevel3 = mresult->level23 + 16*count2;
5198 memcpy(mlevel1, level1, 32);
5199 memset(mlevel2, 0xFF, 16*count2);
5200 memset(mlevel3, 0, 128*count3);
5201 count3 = 0;
5202 for (i = 1; i < 256; i++) {
5203 int o1, o2, o3, i2, i3;
5204 if (decode[i] == 0xFFFE)
5205 /* unmapped character */
5206 continue;
5207 o1 = decode[i]>>11;
5208 o2 = (decode[i]>>7) & 0xF;
5209 i2 = 16*mlevel1[o1] + o2;
5210 if (mlevel2[i2] == 0xFF)
5211 mlevel2[i2] = count3++;
5212 o3 = decode[i] & 0x7F;
5213 i3 = 128*mlevel2[i2] + o3;
5214 mlevel3[i3] = i;
5215 }
5216 return result;
5217}
5218
5219static int
5220encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5221{
5222 struct encoding_map *map = (struct encoding_map*)mapping;
5223 int l1 = c>>11;
5224 int l2 = (c>>7) & 0xF;
5225 int l3 = c & 0x7F;
5226 int i;
5227
5228#ifdef Py_UNICODE_WIDE
5229 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005231 }
5232#endif
5233 if (c == 0)
5234 return 0;
5235 /* level 1*/
5236 i = map->level1[l1];
5237 if (i == 0xFF) {
5238 return -1;
5239 }
5240 /* level 2*/
5241 i = map->level23[16*i+l2];
5242 if (i == 0xFF) {
5243 return -1;
5244 }
5245 /* level 3 */
5246 i = map->level23[16*map->count2 + 128*i + l3];
5247 if (i == 0) {
5248 return -1;
5249 }
5250 return i;
5251}
5252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253/* Lookup the character ch in the mapping. If the character
5254 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005255 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005256static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257{
Christian Heimes217cfd12007-12-02 14:31:20 +00005258 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005259 PyObject *x;
5260
5261 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005263 x = PyObject_GetItem(mapping, w);
5264 Py_DECREF(w);
5265 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5267 /* No mapping found means: mapping is undefined. */
5268 PyErr_Clear();
5269 x = Py_None;
5270 Py_INCREF(x);
5271 return x;
5272 } else
5273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005275 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005276 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005277 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 long value = PyLong_AS_LONG(x);
5279 if (value < 0 || value > 255) {
5280 PyErr_SetString(PyExc_TypeError,
5281 "character mapping must be in range(256)");
5282 Py_DECREF(x);
5283 return NULL;
5284 }
5285 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005287 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 /* wrong return value */
5291 PyErr_Format(PyExc_TypeError,
5292 "character mapping must return integer, bytes or None, not %.400s",
5293 x->ob_type->tp_name);
5294 Py_DECREF(x);
5295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 }
5297}
5298
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005299static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005300charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005301{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005302 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5303 /* exponentially overallocate to minimize reallocations */
5304 if (requiredsize < 2*outsize)
5305 requiredsize = 2*outsize;
5306 if (_PyBytes_Resize(outobj, requiredsize))
5307 return -1;
5308 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005309}
5310
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005314/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005315 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005316 space is available. Return a new reference to the object that
5317 was put in the output buffer, or Py_None, if the mapping was undefined
5318 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005319 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005321charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005323{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005324 PyObject *rep;
5325 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005326 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327
Christian Heimes90aa7642007-12-19 02:45:37 +00005328 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005329 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005331 if (res == -1)
5332 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 if (outsize<requiredsize)
5334 if (charmapencode_resize(outobj, outpos, requiredsize))
5335 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005336 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 outstart[(*outpos)++] = (char)res;
5338 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005339 }
5340
5341 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005344 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 Py_DECREF(rep);
5346 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005347 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 if (PyLong_Check(rep)) {
5349 Py_ssize_t requiredsize = *outpos+1;
5350 if (outsize<requiredsize)
5351 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5352 Py_DECREF(rep);
5353 return enc_EXCEPTION;
5354 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005355 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005357 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 else {
5359 const char *repchars = PyBytes_AS_STRING(rep);
5360 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5361 Py_ssize_t requiredsize = *outpos+repsize;
5362 if (outsize<requiredsize)
5363 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5364 Py_DECREF(rep);
5365 return enc_EXCEPTION;
5366 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005367 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 memcpy(outstart + *outpos, repchars, repsize);
5369 *outpos += repsize;
5370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005372 Py_DECREF(rep);
5373 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005374}
5375
5376/* handle an error in PyUnicode_EncodeCharmap
5377 Return 0 on success, -1 on error */
5378static
5379int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005380 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005381 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005382 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005383 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384{
5385 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005386 Py_ssize_t repsize;
5387 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005388 Py_UNICODE *uni2;
5389 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005390 Py_ssize_t collstartpos = *inpos;
5391 Py_ssize_t collendpos = *inpos+1;
5392 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005393 char *encoding = "charmap";
5394 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005395 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 /* find all unencodable characters */
5398 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005399 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005400 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 int res = encoding_map_lookup(p[collendpos], mapping);
5402 if (res != -1)
5403 break;
5404 ++collendpos;
5405 continue;
5406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005407
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 rep = charmapencode_lookup(p[collendpos], mapping);
5409 if (rep==NULL)
5410 return -1;
5411 else if (rep!=Py_None) {
5412 Py_DECREF(rep);
5413 break;
5414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005415 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005417 }
5418 /* cache callback name lookup
5419 * (if not done yet, i.e. it's the first error) */
5420 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 if ((errors==NULL) || (!strcmp(errors, "strict")))
5422 *known_errorHandler = 1;
5423 else if (!strcmp(errors, "replace"))
5424 *known_errorHandler = 2;
5425 else if (!strcmp(errors, "ignore"))
5426 *known_errorHandler = 3;
5427 else if (!strcmp(errors, "xmlcharrefreplace"))
5428 *known_errorHandler = 4;
5429 else
5430 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431 }
5432 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005433 case 1: /* strict */
5434 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5435 return -1;
5436 case 2: /* replace */
5437 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 x = charmapencode_output('?', mapping, res, respos);
5439 if (x==enc_EXCEPTION) {
5440 return -1;
5441 }
5442 else if (x==enc_FAILED) {
5443 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5444 return -1;
5445 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005446 }
5447 /* fall through */
5448 case 3: /* ignore */
5449 *inpos = collendpos;
5450 break;
5451 case 4: /* xmlcharrefreplace */
5452 /* generate replacement (temporarily (mis)uses p) */
5453 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 char buffer[2+29+1+1];
5455 char *cp;
5456 sprintf(buffer, "&#%d;", (int)p[collpos]);
5457 for (cp = buffer; *cp; ++cp) {
5458 x = charmapencode_output(*cp, mapping, res, respos);
5459 if (x==enc_EXCEPTION)
5460 return -1;
5461 else if (x==enc_FAILED) {
5462 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5463 return -1;
5464 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005465 }
5466 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005467 *inpos = collendpos;
5468 break;
5469 default:
5470 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 encoding, reason, p, size, exceptionObject,
5472 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005473 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005475 if (PyBytes_Check(repunicode)) {
5476 /* Directly copy bytes result to output. */
5477 Py_ssize_t outsize = PyBytes_Size(*res);
5478 Py_ssize_t requiredsize;
5479 repsize = PyBytes_Size(repunicode);
5480 requiredsize = *respos + repsize;
5481 if (requiredsize > outsize)
5482 /* Make room for all additional bytes. */
5483 if (charmapencode_resize(res, respos, requiredsize)) {
5484 Py_DECREF(repunicode);
5485 return -1;
5486 }
5487 memcpy(PyBytes_AsString(*res) + *respos,
5488 PyBytes_AsString(repunicode), repsize);
5489 *respos += repsize;
5490 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005491 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005492 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005493 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005494 /* generate replacement */
5495 repsize = PyUnicode_GET_SIZE(repunicode);
5496 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 x = charmapencode_output(*uni2, mapping, res, respos);
5498 if (x==enc_EXCEPTION) {
5499 return -1;
5500 }
5501 else if (x==enc_FAILED) {
5502 Py_DECREF(repunicode);
5503 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5504 return -1;
5505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005506 }
5507 *inpos = newpos;
5508 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 }
5510 return 0;
5511}
5512
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 Py_ssize_t size,
5515 PyObject *mapping,
5516 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 /* output object */
5519 PyObject *res = NULL;
5520 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005521 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005522 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005523 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 PyObject *errorHandler = NULL;
5525 PyObject *exc = NULL;
5526 /* the following variable is used for caching string comparisons
5527 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5528 * 3=ignore, 4=xmlcharrefreplace */
5529 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
5531 /* Default to Latin-1 */
5532 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005535 /* allocate enough for a simple encoding without
5536 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005537 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 if (res == NULL)
5539 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005540 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 /* try to encode it */
5545 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5546 if (x==enc_EXCEPTION) /* error */
5547 goto onError;
5548 if (x==enc_FAILED) { /* unencodable character */
5549 if (charmap_encoding_error(p, size, &inpos, mapping,
5550 &exc,
5551 &known_errorHandler, &errorHandler, errors,
5552 &res, &respos)) {
5553 goto onError;
5554 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 else
5557 /* done with this character => adjust input position */
5558 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005562 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005563 if (_PyBytes_Resize(&res, respos) < 0)
5564 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 Py_XDECREF(exc);
5567 Py_XDECREF(errorHandler);
5568 return res;
5569
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005571 Py_XDECREF(res);
5572 Py_XDECREF(exc);
5573 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 return NULL;
5575}
5576
5577PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579{
5580 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 PyErr_BadArgument();
5582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 }
5584 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 PyUnicode_GET_SIZE(unicode),
5586 mapping,
5587 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588}
5589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590/* create or adjust a UnicodeTranslateError */
5591static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 const Py_UNICODE *unicode, Py_ssize_t size,
5593 Py_ssize_t startpos, Py_ssize_t endpos,
5594 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005597 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 }
5600 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5602 goto onError;
5603 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5604 goto onError;
5605 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5606 goto onError;
5607 return;
5608 onError:
5609 Py_DECREF(*exceptionObject);
5610 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 }
5612}
5613
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614/* raises a UnicodeTranslateError */
5615static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 const Py_UNICODE *unicode, Py_ssize_t size,
5617 Py_ssize_t startpos, Py_ssize_t endpos,
5618 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619{
5620 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624}
5625
5626/* error handling callback helper:
5627 build arguments, call the callback and check the arguments,
5628 put the result into newpos and return the replacement string, which
5629 has to be freed by the caller */
5630static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 PyObject **errorHandler,
5632 const char *reason,
5633 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5634 Py_ssize_t startpos, Py_ssize_t endpos,
5635 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005637 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005638
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005639 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640 PyObject *restuple;
5641 PyObject *resunicode;
5642
5643 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005644 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 }
5648
5649 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653
5654 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005659 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 Py_DECREF(restuple);
5661 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005662 }
5663 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 &resunicode, &i_newpos)) {
5665 Py_DECREF(restuple);
5666 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005668 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005670 else
5671 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005672 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5674 Py_DECREF(restuple);
5675 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005676 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 Py_INCREF(resunicode);
5678 Py_DECREF(restuple);
5679 return resunicode;
5680}
5681
5682/* Lookup the character ch in the mapping and put the result in result,
5683 which must be decrefed by the caller.
5684 Return 0 on success, -1 on error */
5685static
5686int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5687{
Christian Heimes217cfd12007-12-02 14:31:20 +00005688 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 PyObject *x;
5690
5691 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 x = PyObject_GetItem(mapping, w);
5694 Py_DECREF(w);
5695 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5697 /* No mapping found means: use 1:1 mapping. */
5698 PyErr_Clear();
5699 *result = NULL;
5700 return 0;
5701 } else
5702 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 }
5704 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 *result = x;
5706 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005708 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 long value = PyLong_AS_LONG(x);
5710 long max = PyUnicode_GetMax();
5711 if (value < 0 || value > max) {
5712 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005713 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 Py_DECREF(x);
5715 return -1;
5716 }
5717 *result = x;
5718 return 0;
5719 }
5720 else if (PyUnicode_Check(x)) {
5721 *result = x;
5722 return 0;
5723 }
5724 else {
5725 /* wrong return value */
5726 PyErr_SetString(PyExc_TypeError,
5727 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005728 Py_DECREF(x);
5729 return -1;
5730 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731}
5732/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 if not reallocate and adjust various state variables.
5734 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735static
Walter Dörwald4894c302003-10-24 14:25:28 +00005736int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005739 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005740 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 /* remember old output position */
5742 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5743 /* exponentially overallocate to minimize reallocations */
5744 if (requiredsize < 2 * oldsize)
5745 requiredsize = 2 * oldsize;
5746 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5747 return -1;
5748 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 }
5750 return 0;
5751}
5752/* lookup the character, put the result in the output string and adjust
5753 various state variables. Return a new reference to the object that
5754 was put in the output buffer in *result, or Py_None, if the mapping was
5755 undefined (in which case no character was written).
5756 The called must decref result.
5757 Return 0 on success, -1 on error. */
5758static
Walter Dörwald4894c302003-10-24 14:25:28 +00005759int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5761 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762{
Walter Dörwald4894c302003-10-24 14:25:28 +00005763 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005765 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 /* not found => default to 1:1 mapping */
5767 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 }
5769 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005771 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 /* no overflow check, because we know that the space is enough */
5773 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 }
5775 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5777 if (repsize==1) {
5778 /* no overflow check, because we know that the space is enough */
5779 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5780 }
5781 else if (repsize!=0) {
5782 /* more than one character */
5783 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5784 (insize - (curinp-startinp)) +
5785 repsize - 1;
5786 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5787 return -1;
5788 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5789 *outp += repsize;
5790 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 }
5792 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794 return 0;
5795}
5796
5797PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 Py_ssize_t size,
5799 PyObject *mapping,
5800 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 /* output object */
5803 PyObject *res = NULL;
5804 /* pointers to the beginning and end+1 of input */
5805 const Py_UNICODE *startp = p;
5806 const Py_UNICODE *endp = p + size;
5807 /* pointer into the output */
5808 Py_UNICODE *str;
5809 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005810 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 char *reason = "character maps to <undefined>";
5812 PyObject *errorHandler = NULL;
5813 PyObject *exc = NULL;
5814 /* the following variable is used for caching string comparisons
5815 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5816 * 3=ignore, 4=xmlcharrefreplace */
5817 int known_errorHandler = -1;
5818
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 PyErr_BadArgument();
5821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823
5824 /* allocate enough for a simple 1:1 translation without
5825 replacements, if we need more, we'll resize */
5826 res = PyUnicode_FromUnicode(NULL, size);
5827 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005833 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 /* try to encode it */
5835 PyObject *x = NULL;
5836 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5837 Py_XDECREF(x);
5838 goto onError;
5839 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005840 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 if (x!=Py_None) /* it worked => adjust input pointer */
5842 ++p;
5843 else { /* untranslatable character */
5844 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5845 Py_ssize_t repsize;
5846 Py_ssize_t newpos;
5847 Py_UNICODE *uni2;
5848 /* startpos for collecting untranslatable chars */
5849 const Py_UNICODE *collstart = p;
5850 const Py_UNICODE *collend = p+1;
5851 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 /* find all untranslatable characters */
5854 while (collend < endp) {
5855 if (charmaptranslate_lookup(*collend, mapping, &x))
5856 goto onError;
5857 Py_XDECREF(x);
5858 if (x!=Py_None)
5859 break;
5860 ++collend;
5861 }
5862 /* cache callback name lookup
5863 * (if not done yet, i.e. it's the first error) */
5864 if (known_errorHandler==-1) {
5865 if ((errors==NULL) || (!strcmp(errors, "strict")))
5866 known_errorHandler = 1;
5867 else if (!strcmp(errors, "replace"))
5868 known_errorHandler = 2;
5869 else if (!strcmp(errors, "ignore"))
5870 known_errorHandler = 3;
5871 else if (!strcmp(errors, "xmlcharrefreplace"))
5872 known_errorHandler = 4;
5873 else
5874 known_errorHandler = 0;
5875 }
5876 switch (known_errorHandler) {
5877 case 1: /* strict */
5878 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005879 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 case 2: /* replace */
5881 /* No need to check for space, this is a 1:1 replacement */
5882 for (coll = collstart; coll<collend; ++coll)
5883 *str++ = '?';
5884 /* fall through */
5885 case 3: /* ignore */
5886 p = collend;
5887 break;
5888 case 4: /* xmlcharrefreplace */
5889 /* generate replacement (temporarily (mis)uses p) */
5890 for (p = collstart; p < collend; ++p) {
5891 char buffer[2+29+1+1];
5892 char *cp;
5893 sprintf(buffer, "&#%d;", (int)*p);
5894 if (charmaptranslate_makespace(&res, &str,
5895 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5896 goto onError;
5897 for (cp = buffer; *cp; ++cp)
5898 *str++ = *cp;
5899 }
5900 p = collend;
5901 break;
5902 default:
5903 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5904 reason, startp, size, &exc,
5905 collstart-startp, collend-startp, &newpos);
5906 if (repunicode == NULL)
5907 goto onError;
5908 /* generate replacement */
5909 repsize = PyUnicode_GET_SIZE(repunicode);
5910 if (charmaptranslate_makespace(&res, &str,
5911 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5912 Py_DECREF(repunicode);
5913 goto onError;
5914 }
5915 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5916 *str++ = *uni2;
5917 p = startp + newpos;
5918 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005919 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005920 }
5921 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005922 /* Resize if we allocated to much */
5923 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005924 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 if (PyUnicode_Resize(&res, respos) < 0)
5926 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 }
5928 Py_XDECREF(exc);
5929 Py_XDECREF(errorHandler);
5930 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005933 Py_XDECREF(res);
5934 Py_XDECREF(exc);
5935 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 return NULL;
5937}
5938
5939PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 PyObject *mapping,
5941 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
5943 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005944
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 str = PyUnicode_FromObject(str);
5946 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 PyUnicode_GET_SIZE(str),
5950 mapping,
5951 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 Py_DECREF(str);
5953 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005954
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 Py_XDECREF(str);
5957 return NULL;
5958}
Tim Petersced69f82003-09-16 20:30:58 +00005959
Guido van Rossum9e896b32000-04-05 20:11:21 +00005960/* --- Decimal Encoder ---------------------------------------------------- */
5961
5962int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 Py_ssize_t length,
5964 char *output,
5965 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005966{
5967 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005968 PyObject *errorHandler = NULL;
5969 PyObject *exc = NULL;
5970 const char *encoding = "decimal";
5971 const char *reason = "invalid decimal Unicode string";
5972 /* the following variable is used for caching string comparisons
5973 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5974 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005975
5976 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 PyErr_BadArgument();
5978 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005979 }
5980
5981 p = s;
5982 end = s + length;
5983 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 register Py_UNICODE ch = *p;
5985 int decimal;
5986 PyObject *repunicode;
5987 Py_ssize_t repsize;
5988 Py_ssize_t newpos;
5989 Py_UNICODE *uni2;
5990 Py_UNICODE *collstart;
5991 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005992
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005994 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 ++p;
5996 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005997 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 decimal = Py_UNICODE_TODECIMAL(ch);
5999 if (decimal >= 0) {
6000 *output++ = '0' + decimal;
6001 ++p;
6002 continue;
6003 }
6004 if (0 < ch && ch < 256) {
6005 *output++ = (char)ch;
6006 ++p;
6007 continue;
6008 }
6009 /* All other characters are considered unencodable */
6010 collstart = p;
6011 collend = p+1;
6012 while (collend < end) {
6013 if ((0 < *collend && *collend < 256) ||
6014 !Py_UNICODE_ISSPACE(*collend) ||
6015 Py_UNICODE_TODECIMAL(*collend))
6016 break;
6017 }
6018 /* cache callback name lookup
6019 * (if not done yet, i.e. it's the first error) */
6020 if (known_errorHandler==-1) {
6021 if ((errors==NULL) || (!strcmp(errors, "strict")))
6022 known_errorHandler = 1;
6023 else if (!strcmp(errors, "replace"))
6024 known_errorHandler = 2;
6025 else if (!strcmp(errors, "ignore"))
6026 known_errorHandler = 3;
6027 else if (!strcmp(errors, "xmlcharrefreplace"))
6028 known_errorHandler = 4;
6029 else
6030 known_errorHandler = 0;
6031 }
6032 switch (known_errorHandler) {
6033 case 1: /* strict */
6034 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6035 goto onError;
6036 case 2: /* replace */
6037 for (p = collstart; p < collend; ++p)
6038 *output++ = '?';
6039 /* fall through */
6040 case 3: /* ignore */
6041 p = collend;
6042 break;
6043 case 4: /* xmlcharrefreplace */
6044 /* generate replacement (temporarily (mis)uses p) */
6045 for (p = collstart; p < collend; ++p)
6046 output += sprintf(output, "&#%d;", (int)*p);
6047 p = collend;
6048 break;
6049 default:
6050 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6051 encoding, reason, s, length, &exc,
6052 collstart-s, collend-s, &newpos);
6053 if (repunicode == NULL)
6054 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006055 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006056 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006057 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6058 Py_DECREF(repunicode);
6059 goto onError;
6060 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 /* generate replacement */
6062 repsize = PyUnicode_GET_SIZE(repunicode);
6063 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6064 Py_UNICODE ch = *uni2;
6065 if (Py_UNICODE_ISSPACE(ch))
6066 *output++ = ' ';
6067 else {
6068 decimal = Py_UNICODE_TODECIMAL(ch);
6069 if (decimal >= 0)
6070 *output++ = '0' + decimal;
6071 else if (0 < ch && ch < 256)
6072 *output++ = (char)ch;
6073 else {
6074 Py_DECREF(repunicode);
6075 raise_encode_exception(&exc, encoding,
6076 s, length, collstart-s, collend-s, reason);
6077 goto onError;
6078 }
6079 }
6080 }
6081 p = s + newpos;
6082 Py_DECREF(repunicode);
6083 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006084 }
6085 /* 0-terminate the output string */
6086 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 Py_XDECREF(exc);
6088 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006089 return 0;
6090
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 Py_XDECREF(exc);
6093 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006094 return -1;
6095}
6096
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097/* --- Helpers ------------------------------------------------------------ */
6098
Eric Smith8c663262007-08-25 02:26:07 +00006099#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006100#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006101
Thomas Wouters477c8d52006-05-27 19:21:47 +00006102#include "stringlib/count.h"
6103#include "stringlib/find.h"
6104#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006105#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006106
Eric Smith5807c412008-05-11 21:00:57 +00006107#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006108#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006109#include "stringlib/localeutil.h"
6110
/* Helper macro to fix up start/end slice values against a length.

   - end greater than len is capped at len;
   - a negative end or start has len added to it and, if still negative,
     becomes 0;
   - a positive start is left untouched (it is NOT capped at len).

   start and end must be modifiable lvalues.  All three arguments may be
   evaluated more than once.  The body is wrapped in do { } while (0) so
   the macro behaves as a single statement (safe in an unbraced if/else);
   invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006125
Martin v. Löwis18e16552006-02-15 17:27:45 +00006126Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006127 PyObject *substr,
6128 Py_ssize_t start,
6129 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006131 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006132 PyUnicodeObject* str_obj;
6133 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006134
Thomas Wouters477c8d52006-05-27 19:21:47 +00006135 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6136 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006138 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6139 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 Py_DECREF(str_obj);
6141 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 }
Tim Petersced69f82003-09-16 20:30:58 +00006143
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006144 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006145 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006146 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6147 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006148 );
6149
6150 Py_DECREF(sub_obj);
6151 Py_DECREF(str_obj);
6152
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 return result;
6154}
6155
Martin v. Löwis18e16552006-02-15 17:27:45 +00006156Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006157 PyObject *sub,
6158 Py_ssize_t start,
6159 Py_ssize_t end,
6160 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006162 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006163
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006165 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006167 sub = PyUnicode_FromObject(sub);
6168 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 Py_DECREF(str);
6170 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 }
Tim Petersced69f82003-09-16 20:30:58 +00006172
Thomas Wouters477c8d52006-05-27 19:21:47 +00006173 if (direction > 0)
6174 result = stringlib_find_slice(
6175 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6176 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6177 start, end
6178 );
6179 else
6180 result = stringlib_rfind_slice(
6181 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6182 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6183 start, end
6184 );
6185
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006187 Py_DECREF(sub);
6188
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 return result;
6190}
6191
Tim Petersced69f82003-09-16 20:30:58 +00006192static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 PyUnicodeObject *substring,
6195 Py_ssize_t start,
6196 Py_ssize_t end,
6197 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 if (substring->length == 0)
6200 return 1;
6201
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006202 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 end -= substring->length;
6204 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206
6207 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 if (Py_UNICODE_MATCH(self, end, substring))
6209 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 } else {
6211 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 }
6214
6215 return 0;
6216}
6217
Martin v. Löwis18e16552006-02-15 17:27:45 +00006218Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 PyObject *substr,
6220 Py_ssize_t start,
6221 Py_ssize_t end,
6222 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006224 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006225
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 str = PyUnicode_FromObject(str);
6227 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 substr = PyUnicode_FromObject(substr);
6230 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 Py_DECREF(str);
6232 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 }
Tim Petersced69f82003-09-16 20:30:58 +00006234
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 (PyUnicodeObject *)substr,
6237 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 Py_DECREF(str);
6239 Py_DECREF(substr);
6240 return result;
6241}
6242
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243/* Apply fixfct filter to the Unicode object self and return a
6244 reference to the modified object */
6245
Tim Petersced69f82003-09-16 20:30:58 +00006246static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249{
6250
6251 PyUnicodeObject *u;
6252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006253 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006256
6257 Py_UNICODE_COPY(u->str, self->str, self->length);
6258
Tim Peters7a29bd52001-09-12 03:03:31 +00006259 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 /* fixfct should return TRUE if it modified the buffer. If
6261 FALSE, return a reference to the original buffer instead
6262 (to save space, not time) */
6263 Py_INCREF(self);
6264 Py_DECREF(u);
6265 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006266 }
6267 return (PyObject*) u;
6268}
6269
Tim Petersced69f82003-09-16 20:30:58 +00006270static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271int fixupper(PyUnicodeObject *self)
6272{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006273 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 Py_UNICODE *s = self->str;
6275 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006276
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006279
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 ch = Py_UNICODE_TOUPPER(*s);
6281 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 *s = ch;
6284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 s++;
6286 }
6287
6288 return status;
6289}
6290
Tim Petersced69f82003-09-16 20:30:58 +00006291static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292int fixlower(PyUnicodeObject *self)
6293{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006294 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 Py_UNICODE *s = self->str;
6296 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006297
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006300
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 ch = Py_UNICODE_TOLOWER(*s);
6302 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 *s = ch;
6305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 s++;
6307 }
6308
6309 return status;
6310}
6311
Tim Petersced69f82003-09-16 20:30:58 +00006312static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313int fixswapcase(PyUnicodeObject *self)
6314{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006315 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 Py_UNICODE *s = self->str;
6317 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006318
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 while (len-- > 0) {
6320 if (Py_UNICODE_ISUPPER(*s)) {
6321 *s = Py_UNICODE_TOLOWER(*s);
6322 status = 1;
6323 } else if (Py_UNICODE_ISLOWER(*s)) {
6324 *s = Py_UNICODE_TOUPPER(*s);
6325 status = 1;
6326 }
6327 s++;
6328 }
6329
6330 return status;
6331}
6332
Tim Petersced69f82003-09-16 20:30:58 +00006333static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334int fixcapitalize(PyUnicodeObject *self)
6335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006336 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006337 Py_UNICODE *s = self->str;
6338 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006339
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006340 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006342 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 *s = Py_UNICODE_TOUPPER(*s);
6344 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006346 s++;
6347 while (--len > 0) {
6348 if (Py_UNICODE_ISUPPER(*s)) {
6349 *s = Py_UNICODE_TOLOWER(*s);
6350 status = 1;
6351 }
6352 s++;
6353 }
6354 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355}
6356
6357static
6358int fixtitle(PyUnicodeObject *self)
6359{
6360 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6361 register Py_UNICODE *e;
6362 int previous_is_cased;
6363
6364 /* Shortcut for single character strings */
6365 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6367 if (*p != ch) {
6368 *p = ch;
6369 return 1;
6370 }
6371 else
6372 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 }
Tim Petersced69f82003-09-16 20:30:58 +00006374
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 e = p + PyUnicode_GET_SIZE(self);
6376 previous_is_cased = 0;
6377 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006379
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 if (previous_is_cased)
6381 *p = Py_UNICODE_TOLOWER(ch);
6382 else
6383 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006384
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 if (Py_UNICODE_ISLOWER(ch) ||
6386 Py_UNICODE_ISUPPER(ch) ||
6387 Py_UNICODE_ISTITLE(ch))
6388 previous_is_cased = 1;
6389 else
6390 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
6392 return 1;
6393}
6394
Tim Peters8ce9f162004-08-27 01:49:32 +00006395PyObject *
6396PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Skip Montanaro6543b452004-09-16 03:28:13 +00006398 const Py_UNICODE blank = ' ';
6399 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006400 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006401 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006402 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6403 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006404 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6405 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006406 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006407 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
Tim Peters05eba1f2004-08-27 21:32:02 +00006409 fseq = PySequence_Fast(seq, "");
6410 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006411 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006412 }
6413
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006414 /* NOTE: the following code can't call back into Python code,
6415 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006416 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006417
Tim Peters05eba1f2004-08-27 21:32:02 +00006418 seqlen = PySequence_Fast_GET_SIZE(fseq);
6419 /* If empty sequence, return u"". */
6420 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006421 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6422 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006423 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006424 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006425 /* If singleton sequence with an exact Unicode, return that. */
6426 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 item = items[0];
6428 if (PyUnicode_CheckExact(item)) {
6429 Py_INCREF(item);
6430 res = (PyUnicodeObject *)item;
6431 goto Done;
6432 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006433 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006434 else {
6435 /* Set up sep and seplen */
6436 if (separator == NULL) {
6437 sep = &blank;
6438 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006439 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006440 else {
6441 if (!PyUnicode_Check(separator)) {
6442 PyErr_Format(PyExc_TypeError,
6443 "separator: expected str instance,"
6444 " %.80s found",
6445 Py_TYPE(separator)->tp_name);
6446 goto onError;
6447 }
6448 sep = PyUnicode_AS_UNICODE(separator);
6449 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006450 }
6451 }
6452
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006453 /* There are at least two things to join, or else we have a subclass
6454 * of str in the sequence.
6455 * Do a pre-pass to figure out the total amount of space we'll
6456 * need (sz), and see whether all argument are strings.
6457 */
6458 sz = 0;
6459 for (i = 0; i < seqlen; i++) {
6460 const Py_ssize_t old_sz = sz;
6461 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 if (!PyUnicode_Check(item)) {
6463 PyErr_Format(PyExc_TypeError,
6464 "sequence item %zd: expected str instance,"
6465 " %.80s found",
6466 i, Py_TYPE(item)->tp_name);
6467 goto onError;
6468 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006469 sz += PyUnicode_GET_SIZE(item);
6470 if (i != 0)
6471 sz += seplen;
6472 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6473 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006475 goto onError;
6476 }
6477 }
Tim Petersced69f82003-09-16 20:30:58 +00006478
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006479 res = _PyUnicode_New(sz);
6480 if (res == NULL)
6481 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006482
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006483 /* Catenate everything. */
6484 res_p = PyUnicode_AS_UNICODE(res);
6485 for (i = 0; i < seqlen; ++i) {
6486 Py_ssize_t itemlen;
6487 item = items[i];
6488 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 /* Copy item, and maybe the separator. */
6490 if (i) {
6491 Py_UNICODE_COPY(res_p, sep, seplen);
6492 res_p += seplen;
6493 }
6494 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6495 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006496 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006497
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006499 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 return (PyObject *)res;
6501
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006503 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006504 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 return NULL;
6506}
6507
Tim Petersced69f82003-09-16 20:30:58 +00006508static
6509PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 Py_ssize_t left,
6511 Py_ssize_t right,
6512 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513{
6514 PyUnicodeObject *u;
6515
6516 if (left < 0)
6517 left = 0;
6518 if (right < 0)
6519 right = 0;
6520
Tim Peters7a29bd52001-09-12 03:03:31 +00006521 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 Py_INCREF(self);
6523 return self;
6524 }
6525
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006526 if (left > PY_SSIZE_T_MAX - self->length ||
6527 right > PY_SSIZE_T_MAX - (left + self->length)) {
6528 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6529 return NULL;
6530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 u = _PyUnicode_New(left + self->length + right);
6532 if (u) {
6533 if (left)
6534 Py_UNICODE_FILL(u->str, fill, left);
6535 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6536 if (right)
6537 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6538 }
6539
6540 return u;
6541}
6542
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006543PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546
6547 string = PyUnicode_FromObject(string);
6548 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006551 list = stringlib_splitlines(
6552 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6553 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554
6555 Py_DECREF(string);
6556 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557}
6558
Tim Petersced69f82003-09-16 20:30:58 +00006559static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 PyUnicodeObject *substring,
6562 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006565 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006568 return stringlib_split_whitespace(
6569 (PyObject*) self, self->str, self->length, maxcount
6570 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006572 return stringlib_split(
6573 (PyObject*) self, self->str, self->length,
6574 substring->str, substring->length,
6575 maxcount
6576 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577}
6578
Tim Petersced69f82003-09-16 20:30:58 +00006579static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006580PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 PyUnicodeObject *substring,
6582 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006583{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006584 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006585 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006586
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006587 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006588 return stringlib_rsplit_whitespace(
6589 (PyObject*) self, self->str, self->length, maxcount
6590 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006591
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006592 return stringlib_rsplit(
6593 (PyObject*) self, self->str, self->length,
6594 substring->str, substring->length,
6595 maxcount
6596 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006597}
6598
6599static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 PyUnicodeObject *str1,
6602 PyUnicodeObject *str2,
6603 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
6605 PyUnicodeObject *u;
6606
6607 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006609 else if (maxcount == 0 || self->length == 0)
6610 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
Thomas Wouters477c8d52006-05-27 19:21:47 +00006612 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006613 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006614 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006615 if (str1->length == 0)
6616 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006617 if (str1->length == 1) {
6618 /* replace characters */
6619 Py_UNICODE u1, u2;
6620 if (!findchar(self->str, self->length, str1->str[0]))
6621 goto nothing;
6622 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6623 if (!u)
6624 return NULL;
6625 Py_UNICODE_COPY(u->str, self->str, self->length);
6626 u1 = str1->str[0];
6627 u2 = str2->str[0];
6628 for (i = 0; i < u->length; i++)
6629 if (u->str[i] == u1) {
6630 if (--maxcount < 0)
6631 break;
6632 u->str[i] = u2;
6633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006635 i = stringlib_find(
6636 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006638 if (i < 0)
6639 goto nothing;
6640 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6641 if (!u)
6642 return NULL;
6643 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006644
6645 /* change everything in-place, starting with this one */
6646 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6647 i += str1->length;
6648
6649 while ( --maxcount > 0) {
6650 i = stringlib_find(self->str+i, self->length-i,
6651 str1->str, str1->length,
6652 i);
6653 if (i == -1)
6654 break;
6655 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6656 i += str1->length;
6657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006660
6661 Py_ssize_t n, i, j, e;
6662 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 Py_UNICODE *p;
6664
6665 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006666 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6667 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006668 if (n == 0)
6669 goto nothing;
6670 /* new_size = self->length + n * (str2->length - str1->length)); */
6671 delta = (str2->length - str1->length);
6672 if (delta == 0) {
6673 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006675 product = n * (str2->length - str1->length);
6676 if ((product / (str2->length - str1->length)) != n) {
6677 PyErr_SetString(PyExc_OverflowError,
6678 "replace string is too long");
6679 return NULL;
6680 }
6681 new_size = self->length + product;
6682 if (new_size < 0) {
6683 PyErr_SetString(PyExc_OverflowError,
6684 "replace string is too long");
6685 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 }
6687 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006688 u = _PyUnicode_New(new_size);
6689 if (!u)
6690 return NULL;
6691 i = 0;
6692 p = u->str;
6693 e = self->length - str1->length;
6694 if (str1->length > 0) {
6695 while (n-- > 0) {
6696 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006697 j = stringlib_find(self->str+i, self->length-i,
6698 str1->str, str1->length,
6699 i);
6700 if (j == -1)
6701 break;
6702 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006703 /* copy unchanged part [i:j] */
6704 Py_UNICODE_COPY(p, self->str+i, j-i);
6705 p += j - i;
6706 }
6707 /* copy substitution string */
6708 if (str2->length > 0) {
6709 Py_UNICODE_COPY(p, str2->str, str2->length);
6710 p += str2->length;
6711 }
6712 i = j + str1->length;
6713 }
6714 if (i < self->length)
6715 /* copy tail [i:] */
6716 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6717 } else {
6718 /* interleave */
6719 while (n > 0) {
6720 Py_UNICODE_COPY(p, str2->str, str2->length);
6721 p += str2->length;
6722 if (--n <= 0)
6723 break;
6724 *p++ = self->str[i++];
6725 }
6726 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006730
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006732 /* nothing to replace; return original string (when possible) */
6733 if (PyUnicode_CheckExact(self)) {
6734 Py_INCREF(self);
6735 return (PyObject *) self;
6736 }
6737 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738}
6739
6740/* --- Unicode Object Methods --------------------------------------------- */
6741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006742PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744\n\
6745Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006746characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747
6748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006749unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 return fixup(self, fixtitle);
6752}
6753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006754PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756\n\
6757Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006758have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
6760static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006761unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 return fixup(self, fixcapitalize);
6764}
6765
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list of
   words, replace every list entry by its fixcapitalize'd form, then join
   the list again.  Returns NULL if splitting or any fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6803
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006804/* Argument converter. Coerces to a single unicode character */
6805
6806static int
6807convert_uc(PyObject *obj, void *addr)
6808{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006809 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6810 PyObject *uniobj;
6811 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006812
Benjamin Peterson14339b62009-01-31 16:36:08 +00006813 uniobj = PyUnicode_FromObject(obj);
6814 if (uniobj == NULL) {
6815 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006817 return 0;
6818 }
6819 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6820 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006822 Py_DECREF(uniobj);
6823 return 0;
6824 }
6825 unistr = PyUnicode_AS_UNICODE(uniobj);
6826 *fillcharloc = unistr[0];
6827 Py_DECREF(uniobj);
6828 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006829}
6830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006831PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006834Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006835done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836
6837static PyObject *
6838unicode_center(PyUnicodeObject *self, PyObject *args)
6839{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006840 Py_ssize_t marg, left;
6841 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006842 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
Thomas Woutersde017742006-02-16 19:34:37 +00006844 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 return NULL;
6846
Tim Peters7a29bd52001-09-12 03:03:31 +00006847 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 Py_INCREF(self);
6849 return (PyObject*) self;
6850 }
6851
6852 marg = width - self->length;
6853 left = marg / 2 + (marg & width & 1);
6854
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006855 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856}
6857
Marc-André Lemburge5034372000-08-08 08:04:29 +00006858#if 0
6859
6860/* This code should go into some future Unicode collation support
6861 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006862 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006863
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006864/* speedy UTF-16 code point order comparison */
6865/* gleaned from: */
6866/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6867
/* Adjustment table indexed by (code unit >> 11).  Entries 0..26 are zero,
   entry 27 (units 0xD800..0xDFFF) adds 0x2000 and entries 28..31
   (units 0xE000..0xFFFF) subtract 0x800. */
static short utf16Fixup[32] =
{
    /*  0.. 3 */ 0, 0, 0, 0,
    /*  4.. 7 */ 0, 0, 0, 0,
    /*  8..11 */ 0, 0, 0, 0,
    /* 12..15 */ 0, 0, 0, 0,
    /* 16..19 */ 0, 0, 0, 0,
    /* 20..23 */ 0, 0, 0, 0,
    /* 24..27 */ 0, 0, 0, 0x2000,
    /* 28..31 */ -0x800, -0x800, -0x800, -0x800
};
6875
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876static int
6877unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6878{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006879 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006880
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 Py_UNICODE *s1 = str1->str;
6882 Py_UNICODE *s2 = str2->str;
6883
6884 len1 = str1->length;
6885 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006886
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006888 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006889
6890 c1 = *s1++;
6891 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006892
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 if (c1 > (1<<11) * 26)
6894 c1 += utf16Fixup[c1>>11];
6895 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006896 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006897 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006898
6899 if (c1 != c2)
6900 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006901
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006902 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 }
6904
6905 return (len1 < len2) ? -1 : (len1 != len2);
6906}
6907
Marc-André Lemburge5034372000-08-08 08:04:29 +00006908#else
6909
6910static int
6911unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6912{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006913 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006914
6915 Py_UNICODE *s1 = str1->str;
6916 Py_UNICODE *s2 = str2->str;
6917
6918 len1 = str1->length;
6919 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006920
Marc-André Lemburge5034372000-08-08 08:04:29 +00006921 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006922 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006923
Fredrik Lundh45714e92001-06-26 16:39:36 +00006924 c1 = *s1++;
6925 c2 = *s2++;
6926
6927 if (c1 != c2)
6928 return (c1 < c2) ? -1 : 1;
6929
Marc-André Lemburge5034372000-08-08 08:04:29 +00006930 len1--; len2--;
6931 }
6932
6933 return (len1 < len2) ? -1 : (len1 != len2);
6934}
6935
6936#endif
6937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006941 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6942 return unicode_compare((PyUnicodeObject *)left,
6943 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006944 PyErr_Format(PyExc_TypeError,
6945 "Can't compare %.100s and %.100s",
6946 left->ob_type->tp_name,
6947 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 return -1;
6949}
6950
Martin v. Löwis5b222132007-06-10 09:51:05 +00006951int
6952PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6953{
6954 int i;
6955 Py_UNICODE *id;
6956 assert(PyUnicode_Check(uni));
6957 id = PyUnicode_AS_UNICODE(uni);
6958 /* Compare Unicode string and source character set string */
6959 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006960 if (id[i] != str[i])
6961 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006962 /* This check keeps Python strings that end in '\0' from comparing equal
6963 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006964 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006966 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006968 return 0;
6969}
6970
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006971
Benjamin Peterson29060642009-01-31 22:14:21 +00006972#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006973 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006974
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006975PyObject *PyUnicode_RichCompare(PyObject *left,
6976 PyObject *right,
6977 int op)
6978{
6979 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006980
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006981 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6982 PyObject *v;
6983 if (((PyUnicodeObject *) left)->length !=
6984 ((PyUnicodeObject *) right)->length) {
6985 if (op == Py_EQ) {
6986 Py_INCREF(Py_False);
6987 return Py_False;
6988 }
6989 if (op == Py_NE) {
6990 Py_INCREF(Py_True);
6991 return Py_True;
6992 }
6993 }
6994 if (left == right)
6995 result = 0;
6996 else
6997 result = unicode_compare((PyUnicodeObject *)left,
6998 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006999
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007000 /* Convert the return value to a Boolean */
7001 switch (op) {
7002 case Py_EQ:
7003 v = TEST_COND(result == 0);
7004 break;
7005 case Py_NE:
7006 v = TEST_COND(result != 0);
7007 break;
7008 case Py_LE:
7009 v = TEST_COND(result <= 0);
7010 break;
7011 case Py_GE:
7012 v = TEST_COND(result >= 0);
7013 break;
7014 case Py_LT:
7015 v = TEST_COND(result == -1);
7016 break;
7017 case Py_GT:
7018 v = TEST_COND(result == 1);
7019 break;
7020 default:
7021 PyErr_BadArgument();
7022 return NULL;
7023 }
7024 Py_INCREF(v);
7025 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007026 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007027
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007028 Py_INCREF(Py_NotImplemented);
7029 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007030}
7031
Guido van Rossum403d68b2000-03-13 15:55:09 +00007032int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007034{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007035 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007036 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007037
7038 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007039 sub = PyUnicode_FromObject(element);
7040 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 PyErr_Format(PyExc_TypeError,
7042 "'in <string>' requires string as left operand, not %s",
7043 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007044 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007045 }
7046
Thomas Wouters477c8d52006-05-27 19:21:47 +00007047 str = PyUnicode_FromObject(container);
7048 if (!str) {
7049 Py_DECREF(sub);
7050 return -1;
7051 }
7052
7053 result = stringlib_contains_obj(str, sub);
7054
7055 Py_DECREF(str);
7056 Py_DECREF(sub);
7057
Guido van Rossum403d68b2000-03-13 15:55:09 +00007058 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007059}
7060
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061/* Concat to string or Unicode object giving a new Unicode object. */
7062
7063PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065{
7066 PyUnicodeObject *u = NULL, *v = NULL, *w;
7067
7068 /* Coerce the two arguments */
7069 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7070 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7073 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075
7076 /* Shortcuts */
7077 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 Py_DECREF(v);
7079 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 }
7081 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 Py_DECREF(u);
7083 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084 }
7085
7086 /* Concat the two Unicode strings */
7087 w = _PyUnicode_New(u->length + v->length);
7088 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 Py_UNICODE_COPY(w->str, u->str, u->length);
7091 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7092
7093 Py_DECREF(u);
7094 Py_DECREF(v);
7095 return (PyObject *)w;
7096
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 Py_XDECREF(u);
7099 Py_XDECREF(v);
7100 return NULL;
7101}
7102
Walter Dörwald1ab83302007-05-18 17:15:44 +00007103void
7104PyUnicode_Append(PyObject **pleft, PyObject *right)
7105{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007106 PyObject *new;
7107 if (*pleft == NULL)
7108 return;
7109 if (right == NULL || !PyUnicode_Check(*pleft)) {
7110 Py_DECREF(*pleft);
7111 *pleft = NULL;
7112 return;
7113 }
7114 new = PyUnicode_Concat(*pleft, right);
7115 Py_DECREF(*pleft);
7116 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007117}
7118
7119void
7120PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7121{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007122 PyUnicode_Append(pleft, right);
7123 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007124}
7125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007126PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007129Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007130string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007131interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132
7133static PyObject *
7134unicode_count(PyUnicodeObject *self, PyObject *args)
7135{
7136 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007137 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007138 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139 PyObject *result;
7140
Guido van Rossumb8872e62000-05-09 14:14:27 +00007141 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 return NULL;
7144
7145 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007146 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007149
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007150 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007151 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007152 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007153 substring->str, substring->length,
7154 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007155 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156
7157 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007158
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 return result;
7160}
7161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007162PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007165Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007166to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007167handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007168a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7169'xmlcharrefreplace' as well as any other name registered with\n\
7170codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171
7172static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007173unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007175 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 char *encoding = NULL;
7177 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007178 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007179
Benjamin Peterson308d6372009-09-18 21:42:35 +00007180 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7181 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007183 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007184 if (v == NULL)
7185 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007186 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007187 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007188 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007189 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007190 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007191 Py_DECREF(v);
7192 return NULL;
7193 }
7194 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007195
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007197 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007198}
7199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007200PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202\n\
7203Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007204If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205
7206static PyObject*
7207unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7208{
7209 Py_UNICODE *e;
7210 Py_UNICODE *p;
7211 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007212 Py_UNICODE *qe;
7213 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 PyUnicodeObject *u;
7215 int tabsize = 8;
7216
7217 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219
Thomas Wouters7e474022000-07-16 12:04:32 +00007220 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007221 i = 0; /* chars up to and including most recent \n or \r */
7222 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7223 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 for (p = self->str; p < e; p++)
7225 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 if (tabsize > 0) {
7227 incr = tabsize - (j % tabsize); /* cannot overflow */
7228 if (j > PY_SSIZE_T_MAX - incr)
7229 goto overflow1;
7230 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007231 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 if (j > PY_SSIZE_T_MAX - 1)
7235 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236 j++;
7237 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 if (i > PY_SSIZE_T_MAX - j)
7239 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007241 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 }
7243 }
7244
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007245 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007247
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 /* Second pass: create output string and fill it */
7249 u = _PyUnicode_New(i + j);
7250 if (!u)
7251 return NULL;
7252
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007253 j = 0; /* same as in first pass */
7254 q = u->str; /* next output char */
7255 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256
7257 for (p = self->str; p < e; p++)
7258 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 if (tabsize > 0) {
7260 i = tabsize - (j % tabsize);
7261 j += i;
7262 while (i--) {
7263 if (q >= qe)
7264 goto overflow2;
7265 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007266 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007268 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 else {
7270 if (q >= qe)
7271 goto overflow2;
7272 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007273 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 if (*p == '\n' || *p == '\r')
7275 j = 0;
7276 }
7277
7278 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007279
7280 overflow2:
7281 Py_DECREF(u);
7282 overflow1:
7283 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285}
7286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007287PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007288 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289\n\
7290Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007291such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292arguments start and end are interpreted as in slice notation.\n\
7293\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007294Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295
7296static PyObject *
7297unicode_find(PyUnicodeObject *self, PyObject *args)
7298{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007299 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007300 Py_ssize_t start;
7301 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007302 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303
Christian Heimes9cd17752007-11-18 19:35:23 +00007304 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
Thomas Wouters477c8d52006-05-27 19:21:47 +00007307 result = stringlib_find_slice(
7308 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7309 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7310 start, end
7311 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
7313 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007314
Christian Heimes217cfd12007-12-02 14:31:20 +00007315 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316}
7317
7318static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007319unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320{
7321 if (index < 0 || index >= self->length) {
7322 PyErr_SetString(PyExc_IndexError, "string index out of range");
7323 return NULL;
7324 }
7325
7326 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7327}
7328
Guido van Rossumc2504932007-09-18 19:42:40 +00007329/* Believe it or not, this produces the same value for ASCII strings
7330 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007332unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333{
Guido van Rossumc2504932007-09-18 19:42:40 +00007334 Py_ssize_t len;
7335 Py_UNICODE *p;
7336 long x;
7337
7338 if (self->hash != -1)
7339 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007340 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007341 p = self->str;
7342 x = *p << 7;
7343 while (--len >= 0)
7344 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007345 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007346 if (x == -1)
7347 x = -2;
7348 self->hash = x;
7349 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350}
7351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007352PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007355Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356
7357static PyObject *
7358unicode_index(PyUnicodeObject *self, PyObject *args)
7359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007360 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007361 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007362 Py_ssize_t start;
7363 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364
Christian Heimes9cd17752007-11-18 19:35:23 +00007365 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
Thomas Wouters477c8d52006-05-27 19:21:47 +00007368 result = stringlib_find_slice(
7369 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7370 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7371 start, end
7372 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
7374 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007375
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 if (result < 0) {
7377 PyErr_SetString(PyExc_ValueError, "substring not found");
7378 return NULL;
7379 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007380
Christian Heimes217cfd12007-12-02 14:31:20 +00007381 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382}
7383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007384PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007387Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007388at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
7390static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007391unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392{
7393 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7394 register const Py_UNICODE *e;
7395 int cased;
7396
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 /* Shortcut for single character strings */
7398 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007401 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007402 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007404
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 e = p + PyUnicode_GET_SIZE(self);
7406 cased = 0;
7407 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007409
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7411 return PyBool_FromLong(0);
7412 else if (!cased && Py_UNICODE_ISLOWER(ch))
7413 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007415 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416}
7417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007418PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007421Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007422at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423
7424static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007425unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426{
7427 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7428 register const Py_UNICODE *e;
7429 int cased;
7430
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 /* Shortcut for single character strings */
7432 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007435 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007436 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007438
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 e = p + PyUnicode_GET_SIZE(self);
7440 cased = 0;
7441 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007443
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7445 return PyBool_FromLong(0);
7446 else if (!cased && Py_UNICODE_ISUPPER(ch))
7447 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007449 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450}
7451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007452PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007453 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007455Return True if S is a titlecased string and there is at least one\n\
7456character in S, i.e. upper- and titlecase characters may only\n\
7457follow uncased characters and lowercase characters only cased ones.\n\
7458Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459
7460static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007461unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462{
7463 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7464 register const Py_UNICODE *e;
7465 int cased, previous_is_cased;
7466
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 /* Shortcut for single character strings */
7468 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7470 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007472 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007473 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007475
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 e = p + PyUnicode_GET_SIZE(self);
7477 cased = 0;
7478 previous_is_cased = 0;
7479 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007481
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7483 if (previous_is_cased)
7484 return PyBool_FromLong(0);
7485 previous_is_cased = 1;
7486 cased = 1;
7487 }
7488 else if (Py_UNICODE_ISLOWER(ch)) {
7489 if (!previous_is_cased)
7490 return PyBool_FromLong(0);
7491 previous_is_cased = 1;
7492 cased = 1;
7493 }
7494 else
7495 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007497 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498}
7499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007500PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007503Return True if all characters in S are whitespace\n\
7504and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505
7506static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007507unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508{
7509 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7510 register const Py_UNICODE *e;
7511
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 /* Shortcut for single character strings */
7513 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 Py_UNICODE_ISSPACE(*p))
7515 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007517 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007518 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007520
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521 e = p + PyUnicode_GET_SIZE(self);
7522 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 if (!Py_UNICODE_ISSPACE(*p))
7524 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007526 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527}
7528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007529PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007531\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007532Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007533and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007534
7535static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007536unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007537{
7538 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7539 register const Py_UNICODE *e;
7540
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007541 /* Shortcut for single character strings */
7542 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 Py_UNICODE_ISALPHA(*p))
7544 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007545
7546 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007547 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007549
7550 e = p + PyUnicode_GET_SIZE(self);
7551 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 if (!Py_UNICODE_ISALPHA(*p))
7553 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007554 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007555 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007556}
7557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007558PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007560\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007561Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007562and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007563
7564static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007565unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007566{
7567 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7568 register const Py_UNICODE *e;
7569
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007570 /* Shortcut for single character strings */
7571 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 Py_UNICODE_ISALNUM(*p))
7573 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007574
7575 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007576 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007578
7579 e = p + PyUnicode_GET_SIZE(self);
7580 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 if (!Py_UNICODE_ISALNUM(*p))
7582 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007583 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007584 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007585}
7586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007587PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007590Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007591False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
7593static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007594unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595{
7596 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7597 register const Py_UNICODE *e;
7598
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 /* Shortcut for single character strings */
7600 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 Py_UNICODE_ISDECIMAL(*p))
7602 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007604 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007605 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007607
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 e = p + PyUnicode_GET_SIZE(self);
7609 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 if (!Py_UNICODE_ISDECIMAL(*p))
7611 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007613 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614}
7615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007616PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007619Return True if all characters in S are digits\n\
7620and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621
7622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007623unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624{
7625 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7626 register const Py_UNICODE *e;
7627
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 /* Shortcut for single character strings */
7629 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 Py_UNICODE_ISDIGIT(*p))
7631 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007633 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007634 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007636
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 e = p + PyUnicode_GET_SIZE(self);
7638 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 if (!Py_UNICODE_ISDIGIT(*p))
7640 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007642 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007645PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007648Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
7651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007652unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653{
7654 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7655 register const Py_UNICODE *e;
7656
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 /* Shortcut for single character strings */
7658 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 Py_UNICODE_ISNUMERIC(*p))
7660 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007662 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007663 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007665
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 e = p + PyUnicode_GET_SIZE(self);
7667 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 if (!Py_UNICODE_ISNUMERIC(*p))
7669 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007671 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672}
7673
Martin v. Löwis47383402007-08-15 07:32:56 +00007674int
7675PyUnicode_IsIdentifier(PyObject *self)
7676{
7677 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7678 register const Py_UNICODE *e;
7679
7680 /* Special case for empty strings */
7681 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007683
7684 /* PEP 3131 says that the first character must be in
7685 XID_Start and subsequent characters in XID_Continue,
7686 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007687 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007688 letters, digits, underscore). However, given the current
7689 definition of XID_Start and XID_Continue, it is sufficient
7690 to check just for these, except that _ must be allowed
7691 as starting an identifier. */
7692 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7693 return 0;
7694
7695 e = p + PyUnicode_GET_SIZE(self);
7696 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 if (!_PyUnicode_IsXidContinue(*p))
7698 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007699 }
7700 return 1;
7701}
7702
7703PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007705\n\
7706Return True if S is a valid identifier according\n\
7707to the language definition.");
7708
7709static PyObject*
7710unicode_isidentifier(PyObject *self)
7711{
7712 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7713}
7714
Georg Brandl559e5d72008-06-11 18:37:52 +00007715PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007716 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007717\n\
7718Return True if all characters in S are considered\n\
7719printable in repr() or S is empty, False otherwise.");
7720
7721static PyObject*
7722unicode_isprintable(PyObject *self)
7723{
7724 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7725 register const Py_UNICODE *e;
7726
7727 /* Shortcut for single character strings */
7728 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7729 Py_RETURN_TRUE;
7730 }
7731
7732 e = p + PyUnicode_GET_SIZE(self);
7733 for (; p < e; p++) {
7734 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7735 Py_RETURN_FALSE;
7736 }
7737 }
7738 Py_RETURN_TRUE;
7739}
7740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007742 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743\n\
7744Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007745iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746
7747static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007748unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007750 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751}
7752
Martin v. Löwis18e16552006-02-15 17:27:45 +00007753static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754unicode_length(PyUnicodeObject *self)
7755{
7756 return self->length;
7757}
7758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007759PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007762Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007763done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764
7765static PyObject *
7766unicode_ljust(PyUnicodeObject *self, PyObject *args)
7767{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007768 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007769 Py_UNICODE fillchar = ' ';
7770
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007771 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 return NULL;
7773
Tim Peters7a29bd52001-09-12 03:03:31 +00007774 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 Py_INCREF(self);
7776 return (PyObject*) self;
7777 }
7778
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007779 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780}
7781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007782PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007785Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786
7787static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007788unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790 return fixup(self, fixlower);
7791}
7792
/* Selectors telling the strip helpers which end(s) of the string to trim */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument format strings, one per selector value above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
7801
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007802/* externally visible for str.strip(unicode) */
7803PyObject *
7804_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7805{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007806 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7807 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7808 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7809 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7810 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007811
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007813
Benjamin Peterson14339b62009-01-31 16:36:08 +00007814 i = 0;
7815 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7817 i++;
7818 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007819 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007820
Benjamin Peterson14339b62009-01-31 16:36:08 +00007821 j = len;
7822 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 do {
7824 j--;
7825 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7826 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007827 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007828
Benjamin Peterson14339b62009-01-31 16:36:08 +00007829 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 Py_INCREF(self);
7831 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007832 }
7833 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007835}
7836
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837
7838static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007839do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7842 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007843
Benjamin Peterson14339b62009-01-31 16:36:08 +00007844 i = 0;
7845 if (striptype != RIGHTSTRIP) {
7846 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7847 i++;
7848 }
7849 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007850
Benjamin Peterson14339b62009-01-31 16:36:08 +00007851 j = len;
7852 if (striptype != LEFTSTRIP) {
7853 do {
7854 j--;
7855 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7856 j++;
7857 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007858
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7860 Py_INCREF(self);
7861 return (PyObject*)self;
7862 }
7863 else
7864 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865}
7866
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007867
7868static PyObject *
7869do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7870{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007871 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007872
Benjamin Peterson14339b62009-01-31 16:36:08 +00007873 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7874 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007875
Benjamin Peterson14339b62009-01-31 16:36:08 +00007876 if (sep != NULL && sep != Py_None) {
7877 if (PyUnicode_Check(sep))
7878 return _PyUnicode_XStrip(self, striptype, sep);
7879 else {
7880 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 "%s arg must be None or str",
7882 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007883 return NULL;
7884 }
7885 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007886
Benjamin Peterson14339b62009-01-31 16:36:08 +00007887 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007888}
7889
7890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007891PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007893\n\
7894Return a copy of the string S with leading and trailing\n\
7895whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007896If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007897
7898static PyObject *
7899unicode_strip(PyUnicodeObject *self, PyObject *args)
7900{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007901 if (PyTuple_GET_SIZE(args) == 0)
7902 return do_strip(self, BOTHSTRIP); /* Common case */
7903 else
7904 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007905}
7906
7907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007908PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007910\n\
7911Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007912If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007913
7914static PyObject *
7915unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7916{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007917 if (PyTuple_GET_SIZE(args) == 0)
7918 return do_strip(self, LEFTSTRIP); /* Common case */
7919 else
7920 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007921}
7922
7923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007924PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007926\n\
7927Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007928If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007929
7930static PyObject *
7931unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7932{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007933 if (PyTuple_GET_SIZE(args) == 0)
7934 return do_strip(self, RIGHTSTRIP); /* Common case */
7935 else
7936 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007937}
7938
7939
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007941unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942{
7943 PyUnicodeObject *u;
7944 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007945 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007946 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947
Georg Brandl222de0f2009-04-12 12:01:50 +00007948 if (len < 1) {
7949 Py_INCREF(unicode_empty);
7950 return (PyObject *)unicode_empty;
7951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952
Tim Peters7a29bd52001-09-12 03:03:31 +00007953 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954 /* no repeat, return original string */
7955 Py_INCREF(str);
7956 return (PyObject*) str;
7957 }
Tim Peters8f422462000-09-09 06:13:41 +00007958
7959 /* ensure # of chars needed doesn't overflow int and # of bytes
7960 * needed doesn't overflow size_t
7961 */
7962 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007963 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007964 PyErr_SetString(PyExc_OverflowError,
7965 "repeated string is too long");
7966 return NULL;
7967 }
7968 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7969 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7970 PyErr_SetString(PyExc_OverflowError,
7971 "repeated string is too long");
7972 return NULL;
7973 }
7974 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 if (!u)
7976 return NULL;
7977
7978 p = u->str;
7979
Georg Brandl222de0f2009-04-12 12:01:50 +00007980 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007981 Py_UNICODE_FILL(p, str->str[0], len);
7982 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007983 Py_ssize_t done = str->length; /* number of characters copied this far */
7984 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007986 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007987 Py_UNICODE_COPY(p+done, p, n);
7988 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 }
7991
7992 return (PyObject*) u;
7993}
7994
7995PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 PyObject *subobj,
7997 PyObject *replobj,
7998 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999{
8000 PyObject *self;
8001 PyObject *str1;
8002 PyObject *str2;
8003 PyObject *result;
8004
8005 self = PyUnicode_FromObject(obj);
8006 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 str1 = PyUnicode_FromObject(subobj);
8009 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008010 Py_DECREF(self);
8011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 }
8013 str2 = PyUnicode_FromObject(replobj);
8014 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 Py_DECREF(self);
8016 Py_DECREF(str1);
8017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 }
Tim Petersced69f82003-09-16 20:30:58 +00008019 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 (PyUnicodeObject *)str1,
8021 (PyUnicodeObject *)str2,
8022 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 Py_DECREF(self);
8024 Py_DECREF(str1);
8025 Py_DECREF(str2);
8026 return result;
8027}
8028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008029PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008030 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031\n\
8032Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008033old replaced by new. If the optional argument count is\n\
8034given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035
8036static PyObject*
8037unicode_replace(PyUnicodeObject *self, PyObject *args)
8038{
8039 PyUnicodeObject *str1;
8040 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008041 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 PyObject *result;
8043
Martin v. Löwis18e16552006-02-15 17:27:45 +00008044 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 return NULL;
8046 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8047 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008050 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 Py_DECREF(str1);
8052 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054
8055 result = replace(self, str1, str2, maxcount);
8056
8057 Py_DECREF(str1);
8058 Py_DECREF(str2);
8059 return result;
8060}
8061
8062static
8063PyObject *unicode_repr(PyObject *unicode)
8064{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008065 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008066 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008067 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8068 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8069
8070 /* XXX(nnorwitz): rather than over-allocating, it would be
8071 better to choose a different scheme. Perhaps scan the
8072 first N-chars of the string and allocate based on that size.
8073 */
8074 /* Initial allocation is based on the longest-possible unichr
8075 escape.
8076
8077 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8078 unichr, so in this case it's the longest unichr escape. In
8079 narrow (UTF-16) builds this is five chars per source unichr
8080 since there are two unichrs in the surrogate pair, so in narrow
8081 (UTF-16) builds it's not the longest unichr escape.
8082
8083 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8084 so in the narrow (UTF-16) build case it's the longest unichr
8085 escape.
8086 */
8087
Walter Dörwald1ab83302007-05-18 17:15:44 +00008088 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008090#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008092#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008094#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008096 if (repr == NULL)
8097 return NULL;
8098
Walter Dörwald1ab83302007-05-18 17:15:44 +00008099 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008100
8101 /* Add quote */
8102 *p++ = (findchar(s, size, '\'') &&
8103 !findchar(s, size, '"')) ? '"' : '\'';
8104 while (size-- > 0) {
8105 Py_UNICODE ch = *s++;
8106
8107 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008108 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008109 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008110 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008111 continue;
8112 }
8113
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008115 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008116 *p++ = '\\';
8117 *p++ = 't';
8118 }
8119 else if (ch == '\n') {
8120 *p++ = '\\';
8121 *p++ = 'n';
8122 }
8123 else if (ch == '\r') {
8124 *p++ = '\\';
8125 *p++ = 'r';
8126 }
8127
8128 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008129 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008130 *p++ = '\\';
8131 *p++ = 'x';
8132 *p++ = hexdigits[(ch >> 4) & 0x000F];
8133 *p++ = hexdigits[ch & 0x000F];
8134 }
8135
Georg Brandl559e5d72008-06-11 18:37:52 +00008136 /* Copy ASCII characters as-is */
8137 else if (ch < 0x7F) {
8138 *p++ = ch;
8139 }
8140
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008142 else {
8143 Py_UCS4 ucs = ch;
8144
8145#ifndef Py_UNICODE_WIDE
8146 Py_UNICODE ch2 = 0;
8147 /* Get code point from surrogate pair */
8148 if (size > 0) {
8149 ch2 = *s;
8150 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008152 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008154 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008155 size--;
8156 }
8157 }
8158#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008159 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008160 (categories Z* and C* except ASCII space)
8161 */
8162 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8163 /* Map 8-bit characters to '\xhh' */
8164 if (ucs <= 0xff) {
8165 *p++ = '\\';
8166 *p++ = 'x';
8167 *p++ = hexdigits[(ch >> 4) & 0x000F];
8168 *p++ = hexdigits[ch & 0x000F];
8169 }
8170 /* Map 21-bit characters to '\U00xxxxxx' */
8171 else if (ucs >= 0x10000) {
8172 *p++ = '\\';
8173 *p++ = 'U';
8174 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8175 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8176 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8177 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8178 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8179 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8180 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8181 *p++ = hexdigits[ucs & 0x0000000F];
8182 }
8183 /* Map 16-bit characters to '\uxxxx' */
8184 else {
8185 *p++ = '\\';
8186 *p++ = 'u';
8187 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8188 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8189 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8190 *p++ = hexdigits[ucs & 0x000F];
8191 }
8192 }
8193 /* Copy characters as-is */
8194 else {
8195 *p++ = ch;
8196#ifndef Py_UNICODE_WIDE
8197 if (ucs >= 0x10000)
8198 *p++ = ch2;
8199#endif
8200 }
8201 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008202 }
8203 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008204 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008205
8206 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008207 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008208 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209}
8210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008211PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213\n\
8214Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008215such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216arguments start and end are interpreted as in slice notation.\n\
8217\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008218Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219
8220static PyObject *
8221unicode_rfind(PyUnicodeObject *self, PyObject *args)
8222{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008223 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008224 Py_ssize_t start;
8225 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008226 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227
Christian Heimes9cd17752007-11-18 19:35:23 +00008228 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230
Thomas Wouters477c8d52006-05-27 19:21:47 +00008231 result = stringlib_rfind_slice(
8232 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8233 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8234 start, end
8235 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236
8237 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008238
Christian Heimes217cfd12007-12-02 14:31:20 +00008239 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240}
8241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008242PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008245Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246
8247static PyObject *
8248unicode_rindex(PyUnicodeObject *self, PyObject *args)
8249{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008250 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008251 Py_ssize_t start;
8252 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008253 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254
Christian Heimes9cd17752007-11-18 19:35:23 +00008255 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257
Thomas Wouters477c8d52006-05-27 19:21:47 +00008258 result = stringlib_rfind_slice(
8259 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8260 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8261 start, end
8262 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263
8264 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008265
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 if (result < 0) {
8267 PyErr_SetString(PyExc_ValueError, "substring not found");
8268 return NULL;
8269 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008270 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271}
8272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008273PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008276Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008277done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278
8279static PyObject *
8280unicode_rjust(PyUnicodeObject *self, PyObject *args)
8281{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008282 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008283 Py_UNICODE fillchar = ' ';
8284
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008285 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 return NULL;
8287
Tim Peters7a29bd52001-09-12 03:03:31 +00008288 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 Py_INCREF(self);
8290 return (PyObject*) self;
8291 }
8292
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008293 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294}
8295
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 PyObject *sep,
8298 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299{
8300 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008301
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 s = PyUnicode_FromObject(s);
8303 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008304 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 if (sep != NULL) {
8306 sep = PyUnicode_FromObject(sep);
8307 if (sep == NULL) {
8308 Py_DECREF(s);
8309 return NULL;
8310 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 }
8312
8313 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8314
8315 Py_DECREF(s);
8316 Py_XDECREF(sep);
8317 return result;
8318}
8319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008320PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322\n\
8323Return a list of the words in S, using sep as the\n\
8324delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008325splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008326whitespace string is a separator and empty strings are\n\
8327removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328
8329static PyObject*
8330unicode_split(PyUnicodeObject *self, PyObject *args)
8331{
8332 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008333 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334
Martin v. Löwis18e16552006-02-15 17:27:45 +00008335 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 return NULL;
8337
8338 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344}
8345
Thomas Wouters477c8d52006-05-27 19:21:47 +00008346PyObject *
8347PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8348{
8349 PyObject* str_obj;
8350 PyObject* sep_obj;
8351 PyObject* out;
8352
8353 str_obj = PyUnicode_FromObject(str_in);
8354 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008356 sep_obj = PyUnicode_FromObject(sep_in);
8357 if (!sep_obj) {
8358 Py_DECREF(str_obj);
8359 return NULL;
8360 }
8361
8362 out = stringlib_partition(
8363 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8364 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8365 );
8366
8367 Py_DECREF(sep_obj);
8368 Py_DECREF(str_obj);
8369
8370 return out;
8371}
8372
8373
8374PyObject *
8375PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8376{
8377 PyObject* str_obj;
8378 PyObject* sep_obj;
8379 PyObject* out;
8380
8381 str_obj = PyUnicode_FromObject(str_in);
8382 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008384 sep_obj = PyUnicode_FromObject(sep_in);
8385 if (!sep_obj) {
8386 Py_DECREF(str_obj);
8387 return NULL;
8388 }
8389
8390 out = stringlib_rpartition(
8391 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8392 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8393 );
8394
8395 Py_DECREF(sep_obj);
8396 Py_DECREF(str_obj);
8397
8398 return out;
8399}
8400
8401PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008403\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008404Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008405the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008406found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008407
8408static PyObject*
8409unicode_partition(PyUnicodeObject *self, PyObject *separator)
8410{
8411 return PyUnicode_Partition((PyObject *)self, separator);
8412}
8413
8414PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008415 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008416\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008417Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008418the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008419separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008420
8421static PyObject*
8422unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8423{
8424 return PyUnicode_RPartition((PyObject *)self, separator);
8425}
8426
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008427PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 PyObject *sep,
8429 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008430{
8431 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008433 s = PyUnicode_FromObject(s);
8434 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008435 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 if (sep != NULL) {
8437 sep = PyUnicode_FromObject(sep);
8438 if (sep == NULL) {
8439 Py_DECREF(s);
8440 return NULL;
8441 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008442 }
8443
8444 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8445
8446 Py_DECREF(s);
8447 Py_XDECREF(sep);
8448 return result;
8449}
8450
8451PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008453\n\
8454Return a list of the words in S, using sep as the\n\
8455delimiter string, starting at the end of the string and\n\
8456working to the front. If maxsplit is given, at most maxsplit\n\
8457splits are done. If sep is not specified, any whitespace string\n\
8458is a separator.");
8459
8460static PyObject*
8461unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8462{
8463 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008464 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008465
Martin v. Löwis18e16552006-02-15 17:27:45 +00008466 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008467 return NULL;
8468
8469 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008471 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008473 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008475}
8476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008477PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479\n\
8480Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008481Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008482is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483
8484static PyObject*
8485unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8486{
Guido van Rossum86662912000-04-11 15:38:46 +00008487 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488
Guido van Rossum86662912000-04-11 15:38:46 +00008489 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 return NULL;
8491
Guido van Rossum86662912000-04-11 15:38:46 +00008492 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493}
8494
8495static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008496PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497{
Walter Dörwald346737f2007-05-31 10:44:43 +00008498 if (PyUnicode_CheckExact(self)) {
8499 Py_INCREF(self);
8500 return self;
8501 } else
8502 /* Subtype -- return genuine unicode string with the same value. */
8503 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8504 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505}
8506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008507PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509\n\
8510Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008511and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512
8513static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008514unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 return fixup(self, fixswapcase);
8517}
8518
Georg Brandlceee0772007-11-27 23:48:05 +00008519PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008521\n\
8522Return a translation table usable for str.translate().\n\
8523If there is only one argument, it must be a dictionary mapping Unicode\n\
8524ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008525Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008526If there are two arguments, they must be strings of equal length, and\n\
8527in the resulting dictionary, each character in x will be mapped to the\n\
8528character at the same position in y. If there is a third argument, it\n\
8529must be a string, whose characters will be mapped to None in the result.");
8530
8531static PyObject*
8532unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8533{
8534 PyObject *x, *y = NULL, *z = NULL;
8535 PyObject *new = NULL, *key, *value;
8536 Py_ssize_t i = 0;
8537 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008538
Georg Brandlceee0772007-11-27 23:48:05 +00008539 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8540 return NULL;
8541 new = PyDict_New();
8542 if (!new)
8543 return NULL;
8544 if (y != NULL) {
8545 /* x must be a string too, of equal length */
8546 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8547 if (!PyUnicode_Check(x)) {
8548 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8549 "be a string if there is a second argument");
8550 goto err;
8551 }
8552 if (PyUnicode_GET_SIZE(x) != ylen) {
8553 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8554 "arguments must have equal length");
8555 goto err;
8556 }
8557 /* create entries for translating chars in x to those in y */
8558 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008559 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8560 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008561 if (!key || !value)
8562 goto err;
8563 res = PyDict_SetItem(new, key, value);
8564 Py_DECREF(key);
8565 Py_DECREF(value);
8566 if (res < 0)
8567 goto err;
8568 }
8569 /* create entries for deleting chars in z */
8570 if (z != NULL) {
8571 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008572 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008573 if (!key)
8574 goto err;
8575 res = PyDict_SetItem(new, key, Py_None);
8576 Py_DECREF(key);
8577 if (res < 0)
8578 goto err;
8579 }
8580 }
8581 } else {
8582 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008583 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008584 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8585 "to maketrans it must be a dict");
8586 goto err;
8587 }
8588 /* copy entries into the new dict, converting string keys to int keys */
8589 while (PyDict_Next(x, &i, &key, &value)) {
8590 if (PyUnicode_Check(key)) {
8591 /* convert string keys to integer keys */
8592 PyObject *newkey;
8593 if (PyUnicode_GET_SIZE(key) != 1) {
8594 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8595 "table must be of length 1");
8596 goto err;
8597 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008598 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008599 if (!newkey)
8600 goto err;
8601 res = PyDict_SetItem(new, newkey, value);
8602 Py_DECREF(newkey);
8603 if (res < 0)
8604 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008605 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008606 /* just keep integer keys */
8607 if (PyDict_SetItem(new, key, value) < 0)
8608 goto err;
8609 } else {
8610 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8611 "be strings or integers");
8612 goto err;
8613 }
8614 }
8615 }
8616 return new;
8617 err:
8618 Py_DECREF(new);
8619 return NULL;
8620}
8621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008622PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624\n\
8625Return a copy of the string S, where all characters have been mapped\n\
8626through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008627Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008628Unmapped characters are left untouched. Characters mapped to None\n\
8629are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630
8631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008632unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633{
Georg Brandlceee0772007-11-27 23:48:05 +00008634 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635}
8636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008637PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008640Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641
8642static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008643unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 return fixup(self, fixupper);
8646}
8647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008648PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008651Pad a numeric string S with zeros on the left, to fill a field\n\
8652of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653
8654static PyObject *
8655unicode_zfill(PyUnicodeObject *self, PyObject *args)
8656{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008657 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 PyUnicodeObject *u;
8659
Martin v. Löwis18e16552006-02-15 17:27:45 +00008660 Py_ssize_t width;
8661 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 return NULL;
8663
8664 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008665 if (PyUnicode_CheckExact(self)) {
8666 Py_INCREF(self);
8667 return (PyObject*) self;
8668 }
8669 else
8670 return PyUnicode_FromUnicode(
8671 PyUnicode_AS_UNICODE(self),
8672 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 }
8675
8676 fill = width - self->length;
8677
8678 u = pad(self, fill, 0, '0');
8679
Walter Dörwald068325e2002-04-15 13:36:47 +00008680 if (u == NULL)
8681 return NULL;
8682
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 if (u->str[fill] == '+' || u->str[fill] == '-') {
8684 /* move sign to beginning of string */
8685 u->str[0] = u->str[fill];
8686 u->str[fill] = '0';
8687 }
8688
8689 return (PyObject*) u;
8690}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691
#if 0
/* Debugging helper, compiled out: return numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}
#endif
8699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008700PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008703Return True if S starts with the specified prefix, False otherwise.\n\
8704With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008705With optional end, stop comparing S at that position.\n\
8706prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707
8708static PyObject *
8709unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008712 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008714 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008715 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008716 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008718 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8720 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008721 if (PyTuple_Check(subobj)) {
8722 Py_ssize_t i;
8723 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8724 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008726 if (substring == NULL)
8727 return NULL;
8728 result = tailmatch(self, substring, start, end, -1);
8729 Py_DECREF(substring);
8730 if (result) {
8731 Py_RETURN_TRUE;
8732 }
8733 }
8734 /* nothing matched */
8735 Py_RETURN_FALSE;
8736 }
8737 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008740 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008742 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743}
8744
8745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008746PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008749Return True if S ends with the specified suffix, False otherwise.\n\
8750With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008751With optional end, stop comparing S at that position.\n\
8752suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753
8754static PyObject *
8755unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008758 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008760 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008761 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008762 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008764 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8766 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008767 if (PyTuple_Check(subobj)) {
8768 Py_ssize_t i;
8769 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8770 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008772 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008774 result = tailmatch(self, substring, start, end, +1);
8775 Py_DECREF(substring);
8776 if (result) {
8777 Py_RETURN_TRUE;
8778 }
8779 }
8780 Py_RETURN_FALSE;
8781 }
8782 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008786 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008788 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789}
8790
Eric Smith8c663262007-08-25 02:26:07 +00008791#include "stringlib/string_format.h"
8792
8793PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008795\n\
8796");
8797
Eric Smith4a7d76d2008-05-30 18:10:19 +00008798static PyObject *
8799unicode__format__(PyObject* self, PyObject* args)
8800{
8801 PyObject *format_spec;
8802
8803 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8804 return NULL;
8805
8806 return _PyUnicode_FormatAdvanced(self,
8807 PyUnicode_AS_UNICODE(format_spec),
8808 PyUnicode_GET_SIZE(format_spec));
8809}
8810
Eric Smith8c663262007-08-25 02:26:07 +00008811PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008813\n\
8814");
8815
8816static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008817unicode__sizeof__(PyUnicodeObject *v)
8818{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008819 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8820 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008821}
8822
8823PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008825
8826static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008827unicode_getnewargs(PyUnicodeObject *v)
8828{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008829 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008830}
8831
8832
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833static PyMethodDef unicode_methods[] = {
8834
8835 /* Order is according to common usage: often used methods should
8836 appear first, since lookup is done sequentially. */
8837
Benjamin Peterson308d6372009-09-18 21:42:35 +00008838 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008839 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8840 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008841 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008842 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8843 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8844 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8845 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8846 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8847 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8848 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008849 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008850 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8851 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8852 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008853 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008854 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8855 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8856 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008857 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008858 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008859 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008860 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008861 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8862 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8863 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8864 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8865 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8866 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8867 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8868 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8869 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8870 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8871 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8872 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8873 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8874 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008875 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008876 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008877 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008878 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008879 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008880 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8881 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008882 {"maketrans", (PyCFunction) unicode_maketrans,
8883 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008884 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008885#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008886 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887#endif
8888
8889#if 0
8890 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008891 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892#endif
8893
Benjamin Peterson14339b62009-01-31 16:36:08 +00008894 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 {NULL, NULL}
8896};
8897
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008898static PyObject *
8899unicode_mod(PyObject *v, PyObject *w)
8900{
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 if (!PyUnicode_Check(v)) {
8902 Py_INCREF(Py_NotImplemented);
8903 return Py_NotImplemented;
8904 }
8905 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008906}
8907
8908static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008909 0, /*nb_add*/
8910 0, /*nb_subtract*/
8911 0, /*nb_multiply*/
8912 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008913};
8914
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008916 (lenfunc) unicode_length, /* sq_length */
8917 PyUnicode_Concat, /* sq_concat */
8918 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8919 (ssizeargfunc) unicode_getitem, /* sq_item */
8920 0, /* sq_slice */
8921 0, /* sq_ass_item */
8922 0, /* sq_ass_slice */
8923 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924};
8925
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008926static PyObject*
8927unicode_subscript(PyUnicodeObject* self, PyObject* item)
8928{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008929 if (PyIndex_Check(item)) {
8930 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008931 if (i == -1 && PyErr_Occurred())
8932 return NULL;
8933 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008934 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008935 return unicode_getitem(self, i);
8936 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008937 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008938 Py_UNICODE* source_buf;
8939 Py_UNICODE* result_buf;
8940 PyObject* result;
8941
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008942 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008944 return NULL;
8945 }
8946
8947 if (slicelength <= 0) {
8948 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008949 } else if (start == 0 && step == 1 && slicelength == self->length &&
8950 PyUnicode_CheckExact(self)) {
8951 Py_INCREF(self);
8952 return (PyObject *)self;
8953 } else if (step == 1) {
8954 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008955 } else {
8956 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008957 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8958 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008959
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 if (result_buf == NULL)
8961 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008962
8963 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8964 result_buf[i] = source_buf[cur];
8965 }
Tim Petersced69f82003-09-16 20:30:58 +00008966
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008967 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008968 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008969 return result;
8970 }
8971 } else {
8972 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8973 return NULL;
8974 }
8975}
8976
8977static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008978 (lenfunc)unicode_length, /* mp_length */
8979 (binaryfunc)unicode_subscript, /* mp_subscript */
8980 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008981};
8982
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984/* Helpers for PyUnicode_Format() */
8985
8986static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008987getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008989 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 (*p_argidx)++;
8992 if (arglen < 0)
8993 return args;
8994 else
8995 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996 }
8997 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999 return NULL;
9000}
9001
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009002/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009004static PyObject *
9005formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009007 char *p;
9008 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009010
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 x = PyFloat_AsDouble(v);
9012 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009013 return NULL;
9014
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009017
Eric Smith0923d1d2009-04-16 20:16:10 +00009018 p = PyOS_double_to_string(x, type, prec,
9019 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009020 if (p == NULL)
9021 return NULL;
9022 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009023 PyMem_Free(p);
9024 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025}
9026
Tim Peters38fd5b62000-09-21 05:43:11 +00009027static PyObject*
9028formatlong(PyObject *val, int flags, int prec, int type)
9029{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009030 char *buf;
9031 int len;
9032 PyObject *str; /* temporary string object. */
9033 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009034
Benjamin Peterson14339b62009-01-31 16:36:08 +00009035 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9036 if (!str)
9037 return NULL;
9038 result = PyUnicode_FromStringAndSize(buf, len);
9039 Py_DECREF(str);
9040 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009041}
9042
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043static int
9044formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009045 size_t buflen,
9046 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009048 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009049 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 if (PyUnicode_GET_SIZE(v) == 1) {
9051 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9052 buf[1] = '\0';
9053 return 1;
9054 }
9055#ifndef Py_UNICODE_WIDE
9056 if (PyUnicode_GET_SIZE(v) == 2) {
9057 /* Decode a valid surrogate pair */
9058 int c0 = PyUnicode_AS_UNICODE(v)[0];
9059 int c1 = PyUnicode_AS_UNICODE(v)[1];
9060 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9061 0xDC00 <= c1 && c1 <= 0xDFFF) {
9062 buf[0] = c0;
9063 buf[1] = c1;
9064 buf[2] = '\0';
9065 return 2;
9066 }
9067 }
9068#endif
9069 goto onError;
9070 }
9071 else {
9072 /* Integer input truncated to a character */
9073 long x;
9074 x = PyLong_AsLong(v);
9075 if (x == -1 && PyErr_Occurred())
9076 goto onError;
9077
9078 if (x < 0 || x > 0x10ffff) {
9079 PyErr_SetString(PyExc_OverflowError,
9080 "%c arg not in range(0x110000)");
9081 return -1;
9082 }
9083
9084#ifndef Py_UNICODE_WIDE
9085 if (x > 0xffff) {
9086 x -= 0x10000;
9087 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9088 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9089 return 2;
9090 }
9091#endif
9092 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009093 buf[1] = '\0';
9094 return 1;
9095 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009096
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009098 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009099 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009100 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101}
9102
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesised so it stays a single operand
   wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)10)

Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009107
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110{
9111 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009112 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 int args_owned = 0;
9114 PyUnicodeObject *result = NULL;
9115 PyObject *dict = NULL;
9116 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009117
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 PyErr_BadInternalCall();
9120 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121 }
9122 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009123 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 fmt = PyUnicode_AS_UNICODE(uformat);
9126 fmtcnt = PyUnicode_GET_SIZE(uformat);
9127
9128 reslen = rescnt = fmtcnt + 100;
9129 result = _PyUnicode_New(reslen);
9130 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 res = PyUnicode_AS_UNICODE(result);
9133
9134 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 arglen = PyTuple_Size(args);
9136 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 }
9138 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 arglen = -1;
9140 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009142 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009143 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145
9146 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 if (*fmt != '%') {
9148 if (--rescnt < 0) {
9149 rescnt = fmtcnt + 100;
9150 reslen += rescnt;
9151 if (_PyUnicode_Resize(&result, reslen) < 0)
9152 goto onError;
9153 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9154 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009155 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009157 }
9158 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 /* Got a format specifier */
9160 int flags = 0;
9161 Py_ssize_t width = -1;
9162 int prec = -1;
9163 Py_UNICODE c = '\0';
9164 Py_UNICODE fill;
9165 int isnumok;
9166 PyObject *v = NULL;
9167 PyObject *temp = NULL;
9168 Py_UNICODE *pbuf;
9169 Py_UNICODE sign;
9170 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009171 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172
Benjamin Peterson29060642009-01-31 22:14:21 +00009173 fmt++;
9174 if (*fmt == '(') {
9175 Py_UNICODE *keystart;
9176 Py_ssize_t keylen;
9177 PyObject *key;
9178 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009179
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 if (dict == NULL) {
9181 PyErr_SetString(PyExc_TypeError,
9182 "format requires a mapping");
9183 goto onError;
9184 }
9185 ++fmt;
9186 --fmtcnt;
9187 keystart = fmt;
9188 /* Skip over balanced parentheses */
9189 while (pcount > 0 && --fmtcnt >= 0) {
9190 if (*fmt == ')')
9191 --pcount;
9192 else if (*fmt == '(')
9193 ++pcount;
9194 fmt++;
9195 }
9196 keylen = fmt - keystart - 1;
9197 if (fmtcnt < 0 || pcount > 0) {
9198 PyErr_SetString(PyExc_ValueError,
9199 "incomplete format key");
9200 goto onError;
9201 }
9202#if 0
9203 /* keys are converted to strings using UTF-8 and
9204 then looked up since Python uses strings to hold
9205 variables names etc. in its namespaces and we
9206 wouldn't want to break common idioms. */
9207 key = PyUnicode_EncodeUTF8(keystart,
9208 keylen,
9209 NULL);
9210#else
9211 key = PyUnicode_FromUnicode(keystart, keylen);
9212#endif
9213 if (key == NULL)
9214 goto onError;
9215 if (args_owned) {
9216 Py_DECREF(args);
9217 args_owned = 0;
9218 }
9219 args = PyObject_GetItem(dict, key);
9220 Py_DECREF(key);
9221 if (args == NULL) {
9222 goto onError;
9223 }
9224 args_owned = 1;
9225 arglen = -1;
9226 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009227 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009228 while (--fmtcnt >= 0) {
9229 switch (c = *fmt++) {
9230 case '-': flags |= F_LJUST; continue;
9231 case '+': flags |= F_SIGN; continue;
9232 case ' ': flags |= F_BLANK; continue;
9233 case '#': flags |= F_ALT; continue;
9234 case '0': flags |= F_ZERO; continue;
9235 }
9236 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009237 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 if (c == '*') {
9239 v = getnextarg(args, arglen, &argidx);
9240 if (v == NULL)
9241 goto onError;
9242 if (!PyLong_Check(v)) {
9243 PyErr_SetString(PyExc_TypeError,
9244 "* wants int");
9245 goto onError;
9246 }
9247 width = PyLong_AsLong(v);
9248 if (width == -1 && PyErr_Occurred())
9249 goto onError;
9250 if (width < 0) {
9251 flags |= F_LJUST;
9252 width = -width;
9253 }
9254 if (--fmtcnt >= 0)
9255 c = *fmt++;
9256 }
9257 else if (c >= '0' && c <= '9') {
9258 width = c - '0';
9259 while (--fmtcnt >= 0) {
9260 c = *fmt++;
9261 if (c < '0' || c > '9')
9262 break;
9263 if ((width*10) / 10 != width) {
9264 PyErr_SetString(PyExc_ValueError,
9265 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009266 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009267 }
9268 width = width*10 + (c - '0');
9269 }
9270 }
9271 if (c == '.') {
9272 prec = 0;
9273 if (--fmtcnt >= 0)
9274 c = *fmt++;
9275 if (c == '*') {
9276 v = getnextarg(args, arglen, &argidx);
9277 if (v == NULL)
9278 goto onError;
9279 if (!PyLong_Check(v)) {
9280 PyErr_SetString(PyExc_TypeError,
9281 "* wants int");
9282 goto onError;
9283 }
9284 prec = PyLong_AsLong(v);
9285 if (prec == -1 && PyErr_Occurred())
9286 goto onError;
9287 if (prec < 0)
9288 prec = 0;
9289 if (--fmtcnt >= 0)
9290 c = *fmt++;
9291 }
9292 else if (c >= '0' && c <= '9') {
9293 prec = c - '0';
9294 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009295 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009296 if (c < '0' || c > '9')
9297 break;
9298 if ((prec*10) / 10 != prec) {
9299 PyErr_SetString(PyExc_ValueError,
9300 "prec too big");
9301 goto onError;
9302 }
9303 prec = prec*10 + (c - '0');
9304 }
9305 }
9306 } /* prec */
9307 if (fmtcnt >= 0) {
9308 if (c == 'h' || c == 'l' || c == 'L') {
9309 if (--fmtcnt >= 0)
9310 c = *fmt++;
9311 }
9312 }
9313 if (fmtcnt < 0) {
9314 PyErr_SetString(PyExc_ValueError,
9315 "incomplete format");
9316 goto onError;
9317 }
9318 if (c != '%') {
9319 v = getnextarg(args, arglen, &argidx);
9320 if (v == NULL)
9321 goto onError;
9322 }
9323 sign = 0;
9324 fill = ' ';
9325 switch (c) {
9326
9327 case '%':
9328 pbuf = formatbuf;
9329 /* presume that buffer length is at least 1 */
9330 pbuf[0] = '%';
9331 len = 1;
9332 break;
9333
9334 case 's':
9335 case 'r':
9336 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009337 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 temp = v;
9339 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009340 }
9341 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 if (c == 's')
9343 temp = PyObject_Str(v);
9344 else if (c == 'r')
9345 temp = PyObject_Repr(v);
9346 else
9347 temp = PyObject_ASCII(v);
9348 if (temp == NULL)
9349 goto onError;
9350 if (PyUnicode_Check(temp))
9351 /* nothing to do */;
9352 else {
9353 Py_DECREF(temp);
9354 PyErr_SetString(PyExc_TypeError,
9355 "%s argument has non-string str()");
9356 goto onError;
9357 }
9358 }
9359 pbuf = PyUnicode_AS_UNICODE(temp);
9360 len = PyUnicode_GET_SIZE(temp);
9361 if (prec >= 0 && len > prec)
9362 len = prec;
9363 break;
9364
9365 case 'i':
9366 case 'd':
9367 case 'u':
9368 case 'o':
9369 case 'x':
9370 case 'X':
9371 if (c == 'i')
9372 c = 'd';
9373 isnumok = 0;
9374 if (PyNumber_Check(v)) {
9375 PyObject *iobj=NULL;
9376
9377 if (PyLong_Check(v)) {
9378 iobj = v;
9379 Py_INCREF(iobj);
9380 }
9381 else {
9382 iobj = PyNumber_Long(v);
9383 }
9384 if (iobj!=NULL) {
9385 if (PyLong_Check(iobj)) {
9386 isnumok = 1;
9387 temp = formatlong(iobj, flags, prec, c);
9388 Py_DECREF(iobj);
9389 if (!temp)
9390 goto onError;
9391 pbuf = PyUnicode_AS_UNICODE(temp);
9392 len = PyUnicode_GET_SIZE(temp);
9393 sign = 1;
9394 }
9395 else {
9396 Py_DECREF(iobj);
9397 }
9398 }
9399 }
9400 if (!isnumok) {
9401 PyErr_Format(PyExc_TypeError,
9402 "%%%c format: a number is required, "
9403 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9404 goto onError;
9405 }
9406 if (flags & F_ZERO)
9407 fill = '0';
9408 break;
9409
9410 case 'e':
9411 case 'E':
9412 case 'f':
9413 case 'F':
9414 case 'g':
9415 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009416 temp = formatfloat(v, flags, prec, c);
9417 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009419 pbuf = PyUnicode_AS_UNICODE(temp);
9420 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 sign = 1;
9422 if (flags & F_ZERO)
9423 fill = '0';
9424 break;
9425
9426 case 'c':
9427 pbuf = formatbuf;
9428 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9429 if (len < 0)
9430 goto onError;
9431 break;
9432
9433 default:
9434 PyErr_Format(PyExc_ValueError,
9435 "unsupported format character '%c' (0x%x) "
9436 "at index %zd",
9437 (31<=c && c<=126) ? (char)c : '?',
9438 (int)c,
9439 (Py_ssize_t)(fmt - 1 -
9440 PyUnicode_AS_UNICODE(uformat)));
9441 goto onError;
9442 }
9443 if (sign) {
9444 if (*pbuf == '-' || *pbuf == '+') {
9445 sign = *pbuf++;
9446 len--;
9447 }
9448 else if (flags & F_SIGN)
9449 sign = '+';
9450 else if (flags & F_BLANK)
9451 sign = ' ';
9452 else
9453 sign = 0;
9454 }
9455 if (width < len)
9456 width = len;
9457 if (rescnt - (sign != 0) < width) {
9458 reslen -= rescnt;
9459 rescnt = width + fmtcnt + 100;
9460 reslen += rescnt;
9461 if (reslen < 0) {
9462 Py_XDECREF(temp);
9463 PyErr_NoMemory();
9464 goto onError;
9465 }
9466 if (_PyUnicode_Resize(&result, reslen) < 0) {
9467 Py_XDECREF(temp);
9468 goto onError;
9469 }
9470 res = PyUnicode_AS_UNICODE(result)
9471 + reslen - rescnt;
9472 }
9473 if (sign) {
9474 if (fill != ' ')
9475 *res++ = sign;
9476 rescnt--;
9477 if (width > len)
9478 width--;
9479 }
9480 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9481 assert(pbuf[0] == '0');
9482 assert(pbuf[1] == c);
9483 if (fill != ' ') {
9484 *res++ = *pbuf++;
9485 *res++ = *pbuf++;
9486 }
9487 rescnt -= 2;
9488 width -= 2;
9489 if (width < 0)
9490 width = 0;
9491 len -= 2;
9492 }
9493 if (width > len && !(flags & F_LJUST)) {
9494 do {
9495 --rescnt;
9496 *res++ = fill;
9497 } while (--width > len);
9498 }
9499 if (fill == ' ') {
9500 if (sign)
9501 *res++ = sign;
9502 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9503 assert(pbuf[0] == '0');
9504 assert(pbuf[1] == c);
9505 *res++ = *pbuf++;
9506 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009507 }
9508 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 Py_UNICODE_COPY(res, pbuf, len);
9510 res += len;
9511 rescnt -= len;
9512 while (--width >= len) {
9513 --rescnt;
9514 *res++ = ' ';
9515 }
9516 if (dict && (argidx < arglen) && c != '%') {
9517 PyErr_SetString(PyExc_TypeError,
9518 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009519 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009520 goto onError;
9521 }
9522 Py_XDECREF(temp);
9523 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524 } /* until end */
9525 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009526 PyErr_SetString(PyExc_TypeError,
9527 "not all arguments converted during string formatting");
9528 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529 }
9530
Thomas Woutersa96affe2006-03-12 00:29:36 +00009531 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 }
9536 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 return (PyObject *)result;
9538
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540 Py_XDECREF(result);
9541 Py_DECREF(uformat);
9542 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544 }
9545 return NULL;
9546}
9547
Jeremy Hylton938ace62002-07-17 16:30:39 +00009548static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009549unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9550
Tim Peters6d6c1a32001-08-02 04:15:00 +00009551static PyObject *
9552unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9553{
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009555 static char *kwlist[] = {"object", "encoding", "errors", 0};
9556 char *encoding = NULL;
9557 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009558
Benjamin Peterson14339b62009-01-31 16:36:08 +00009559 if (type != &PyUnicode_Type)
9560 return unicode_subtype_new(type, args, kwds);
9561 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009563 return NULL;
9564 if (x == NULL)
9565 return (PyObject *)_PyUnicode_New(0);
9566 if (encoding == NULL && errors == NULL)
9567 return PyObject_Str(x);
9568 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009570}
9571
Guido van Rossume023fe02001-08-30 03:12:59 +00009572static PyObject *
9573unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9574{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009575 PyUnicodeObject *tmp, *pnew;
9576 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009577
Benjamin Peterson14339b62009-01-31 16:36:08 +00009578 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9579 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9580 if (tmp == NULL)
9581 return NULL;
9582 assert(PyUnicode_Check(tmp));
9583 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9584 if (pnew == NULL) {
9585 Py_DECREF(tmp);
9586 return NULL;
9587 }
9588 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9589 if (pnew->str == NULL) {
9590 _Py_ForgetReference((PyObject *)pnew);
9591 PyObject_Del(pnew);
9592 Py_DECREF(tmp);
9593 return PyErr_NoMemory();
9594 }
9595 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9596 pnew->length = n;
9597 pnew->hash = tmp->hash;
9598 Py_DECREF(tmp);
9599 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009600}
9601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009602PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009603 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009604\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009605Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009606encoding defaults to the current default string encoding.\n\
9607errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009608
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009609static PyObject *unicode_iter(PyObject *seq);
9610
/* Type object for str.  Slots are given positionally, so the order of the
   entries below must follow the field order of PyTypeObject. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                                  /* tp_name */
    sizeof(PyUnicodeObject),                /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,            /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_reserved */
    unicode_repr,                           /* tp_repr */
    &unicode_as_number,                     /* tp_as_number */
    &unicode_as_sequence,                   /* tp_as_sequence */
    &unicode_as_mapping,                    /* tp_as_mapping */
    (hashfunc) unicode_hash,                /* tp_hash */
    0,                                      /* tp_call */
    (reprfunc) unicode_str,                 /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,            /* tp_flags */
    unicode_doc,                            /* tp_doc */
    0,                                      /* tp_traverse */
    0,                                      /* tp_clear */
    PyUnicode_RichCompare,                  /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    unicode_iter,                           /* tp_iter */
    0,                                      /* tp_iternext */
    unicode_methods,                        /* tp_methods */
    0,                                      /* tp_members */
    0,                                      /* tp_getset */
    &PyBaseObject_Type,                     /* tp_base */
    0,                                      /* tp_dict */
    0,                                      /* tp_descr_get */
    0,                                      /* tp_descr_set */
    0,                                      /* tp_dictoffset */
    0,                                      /* tp_init */
    0,                                      /* tp_alloc */
    unicode_new,                            /* tp_new */
    PyObject_Del,                           /* tp_free */
};
9654
9655/* Initialize the Unicode implementation */
9656
Thomas Wouters78890102000-07-22 19:25:51 +00009657void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009659 int i;
9660
Thomas Wouters477c8d52006-05-27 19:21:47 +00009661 /* XXX - move this array to unicodectype.c ? */
9662 Py_UNICODE linebreak[] = {
9663 0x000A, /* LINE FEED */
9664 0x000D, /* CARRIAGE RETURN */
9665 0x001C, /* FILE SEPARATOR */
9666 0x001D, /* GROUP SEPARATOR */
9667 0x001E, /* RECORD SEPARATOR */
9668 0x0085, /* NEXT LINE */
9669 0x2028, /* LINE SEPARATOR */
9670 0x2029, /* PARAGRAPH SEPARATOR */
9671 };
9672
Fred Drakee4315f52000-05-09 19:53:39 +00009673 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009674 free_list = NULL;
9675 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009677 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009679
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009680 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009681 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009682 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009683 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009684
9685 /* initialize the linebreak bloom filter */
9686 bloom_linebreak = make_bloom_mask(
9687 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9688 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009689
9690 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691}
9692
9693/* Finalize the Unicode implementation */
9694
Christian Heimesa156e092008-02-16 07:38:31 +00009695int
9696PyUnicode_ClearFreeList(void)
9697{
9698 int freelist_size = numfree;
9699 PyUnicodeObject *u;
9700
9701 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009702 PyUnicodeObject *v = u;
9703 u = *(PyUnicodeObject **)u;
9704 if (v->str)
9705 PyObject_DEL(v->str);
9706 Py_XDECREF(v->defenc);
9707 PyObject_Del(v);
9708 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009709 }
9710 free_list = NULL;
9711 assert(numfree == 0);
9712 return freelist_size;
9713}
9714
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715void
Thomas Wouters78890102000-07-22 19:25:51 +00009716_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009718 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009720 Py_XDECREF(unicode_empty);
9721 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009722
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009723 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009724 if (unicode_latin1[i]) {
9725 Py_DECREF(unicode_latin1[i]);
9726 unicode_latin1[i] = NULL;
9727 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009728 }
Christian Heimesa156e092008-02-16 07:38:31 +00009729 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009731
Walter Dörwald16807132007-05-25 13:52:07 +00009732void
9733PyUnicode_InternInPlace(PyObject **p)
9734{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009735 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9736 PyObject *t;
9737 if (s == NULL || !PyUnicode_Check(s))
9738 Py_FatalError(
9739 "PyUnicode_InternInPlace: unicode strings only please!");
9740 /* If it's a subclass, we don't really know what putting
9741 it in the interned dict might do. */
9742 if (!PyUnicode_CheckExact(s))
9743 return;
9744 if (PyUnicode_CHECK_INTERNED(s))
9745 return;
9746 if (interned == NULL) {
9747 interned = PyDict_New();
9748 if (interned == NULL) {
9749 PyErr_Clear(); /* Don't leave an exception */
9750 return;
9751 }
9752 }
9753 /* It might be that the GetItem call fails even
9754 though the key is present in the dictionary,
9755 namely when this happens during a stack overflow. */
9756 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009757 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009758 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009759
Benjamin Peterson29060642009-01-31 22:14:21 +00009760 if (t) {
9761 Py_INCREF(t);
9762 Py_DECREF(*p);
9763 *p = t;
9764 return;
9765 }
Walter Dörwald16807132007-05-25 13:52:07 +00009766
Benjamin Peterson14339b62009-01-31 16:36:08 +00009767 PyThreadState_GET()->recursion_critical = 1;
9768 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9769 PyErr_Clear();
9770 PyThreadState_GET()->recursion_critical = 0;
9771 return;
9772 }
9773 PyThreadState_GET()->recursion_critical = 0;
9774 /* The two references in interned are not counted by refcnt.
9775 The deallocator will take care of this */
9776 Py_REFCNT(s) -= 2;
9777 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009778}
9779
9780void
9781PyUnicode_InternImmortal(PyObject **p)
9782{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009783 PyUnicode_InternInPlace(p);
9784 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9785 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9786 Py_INCREF(*p);
9787 }
Walter Dörwald16807132007-05-25 13:52:07 +00009788}
9789
9790PyObject *
9791PyUnicode_InternFromString(const char *cp)
9792{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009793 PyObject *s = PyUnicode_FromString(cp);
9794 if (s == NULL)
9795 return NULL;
9796 PyUnicode_InternInPlace(&s);
9797 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009798}
9799
9800void _Py_ReleaseInternedUnicodeStrings(void)
9801{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009802 PyObject *keys;
9803 PyUnicodeObject *s;
9804 Py_ssize_t i, n;
9805 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009806
Benjamin Peterson14339b62009-01-31 16:36:08 +00009807 if (interned == NULL || !PyDict_Check(interned))
9808 return;
9809 keys = PyDict_Keys(interned);
9810 if (keys == NULL || !PyList_Check(keys)) {
9811 PyErr_Clear();
9812 return;
9813 }
Walter Dörwald16807132007-05-25 13:52:07 +00009814
Benjamin Peterson14339b62009-01-31 16:36:08 +00009815 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9816 detector, interned unicode strings are not forcibly deallocated;
9817 rather, we give them their stolen references back, and then clear
9818 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009819
Benjamin Peterson14339b62009-01-31 16:36:08 +00009820 n = PyList_GET_SIZE(keys);
9821 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009822 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009823 for (i = 0; i < n; i++) {
9824 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9825 switch (s->state) {
9826 case SSTATE_NOT_INTERNED:
9827 /* XXX Shouldn't happen */
9828 break;
9829 case SSTATE_INTERNED_IMMORTAL:
9830 Py_REFCNT(s) += 1;
9831 immortal_size += s->length;
9832 break;
9833 case SSTATE_INTERNED_MORTAL:
9834 Py_REFCNT(s) += 2;
9835 mortal_size += s->length;
9836 break;
9837 default:
9838 Py_FatalError("Inconsistent interned string state.");
9839 }
9840 s->state = SSTATE_NOT_INTERNED;
9841 }
9842 fprintf(stderr, "total size of all interned strings: "
9843 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9844 "mortal/immortal\n", mortal_size, immortal_size);
9845 Py_DECREF(keys);
9846 PyDict_Clear(interned);
9847 Py_DECREF(interned);
9848 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009849}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009850
9851
9852/********************* Unicode Iterator **************************/
9853
/* State of an iterator over the characters of a unicode object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* Index of the next character to yield */
    PyUnicodeObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9859
9860static void
9861unicodeiter_dealloc(unicodeiterobject *it)
9862{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009863 _PyObject_GC_UNTRACK(it);
9864 Py_XDECREF(it->it_seq);
9865 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009866}
9867
9868static int
9869unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9870{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009871 Py_VISIT(it->it_seq);
9872 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009873}
9874
9875static PyObject *
9876unicodeiter_next(unicodeiterobject *it)
9877{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009878 PyUnicodeObject *seq;
9879 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009880
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881 assert(it != NULL);
9882 seq = it->it_seq;
9883 if (seq == NULL)
9884 return NULL;
9885 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009886
Benjamin Peterson14339b62009-01-31 16:36:08 +00009887 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9888 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009889 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009890 if (item != NULL)
9891 ++it->it_index;
9892 return item;
9893 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009894
Benjamin Peterson14339b62009-01-31 16:36:08 +00009895 Py_DECREF(seq);
9896 it->it_seq = NULL;
9897 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009898}
9899
9900static PyObject *
9901unicodeiter_len(unicodeiterobject *it)
9902{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009903 Py_ssize_t len = 0;
9904 if (it->it_seq)
9905 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9906 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009907}
9908
/* Docstring for the __length_hint__ method of str_iterator. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9910
/* Method table of str_iterator; __length_hint__ is its only method. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9916
/* Type object for str_iterator, the iterator created by unicode_iter().
   Slots are given positionally, so the order of the entries below must
   follow the field order of PyTypeObject. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                         /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_reserved */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
9949
9950static PyObject *
9951unicode_iter(PyObject *seq)
9952{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009953 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009954
Benjamin Peterson14339b62009-01-31 16:36:08 +00009955 if (!PyUnicode_Check(seq)) {
9956 PyErr_BadInternalCall();
9957 return NULL;
9958 }
9959 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9960 if (it == NULL)
9961 return NULL;
9962 it->it_index = 0;
9963 Py_INCREF(seq);
9964 it->it_seq = (PyUnicodeObject *)seq;
9965 _PyObject_GC_TRACK(it);
9966 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009967}
9968
Martin v. Löwis5b222132007-06-10 09:51:05 +00009969size_t
9970Py_UNICODE_strlen(const Py_UNICODE *u)
9971{
9972 int res = 0;
9973 while(*u++)
9974 res++;
9975 return res;
9976}
9977
9978Py_UNICODE*
9979Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9980{
9981 Py_UNICODE *u = s1;
9982 while ((*u++ = *s2++));
9983 return s1;
9984}
9985
9986Py_UNICODE*
9987Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9988{
9989 Py_UNICODE *u = s1;
9990 while ((*u++ = *s2++))
9991 if (n-- == 0)
9992 break;
9993 return s1;
9994}
9995
Victor Stinnerc4eb7652010-09-01 23:43:50 +00009996Py_UNICODE*
9997Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
9998{
9999 Py_UNICODE *u1 = s1;
10000 u1 += Py_UNICODE_strlen(u1);
10001 Py_UNICODE_strcpy(u1, s2);
10002 return s1;
10003}
10004
Martin v. Löwis5b222132007-06-10 09:51:05 +000010005int
10006Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10007{
10008 while (*s1 && *s2 && *s1 == *s2)
10009 s1++, s2++;
10010 if (*s1 && *s2)
10011 return (*s1 < *s2) ? -1 : +1;
10012 if (*s1)
10013 return 1;
10014 if (*s2)
10015 return -1;
10016 return 0;
10017}
10018
Victor Stinneref8d95c2010-08-16 22:03:11 +000010019int
10020Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10021{
10022 register Py_UNICODE u1, u2;
10023 for (; n != 0; n--) {
10024 u1 = *s1;
10025 u2 = *s2;
10026 if (u1 != u2)
10027 return (u1 < u2) ? -1 : +1;
10028 if (u1 == '\0')
10029 return 0;
10030 s1++;
10031 s2++;
10032 }
10033 return 0;
10034}
10035
Martin v. Löwis5b222132007-06-10 09:51:05 +000010036Py_UNICODE*
10037Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10038{
10039 const Py_UNICODE *p;
10040 for (p = s; *p; p++)
10041 if (*p == c)
10042 return (Py_UNICODE*)p;
10043 return NULL;
10044}
10045
Victor Stinner331ea922010-08-10 16:37:20 +000010046Py_UNICODE*
10047Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10048{
10049 const Py_UNICODE *p;
10050 p = s + Py_UNICODE_strlen(s);
10051 while (p != s) {
10052 p--;
10053 if (*p == c)
10054 return (Py_UNICODE*)p;
10055 }
10056 return NULL;
10057}
10058
Victor Stinner71133ff2010-09-01 23:43:53 +000010059Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010060PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010061{
10062 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10063 Py_UNICODE *copy;
10064 Py_ssize_t size;
10065
10066 /* Ensure we won't overflow the size. */
10067 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10068 PyErr_NoMemory();
10069 return NULL;
10070 }
10071 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10072 size *= sizeof(Py_UNICODE);
10073 copy = PyMem_Malloc(size);
10074 if (copy == NULL) {
10075 PyErr_NoMemory();
10076 return NULL;
10077 }
10078 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10079 return copy;
10080}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010081
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010082#ifdef __cplusplus
10083}
10084#endif