blob: 0e2f95018a55833b265017a6cc77492b03f90d8b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Lookup table for fast detection of the most frequent whitespace
   characters.  Indexed by code point 0x00..0x7F; a non-zero entry marks
   the code point as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
147
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149 PyObject **errorHandler,const char *encoding, const char *reason,
150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
Victor Stinner31be90b2010-04-22 19:38:16 +0000153static void raise_encode_exception(PyObject **exceptionObject,
154 const char *encoding,
155 const Py_UNICODE *unicode, Py_ssize_t size,
156 Py_ssize_t startpos, Py_ssize_t endpos,
157 const char *reason);
158
/* Lookup table for fast detection of line breaks.  Indexed by code point
   0x00..0x7F; a non-zero entry marks the code point as a line break. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrouf068f942010-01-13 14:19:12 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
Antoine Pitrouf068f942010-01-13 14:19:12 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
Benjamin Peterson29060642009-01-31 22:14:21 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252}
253
/* Non-zero if chr is a member of set (setlen characters).  The bloom mask
   is tested first so that unicode_member() only runs on a possible hit.
   The whole expansion is parenthesized so the macro behaves as a single
   operand, e.g. under `!` or next to `||`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000278 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000309 Ux0000 terminated; some code (e.g. new_identifier)
310 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000313 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314
315*/
316
317static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319{
320 register PyUnicodeObject *unicode;
321
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 if (length == 0 && unicode_empty != NULL) {
324 Py_INCREF(unicode_empty);
325 return unicode_empty;
326 }
327
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000328 /* Ensure we won't overflow the size. */
329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330 return (PyUnicodeObject *)PyErr_NoMemory();
331 }
332
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000334 if (free_list) {
335 unicode = free_list;
336 free_list = *(PyUnicodeObject **)unicode;
337 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000338 if (unicode->str) {
339 /* Keep-Alive optimization: we only upsize the buffer,
340 never downsize it. */
341 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000342 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000343 PyObject_DEL(unicode->str);
344 unicode->str = NULL;
345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000346 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000350 }
351 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 if (unicode == NULL)
357 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 }
361
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 PyErr_NoMemory();
364 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000365 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000366 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000367 * the caller fails before initializing str -- unicode_resize()
368 * reads str[0], and the Keep-Alive optimization can keep memory
369 * allocated for str alive across a call to unicode_dealloc(unicode).
370 * We don't want unicode_resize to read uninitialized memory in
371 * that case.
372 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000373 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000377 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000378 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380
Benjamin Peterson29060642009-01-31 22:14:21 +0000381 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000382 /* XXX UNREF/NEWREF interface should be more symmetrical */
383 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000385 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389static
Guido van Rossum9475a232001-10-05 20:51:39 +0000390void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391{
Walter Dörwald16807132007-05-25 13:52:07 +0000392 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_NOT_INTERNED:
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_MORTAL:
397 /* revive dead object temporarily for DelItem */
398 Py_REFCNT(unicode) = 3;
399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400 Py_FatalError(
401 "deletion of interned string failed");
402 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 case SSTATE_INTERNED_IMMORTAL:
405 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406
Benjamin Peterson29060642009-01-31 22:14:21 +0000407 default:
408 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000409 }
410
Guido van Rossum604ddf82001-12-06 20:03:56 +0000411 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000413 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415 PyObject_DEL(unicode->str);
416 unicode->str = NULL;
417 unicode->length = 0;
418 }
419 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000420 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 }
422 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000423 *(PyUnicodeObject **)unicode = free_list;
424 free_list = unicode;
425 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426 }
427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyObject_DEL(unicode->str);
429 Py_XDECREF(unicode->defenc);
430 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432}
433
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436{
437 register PyUnicodeObject *v;
438
439 /* Argument checks */
440 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 PyErr_BadInternalCall();
442 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
449
450 /* Resizing unicode_empty and single character objects is not
451 possible since these are being shared. We simply return a fresh
452 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000453 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000454 (v == unicode_empty || v->length == 1)) {
455 PyUnicodeObject *w = _PyUnicode_New(length);
456 if (w == NULL)
457 return -1;
458 Py_UNICODE_COPY(w->str, v->str,
459 length < v->length ? length : v->length);
460 Py_DECREF(*unicode);
461 *unicode = w;
462 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 }
464
465 /* Note that we don't have to modify *unicode for unshared Unicode
466 objects, since we can modify them in-place. */
467 return unicode_resize(v, length);
468}
469
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477{
478 PyUnicodeObject *unicode;
479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects. */
482 if (u != NULL) {
483
Benjamin Peterson29060642009-01-31 22:14:21 +0000484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000489
490 /* Single character Unicode objects in the Latin-1 range are
491 shared when using this constructor */
492 if (size == 1 && *u < 256) {
493 unicode = unicode_latin1[*u];
494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = *u;
499 unicode_latin1[*u] = unicode;
500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 /* Copy the Unicode data into the new object */
511 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000512 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 return (PyObject *)unicode;
515}
516
Walter Dörwaldd2034312007-05-18 16:29:38 +0000517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518{
519 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000520
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 if (size < 0) {
522 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 return NULL;
525 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000526
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000528 some optimizations which share commonly used objects.
529 Also, this means the input must be UTF-8, so fall back to the
530 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (u != NULL) {
532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533 /* Optimization for empty strings */
534 if (size == 0 && unicode_empty != NULL) {
535 Py_INCREF(unicode_empty);
536 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000537 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000538
539 /* Single characters are shared when using this constructor.
540 Restrict to ASCII, since the input must be UTF-8. */
541 if (size == 1 && Py_CHARMASK(*u) < 128) {
542 unicode = unicode_latin1[Py_CHARMASK(*u)];
543 if (!unicode) {
544 unicode = _PyUnicode_New(1);
545 if (!unicode)
546 return NULL;
547 unicode->str[0] = Py_CHARMASK(*u);
548 unicode_latin1[Py_CHARMASK(*u)] = unicode;
549 }
550 Py_INCREF(unicode);
551 return (PyObject *)unicode;
552 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000553
554 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 }
556
Walter Dörwald55507312007-05-18 13:12:10 +0000557 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 if (!unicode)
559 return NULL;
560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 return (PyObject *)unicode;
562}
563
Walter Dörwaldd2034312007-05-18 16:29:38 +0000564PyObject *PyUnicode_FromString(const char *u)
565{
566 size_t size = strlen(u);
567 if (size > PY_SSIZE_T_MAX) {
568 PyErr_SetString(PyExc_OverflowError, "input too long");
569 return NULL;
570 }
571
572 return PyUnicode_FromStringAndSize(u, size);
573}
574
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575#ifdef HAVE_WCHAR_H
576
Mark Dickinson081dfee2009-03-18 14:47:41 +0000577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584 to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
588{
589 PyUnicodeObject *unicode;
590 register Py_ssize_t i;
591 Py_ssize_t alloc;
592 const wchar_t *orig_w;
593
594 if (w == NULL) {
595 if (size == 0)
596 return PyUnicode_FromStringAndSize(NULL, 0);
597 PyErr_BadInternalCall();
598 return NULL;
599 }
600
601 if (size == -1) {
602 size = wcslen(w);
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000643 if (size == 0)
644 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_BadInternalCall();
646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 }
648
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == -1) {
650 size = wcslen(w);
651 }
652
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 unicode = _PyUnicode_New(size);
654 if (!unicode)
655 return NULL;
656
657 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000660#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000662 register Py_UNICODE *u;
663 register Py_ssize_t i;
664 u = PyUnicode_AS_UNICODE(unicode);
665 for (i = size; i > 0; i--)
666 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668#endif
669
670 return (PyObject *)unicode;
671}
672
Mark Dickinson081dfee2009-03-18 14:47:41 +0000673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
Walter Dörwald346737f2007-05-31 10:44:43 +0000677static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000680{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000681 *fmt++ = '%';
682 if (width) {
683 if (zeropad)
684 *fmt++ = '0';
685 fmt += sprintf(fmt, "%d", width);
686 }
687 if (precision)
688 fmt += sprintf(fmt, ".%d", precision);
689 if (longflag)
690 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000691 else if (longlongflag) {
692 /* longlongflag should only ever be nonzero on machines with
693 HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695 char *f = PY_FORMAT_LONG_LONG;
696 while (*f)
697 *fmt++ = *f++;
698#else
699 /* we shouldn't ever get here */
700 assert(0);
701 *fmt++ = 'l';
702#endif
703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000704 else if (size_tflag) {
705 char *f = PY_FORMAT_SIZE_T;
706 while (*f)
707 *fmt++ = *f++;
708 }
709 *fmt++ = c;
710 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000711}
712
Walter Dörwaldd2034312007-05-18 16:29:38 +0000713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
714
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000715/* size of fixed-size buffer for formatting single arguments */
716#define ITEM_BUFFER_LEN 21
717/* maximum number of characters required for output of %ld. 21 characters
718 allows for 64-bit integers (in decimal) and an optional sign. */
719#define MAX_LONG_CHARS 21
720/* maximum number of characters required for output of %lld.
721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000728 va_list count;
729 Py_ssize_t callcount = 0;
730 PyObject **callresults = NULL;
731 PyObject **callresult = NULL;
732 Py_ssize_t n = 0;
733 int width = 0;
734 int precision = 0;
735 int zeropad;
736 const char* f;
737 Py_UNICODE *s;
738 PyObject *string;
739 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000740 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000741 /* use abuffer instead of buffer, if we need more space
742 * (which can happen if there's a format specifier with width). */
743 char *abuffer = NULL;
744 char *realbuffer;
745 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000749 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000767 else if (128 <= (unsigned char)*f) {
768 PyErr_Format(PyExc_ValueError,
769 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000770 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000771 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000772 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 }
775 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000776 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000777 if (callcount) {
778 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
779 if (!callresults) {
780 PyErr_NoMemory();
781 return NULL;
782 }
783 callresult = callresults;
784 }
785 /* step 3: figure out how large a buffer we need */
786 for (f = format; *f; f++) {
787 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000788#ifdef HAVE_LONG_LONG
789 int longlongflag = 0;
790#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 const char* p = f;
792 width = 0;
793 while (ISDIGIT((unsigned)*f))
794 width = (width*10) + *f++ - '0';
795 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
796 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
799 * they don't affect the amount of space we reserve.
800 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000801 if (*f == 'l') {
802 if (f[1] == 'd' || f[1] == 'u') {
803 ++f;
804 }
805#ifdef HAVE_LONG_LONG
806 else if (f[1] == 'l' &&
807 (f[2] == 'd' || f[2] == 'u')) {
808 longlongflag = 1;
809 f += 2;
810 }
811#endif
812 }
813 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000814 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
Benjamin Peterson14339b62009-01-31 16:36:08 +0000817 switch (*f) {
818 case 'c':
819 (void)va_arg(count, int);
820 /* fall through... */
821 case '%':
822 n++;
823 break;
824 case 'd': case 'u': case 'i': case 'x':
825 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000826#ifdef HAVE_LONG_LONG
827 if (longlongflag) {
828 if (width < MAX_LONG_LONG_CHARS)
829 width = MAX_LONG_LONG_CHARS;
830 }
831 else
832#endif
833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
834 including sign. Decimal takes the most space. This
835 isn't enough for octal. If a width is specified we
836 need more (which we allocate later). */
837 if (width < MAX_LONG_CHARS)
838 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000839 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000840 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 if (abuffersize < width)
842 abuffersize = width;
843 break;
844 case 's':
845 {
846 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000847 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000848 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
849 if (!str)
850 goto fail;
851 n += PyUnicode_GET_SIZE(str);
852 /* Remember the str and switch to the next slot */
853 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000854 break;
855 }
856 case 'U':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 assert(obj && PyUnicode_Check(obj));
860 n += PyUnicode_GET_SIZE(obj);
861 break;
862 }
863 case 'V':
864 {
865 PyObject *obj = va_arg(count, PyObject *);
866 const char *str = va_arg(count, const char *);
867 assert(obj || str);
868 assert(!obj || PyUnicode_Check(obj));
869 if (obj)
870 n += PyUnicode_GET_SIZE(obj);
871 else
872 n += strlen(str);
873 break;
874 }
875 case 'S':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 PyObject *str;
879 assert(obj);
880 str = PyObject_Str(obj);
881 if (!str)
882 goto fail;
883 n += PyUnicode_GET_SIZE(str);
884 /* Remember the str and switch to the next slot */
885 *callresult++ = str;
886 break;
887 }
888 case 'R':
889 {
890 PyObject *obj = va_arg(count, PyObject *);
891 PyObject *repr;
892 assert(obj);
893 repr = PyObject_Repr(obj);
894 if (!repr)
895 goto fail;
896 n += PyUnicode_GET_SIZE(repr);
897 /* Remember the repr and switch to the next slot */
898 *callresult++ = repr;
899 break;
900 }
901 case 'A':
902 {
903 PyObject *obj = va_arg(count, PyObject *);
904 PyObject *ascii;
905 assert(obj);
906 ascii = PyObject_ASCII(obj);
907 if (!ascii)
908 goto fail;
909 n += PyUnicode_GET_SIZE(ascii);
910 /* Remember the repr and switch to the next slot */
911 *callresult++ = ascii;
912 break;
913 }
914 case 'p':
915 (void) va_arg(count, int);
916 /* maximum 64-bit pointer representation:
917 * 0xffffffffffffffff
918 * so 19 characters is enough.
919 * XXX I count 18 -- what's the extra for?
920 */
921 n += 19;
922 break;
923 default:
924 /* if we stumble upon an unknown
925 formatting code, copy the rest of
926 the format string to the output
927 string. (we cannot just skip the
928 code, since there's no way to know
929 what's in the argument list) */
930 n += strlen(p);
931 goto expand;
932 }
933 } else
934 n++;
935 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000937 if (abuffersize > ITEM_BUFFER_LEN) {
938 /* add 1 for sprintf's trailing null byte */
939 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000940 if (!abuffer) {
941 PyErr_NoMemory();
942 goto fail;
943 }
944 realbuffer = abuffer;
945 }
946 else
947 realbuffer = buffer;
948 /* step 4: fill the buffer */
949 /* Since we've analyzed how much space we need for the worst case,
950 we don't have to resize the string.
951 There can be no errors beyond this point. */
952 string = PyUnicode_FromUnicode(NULL, n);
953 if (!string)
954 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000955
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 s = PyUnicode_AS_UNICODE(string);
957 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000958
Benjamin Peterson14339b62009-01-31 16:36:08 +0000959 for (f = format; *f; f++) {
960 if (*f == '%') {
961 const char* p = f++;
962 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000963 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 int size_tflag = 0;
965 zeropad = (*f == '0');
966 /* parse the width.precision part */
967 width = 0;
968 while (ISDIGIT((unsigned)*f))
969 width = (width*10) + *f++ - '0';
970 precision = 0;
971 if (*f == '.') {
972 f++;
973 while (ISDIGIT((unsigned)*f))
974 precision = (precision*10) + *f++ - '0';
975 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000976 /* Handle %ld, %lu, %lld and %llu. */
977 if (*f == 'l') {
978 if (f[1] == 'd' || f[1] == 'u') {
979 longflag = 1;
980 ++f;
981 }
982#ifdef HAVE_LONG_LONG
983 else if (f[1] == 'l' &&
984 (f[2] == 'd' || f[2] == 'u')) {
985 longlongflag = 1;
986 f += 2;
987 }
988#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 }
990 /* handle the size_t flag. */
991 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
992 size_tflag = 1;
993 ++f;
994 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 switch (*f) {
997 case 'c':
998 *s++ = va_arg(vargs, int);
999 break;
1000 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1002 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001003 if (longflag)
1004 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001005#ifdef HAVE_LONG_LONG
1006 else if (longlongflag)
1007 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1008#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001009 else if (size_tflag)
1010 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1011 else
1012 sprintf(realbuffer, fmt, va_arg(vargs, int));
1013 appendstring(realbuffer);
1014 break;
1015 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001016 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1017 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 if (longflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001020#ifdef HAVE_LONG_LONG
1021 else if (longlongflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs,
1023 unsigned PY_LONG_LONG));
1024#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001025 else if (size_tflag)
1026 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1027 else
1028 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1029 appendstring(realbuffer);
1030 break;
1031 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 sprintf(realbuffer, fmt, va_arg(vargs, int));
1034 appendstring(realbuffer);
1035 break;
1036 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001037 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 's':
1042 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001043 /* unused, since we already have the result */
1044 (void) va_arg(vargs, char *);
1045 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1046 PyUnicode_GET_SIZE(*callresult));
1047 s += PyUnicode_GET_SIZE(*callresult);
1048 /* We're done with the unicode()/repr() => forget it */
1049 Py_DECREF(*callresult);
1050 /* switch to next unicode()/repr() result */
1051 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001052 break;
1053 }
1054 case 'U':
1055 {
1056 PyObject *obj = va_arg(vargs, PyObject *);
1057 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1058 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1059 s += size;
1060 break;
1061 }
1062 case 'V':
1063 {
1064 PyObject *obj = va_arg(vargs, PyObject *);
1065 const char *str = va_arg(vargs, const char *);
1066 if (obj) {
1067 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1068 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1069 s += size;
1070 } else {
1071 appendstring(str);
1072 }
1073 break;
1074 }
1075 case 'S':
1076 case 'R':
1077 {
1078 Py_UNICODE *ucopy;
1079 Py_ssize_t usize;
1080 Py_ssize_t upos;
1081 /* unused, since we already have the result */
1082 (void) va_arg(vargs, PyObject *);
1083 ucopy = PyUnicode_AS_UNICODE(*callresult);
1084 usize = PyUnicode_GET_SIZE(*callresult);
1085 for (upos = 0; upos<usize;)
1086 *s++ = ucopy[upos++];
1087 /* We're done with the unicode()/repr() => forget it */
1088 Py_DECREF(*callresult);
1089 /* switch to next unicode()/repr() result */
1090 ++callresult;
1091 break;
1092 }
1093 case 'p':
1094 sprintf(buffer, "%p", va_arg(vargs, void*));
1095 /* %p is ill-defined: ensure leading 0x. */
1096 if (buffer[1] == 'X')
1097 buffer[1] = 'x';
1098 else if (buffer[1] != 'x') {
1099 memmove(buffer+2, buffer, strlen(buffer)+1);
1100 buffer[0] = '0';
1101 buffer[1] = 'x';
1102 }
1103 appendstring(buffer);
1104 break;
1105 case '%':
1106 *s++ = '%';
1107 break;
1108 default:
1109 appendstring(p);
1110 goto end;
1111 }
Victor Stinner1205f272010-09-11 00:54:47 +00001112 }
Victor Stinner1205f272010-09-11 00:54:47 +00001113 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001114 *s++ = *f;
1115 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 if (callresults)
1119 PyObject_Free(callresults);
1120 if (abuffer)
1121 PyObject_Free(abuffer);
1122 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1123 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 if (callresults) {
1126 PyObject **callresult2 = callresults;
1127 while (callresult2 < callresult) {
1128 Py_DECREF(*callresult2);
1129 ++callresult2;
1130 }
1131 PyObject_Free(callresults);
1132 }
1133 if (abuffer)
1134 PyObject_Free(abuffer);
1135 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001136}
1137
1138#undef appendstring
1139
1140PyObject *
1141PyUnicode_FromFormat(const char *format, ...)
1142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 PyObject* ret;
1144 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001145
1146#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001147 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001150#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 ret = PyUnicode_FromFormatV(format, vargs);
1152 va_end(vargs);
1153 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001154}
1155
Victor Stinner5593d8a2010-10-02 11:11:27 +00001156/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1157 convert a Unicode object to a wide character string.
1158
1159 - If w is NULL: return the number of wide characters (including the nul
1160 character) required to convert the unicode object. Ignore size argument.
1161
1162 - Otherwise: return the number of wide characters (excluding the nul
1163 character) written into w. Write at most size wide characters (including
1164 the nul character). */
1165static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001166unicode_aswidechar(PyUnicodeObject *unicode,
1167 wchar_t *w,
1168 Py_ssize_t size)
1169{
1170#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001171 Py_ssize_t res;
1172 if (w != NULL) {
1173 res = PyUnicode_GET_SIZE(unicode);
1174 if (size > res)
1175 size = res + 1;
1176 else
1177 res = size;
1178 memcpy(w, unicode->str, size * sizeof(wchar_t));
1179 return res;
1180 }
1181 else
1182 return PyUnicode_GET_SIZE(unicode) + 1;
1183#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1184 register const Py_UNICODE *u;
1185 const Py_UNICODE *uend;
1186 const wchar_t *worig, *wend;
1187 Py_ssize_t nchar;
1188
Victor Stinner137c34c2010-09-29 10:25:54 +00001189 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001190 uend = u + PyUnicode_GET_SIZE(unicode);
1191 if (w != NULL) {
1192 worig = w;
1193 wend = w + size;
1194 while (u != uend && w != wend) {
1195 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1196 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1197 {
1198 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1199 u += 2;
1200 }
1201 else {
1202 *w = *u;
1203 u++;
1204 }
1205 w++;
1206 }
1207 if (w != wend)
1208 *w = L'\0';
1209 return w - worig;
1210 }
1211 else {
1212 nchar = 1; /* nul character at the end */
1213 while (u != uend) {
1214 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216 u += 2;
1217 else
1218 u++;
1219 nchar++;
1220 }
1221 }
1222 return nchar;
1223#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1224 register Py_UNICODE *u, *uend, ordinal;
1225 register Py_ssize_t i;
1226 wchar_t *worig, *wend;
1227 Py_ssize_t nchar;
1228
1229 u = PyUnicode_AS_UNICODE(unicode);
1230 uend = u + PyUnicode_GET_SIZE(u);
1231 if (w != NULL) {
1232 worig = w;
1233 wend = w + size;
1234 while (u != uend && w != wend) {
1235 ordinal = *u;
1236 if (ordinal > 0xffff) {
1237 ordinal -= 0x10000;
1238 *w++ = 0xD800 | (ordinal >> 10);
1239 *w++ = 0xDC00 | (ordinal & 0x3FF);
1240 }
1241 else
1242 *w++ = ordinal;
1243 u++;
1244 }
1245 if (w != wend)
1246 *w = 0;
1247 return w - worig;
1248 }
1249 else {
1250 nchar = 1; /* nul character */
1251 while (u != uend) {
1252 if (*u > 0xffff)
1253 nchar += 2;
1254 else
1255 nchar++;
1256 u++;
1257 }
1258 return nchar;
1259 }
1260#else
1261# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001262#endif
1263}
1264
1265Py_ssize_t
1266PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1267 wchar_t *w,
1268 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269{
1270 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001271 PyErr_BadInternalCall();
1272 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00001274 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275}
1276
Victor Stinner137c34c2010-09-29 10:25:54 +00001277wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001278PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001279 Py_ssize_t *size)
1280{
1281 wchar_t* buffer;
1282 Py_ssize_t buflen;
1283
1284 if (unicode == NULL) {
1285 PyErr_BadInternalCall();
1286 return NULL;
1287 }
1288
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001289 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001290 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001291 PyErr_NoMemory();
1292 return NULL;
1293 }
1294
Victor Stinner137c34c2010-09-29 10:25:54 +00001295 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1296 if (buffer == NULL) {
1297 PyErr_NoMemory();
1298 return NULL;
1299 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001300 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001301 if (size != NULL)
1302 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001303 return buffer;
1304}
1305
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306#endif
1307
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001308PyObject *PyUnicode_FromOrdinal(int ordinal)
1309{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001310 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001311
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001312 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001313 PyErr_SetString(PyExc_ValueError,
1314 "chr() arg not in range(0x110000)");
1315 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001316 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001317
1318#ifndef Py_UNICODE_WIDE
1319 if (ordinal > 0xffff) {
1320 ordinal -= 0x10000;
1321 s[0] = 0xD800 | (ordinal >> 10);
1322 s[1] = 0xDC00 | (ordinal & 0x3FF);
1323 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001324 }
1325#endif
1326
Hye-Shik Chang40574832004-04-06 07:24:51 +00001327 s[0] = (Py_UNICODE)ordinal;
1328 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001329}
1330
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331PyObject *PyUnicode_FromObject(register PyObject *obj)
1332{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001333 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001334 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001335 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001336 Py_INCREF(obj);
1337 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001338 }
1339 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 /* For a Unicode subtype that's not a Unicode object,
1341 return a true Unicode object with the same data. */
1342 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1343 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001344 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001345 PyErr_Format(PyExc_TypeError,
1346 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001347 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001348 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001349}
1350
1351PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001352 const char *encoding,
1353 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001354{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001355 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001356 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001357
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 PyErr_BadInternalCall();
1360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001362
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001363 /* Decoding bytes objects is the most common case and should be fast */
1364 if (PyBytes_Check(obj)) {
1365 if (PyBytes_GET_SIZE(obj) == 0) {
1366 Py_INCREF(unicode_empty);
1367 v = (PyObject *) unicode_empty;
1368 }
1369 else {
1370 v = PyUnicode_Decode(
1371 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1372 encoding, errors);
1373 }
1374 return v;
1375 }
1376
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001377 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001378 PyErr_SetString(PyExc_TypeError,
1379 "decoding str is not supported");
1380 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001381 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001382
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001383 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1384 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1385 PyErr_Format(PyExc_TypeError,
1386 "coercing to str: need bytes, bytearray "
1387 "or buffer-like object, %.80s found",
1388 Py_TYPE(obj)->tp_name);
1389 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001390 }
Tim Petersced69f82003-09-16 20:30:58 +00001391
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001392 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001393 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001394 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395 }
Tim Petersced69f82003-09-16 20:30:58 +00001396 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001397 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001398
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001399 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001400 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401}
1402
/* Copy encoding into lower, converting upper case letters to lower case
   and replacing '_' with '-', in order to catch e.g. UTF_8.

   lower is a buffer of lower_len characters.  Return 1 on success (lower
   is then nul-terminated), 0 if encoding is longer than lower_len-1
   characters (lower is then left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last slot of the buffer, reserved for the nul character */
    char *const limit = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == limit)
            return 0;
        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
1435
1436PyObject *PyUnicode_Decode(const char *s,
1437 Py_ssize_t size,
1438 const char *encoding,
1439 const char *errors)
1440{
1441 PyObject *buffer = NULL, *unicode;
1442 Py_buffer info;
1443 char lower[11]; /* Enough for any encoding shortcut */
1444
1445 if (encoding == NULL)
1446 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001447
1448 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001449 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1450 if (strcmp(lower, "utf-8") == 0)
1451 return PyUnicode_DecodeUTF8(s, size, errors);
1452 else if ((strcmp(lower, "latin-1") == 0) ||
1453 (strcmp(lower, "iso-8859-1") == 0))
1454 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001455#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001456 else if (strcmp(lower, "mbcs") == 0)
1457 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001458#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001459 else if (strcmp(lower, "ascii") == 0)
1460 return PyUnicode_DecodeASCII(s, size, errors);
1461 else if (strcmp(lower, "utf-16") == 0)
1462 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1463 else if (strcmp(lower, "utf-32") == 0)
1464 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466
1467 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001468 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001469 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001470 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001471 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 if (buffer == NULL)
1473 goto onError;
1474 unicode = PyCodec_Decode(buffer, encoding, errors);
1475 if (unicode == NULL)
1476 goto onError;
1477 if (!PyUnicode_Check(unicode)) {
1478 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001479 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001480 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 Py_DECREF(unicode);
1482 goto onError;
1483 }
1484 Py_DECREF(buffer);
1485 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001486
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488 Py_XDECREF(buffer);
1489 return NULL;
1490}
1491
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001492PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1493 const char *encoding,
1494 const char *errors)
1495{
1496 PyObject *v;
1497
1498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 goto onError;
1501 }
1502
1503 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001505
1506 /* Decode via the codec registry */
1507 v = PyCodec_Decode(unicode, encoding, errors);
1508 if (v == NULL)
1509 goto onError;
1510 return v;
1511
Benjamin Peterson29060642009-01-31 22:14:21 +00001512 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001513 return NULL;
1514}
1515
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001516PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1517 const char *encoding,
1518 const char *errors)
1519{
1520 PyObject *v;
1521
1522 if (!PyUnicode_Check(unicode)) {
1523 PyErr_BadArgument();
1524 goto onError;
1525 }
1526
1527 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001528 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001529
1530 /* Decode via the codec registry */
1531 v = PyCodec_Decode(unicode, encoding, errors);
1532 if (v == NULL)
1533 goto onError;
1534 if (!PyUnicode_Check(v)) {
1535 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001536 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001537 Py_TYPE(v)->tp_name);
1538 Py_DECREF(v);
1539 goto onError;
1540 }
1541 return v;
1542
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001544 return NULL;
1545}
1546
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001548 Py_ssize_t size,
1549 const char *encoding,
1550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001551{
1552 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001553
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554 unicode = PyUnicode_FromUnicode(s, size);
1555 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1558 Py_DECREF(unicode);
1559 return v;
1560}
1561
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001562PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1563 const char *encoding,
1564 const char *errors)
1565{
1566 PyObject *v;
1567
1568 if (!PyUnicode_Check(unicode)) {
1569 PyErr_BadArgument();
1570 goto onError;
1571 }
1572
1573 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001575
1576 /* Encode via the codec registry */
1577 v = PyCodec_Encode(unicode, encoding, errors);
1578 if (v == NULL)
1579 goto onError;
1580 return v;
1581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001583 return NULL;
1584}
1585
Victor Stinnerae6265f2010-05-15 16:27:27 +00001586PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1587{
Victor Stinner313a1202010-06-11 23:56:51 +00001588 if (Py_FileSystemDefaultEncoding) {
1589#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1590 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1591 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1592 PyUnicode_GET_SIZE(unicode),
1593 NULL);
1594#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001595 return PyUnicode_AsEncodedString(unicode,
1596 Py_FileSystemDefaultEncoding,
1597 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001598 }
1599 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001600 /* locale encoding with surrogateescape */
1601 wchar_t *wchar;
1602 char *bytes;
1603 PyObject *bytes_obj;
1604
1605 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1606 if (wchar == NULL)
1607 return NULL;
1608 bytes = _Py_wchar2char(wchar);
1609 PyMem_Free(wchar);
1610 if (bytes == NULL)
1611 return NULL;
1612
1613 bytes_obj = PyBytes_FromString(bytes);
1614 PyMem_Free(bytes);
1615 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001616 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00001617}
1618
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1620 const char *encoding,
1621 const char *errors)
1622{
1623 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001624 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001625
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 }
Fred Drakee4315f52000-05-09 19:53:39 +00001630
Tim Petersced69f82003-09-16 20:30:58 +00001631 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001633
1634 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001635 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1636 if (strcmp(lower, "utf-8") == 0)
1637 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1638 PyUnicode_GET_SIZE(unicode),
1639 errors);
1640 else if ((strcmp(lower, "latin-1") == 0) ||
1641 (strcmp(lower, "iso-8859-1") == 0))
1642 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1643 PyUnicode_GET_SIZE(unicode),
1644 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001645#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001646 else if (strcmp(lower, "mbcs") == 0)
1647 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1648 PyUnicode_GET_SIZE(unicode),
1649 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001650#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001651 else if (strcmp(lower, "ascii") == 0)
1652 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 errors);
1655 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001656 /* During bootstrap, we may need to find the encodings
1657 package, to load the file system encoding, and require the
1658 file system encoding in order to load the encodings
1659 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001660
Victor Stinner59e62db2010-05-15 13:14:32 +00001661 Break out of this dependency by assuming that the path to
1662 the encodings module is ASCII-only. XXX could try wcstombs
1663 instead, if the file system encoding is the locale's
1664 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001665 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001666 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1667 !PyThreadState_GET()->interp->codecs_initialized)
1668 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1669 PyUnicode_GET_SIZE(unicode),
1670 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671
1672 /* Encode via the codec registry */
1673 v = PyCodec_Encode(unicode, encoding, errors);
1674 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001675 return NULL;
1676
1677 /* The normal path */
1678 if (PyBytes_Check(v))
1679 return v;
1680
1681 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001682 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001683 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001684 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001685
1686 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1687 "encoder %s returned bytearray instead of bytes",
1688 encoding);
1689 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001690 Py_DECREF(v);
1691 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001692 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001693
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001694 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1695 Py_DECREF(v);
1696 return b;
1697 }
1698
1699 PyErr_Format(PyExc_TypeError,
1700 "encoder did not return a bytes object (type=%.400s)",
1701 Py_TYPE(v)->tp_name);
1702 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001703 return NULL;
1704}
1705
1706PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1707 const char *encoding,
1708 const char *errors)
1709{
1710 PyObject *v;
1711
1712 if (!PyUnicode_Check(unicode)) {
1713 PyErr_BadArgument();
1714 goto onError;
1715 }
1716
1717 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001718 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001719
1720 /* Encode via the codec registry */
1721 v = PyCodec_Encode(unicode, encoding, errors);
1722 if (v == NULL)
1723 goto onError;
1724 if (!PyUnicode_Check(v)) {
1725 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001726 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001727 Py_TYPE(v)->tp_name);
1728 Py_DECREF(v);
1729 goto onError;
1730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001732
Benjamin Peterson29060642009-01-31 22:14:21 +00001733 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 return NULL;
1735}
1736
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001737PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001738 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001739{
1740 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001741 if (v)
1742 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001743 if (errors != NULL)
1744 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001745 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001746 PyUnicode_GET_SIZE(unicode),
1747 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001748 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001749 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001750 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001751 return v;
1752}
1753
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001754PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001755PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001756 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001757 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1758}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001759
Christian Heimes5894ba72007-11-04 11:43:14 +00001760PyObject*
1761PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1762{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001763 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1764 can be undefined. If it is case, decode using UTF-8. The following assumes
1765 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1766 bootstrapping process where the codecs aren't ready yet.
1767 */
1768 if (Py_FileSystemDefaultEncoding) {
1769#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001770 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001771 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001772 }
1773#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001774 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001775 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001776 }
1777#endif
1778 return PyUnicode_Decode(s, size,
1779 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001780 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001781 }
1782 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001783 /* locale encoding with surrogateescape */
1784 wchar_t *wchar;
1785 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001786 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001787
1788 if (s[size] != '\0' || size != strlen(s)) {
1789 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1790 return NULL;
1791 }
1792
Victor Stinner168e1172010-10-16 23:16:16 +00001793 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001794 if (wchar == NULL)
1795 return NULL;
1796
Victor Stinner168e1172010-10-16 23:16:16 +00001797 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001798 PyMem_Free(wchar);
1799 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001800 }
1801}
1802
Martin v. Löwis011e8422009-05-05 04:43:17 +00001803
1804int
1805PyUnicode_FSConverter(PyObject* arg, void* addr)
1806{
1807 PyObject *output = NULL;
1808 Py_ssize_t size;
1809 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001810 if (arg == NULL) {
1811 Py_DECREF(*(PyObject**)addr);
1812 return 1;
1813 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001814 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001815 output = arg;
1816 Py_INCREF(output);
1817 }
1818 else {
1819 arg = PyUnicode_FromObject(arg);
1820 if (!arg)
1821 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001822 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001823 Py_DECREF(arg);
1824 if (!output)
1825 return 0;
1826 if (!PyBytes_Check(output)) {
1827 Py_DECREF(output);
1828 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1829 return 0;
1830 }
1831 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001832 size = PyBytes_GET_SIZE(output);
1833 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001834 if (size != strlen(data)) {
1835 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1836 Py_DECREF(output);
1837 return 0;
1838 }
1839 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001840 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001841}
1842
1843
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001844int
1845PyUnicode_FSDecoder(PyObject* arg, void* addr)
1846{
1847 PyObject *output = NULL;
1848 Py_ssize_t size;
1849 void *data;
1850 if (arg == NULL) {
1851 Py_DECREF(*(PyObject**)addr);
1852 return 1;
1853 }
1854 if (PyUnicode_Check(arg)) {
1855 output = arg;
1856 Py_INCREF(output);
1857 }
1858 else {
1859 arg = PyBytes_FromObject(arg);
1860 if (!arg)
1861 return 0;
1862 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1863 PyBytes_GET_SIZE(arg));
1864 Py_DECREF(arg);
1865 if (!output)
1866 return 0;
1867 if (!PyUnicode_Check(output)) {
1868 Py_DECREF(output);
1869 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1870 return 0;
1871 }
1872 }
1873 size = PyUnicode_GET_SIZE(output);
1874 data = PyUnicode_AS_UNICODE(output);
1875 if (size != Py_UNICODE_strlen(data)) {
1876 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1877 Py_DECREF(output);
1878 return 0;
1879 }
1880 *(PyObject**)addr = output;
1881 return Py_CLEANUP_SUPPORTED;
1882}
1883
1884
Martin v. Löwis5b222132007-06-10 09:51:05 +00001885char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001886_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001887{
Christian Heimesf3863112007-11-22 07:46:41 +00001888 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001889 if (!PyUnicode_Check(unicode)) {
1890 PyErr_BadArgument();
1891 return NULL;
1892 }
Christian Heimesf3863112007-11-22 07:46:41 +00001893 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1894 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001895 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001896 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001897 *psize = PyBytes_GET_SIZE(bytes);
1898 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001899}
1900
1901char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001902_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001903{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001904 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001905}
1906
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1908{
1909 if (!PyUnicode_Check(unicode)) {
1910 PyErr_BadArgument();
1911 goto onError;
1912 }
1913 return PyUnicode_AS_UNICODE(unicode);
1914
Benjamin Peterson29060642009-01-31 22:14:21 +00001915 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001916 return NULL;
1917}
1918
Martin v. Löwis18e16552006-02-15 17:27:45 +00001919Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920{
1921 if (!PyUnicode_Check(unicode)) {
1922 PyErr_BadArgument();
1923 goto onError;
1924 }
1925 return PyUnicode_GET_SIZE(unicode);
1926
Benjamin Peterson29060642009-01-31 22:14:21 +00001927 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928 return -1;
1929}
1930
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1935
Victor Stinner554f3f02010-06-16 23:33:54 +00001936/* create or adjust a UnicodeDecodeError */
1937static void
1938make_decode_exception(PyObject **exceptionObject,
1939 const char *encoding,
1940 const char *input, Py_ssize_t length,
1941 Py_ssize_t startpos, Py_ssize_t endpos,
1942 const char *reason)
1943{
1944 if (*exceptionObject == NULL) {
1945 *exceptionObject = PyUnicodeDecodeError_Create(
1946 encoding, input, length, startpos, endpos, reason);
1947 }
1948 else {
1949 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1950 goto onError;
1951 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1952 goto onError;
1953 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1954 goto onError;
1955 }
1956 return;
1957
1958onError:
1959 Py_DECREF(*exceptionObject);
1960 *exceptionObject = NULL;
1961}
1962
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001963/* error handling callback helper:
1964 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001965 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001966 and adjust various state variables.
1967 return 0 on success, -1 on error
1968*/
1969
1970static
1971int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001972 const char *encoding, const char *reason,
1973 const char **input, const char **inend, Py_ssize_t *startinpos,
1974 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1975 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001976{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001977 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001978
1979 PyObject *restuple = NULL;
1980 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001981 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001982 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001983 Py_ssize_t requiredsize;
1984 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001986 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001987 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001988 int res = -1;
1989
1990 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001991 *errorHandler = PyCodec_LookupError(errors);
1992 if (*errorHandler == NULL)
1993 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001994 }
1995
Victor Stinner554f3f02010-06-16 23:33:54 +00001996 make_decode_exception(exceptionObject,
1997 encoding,
1998 *input, *inend - *input,
1999 *startinpos, *endinpos,
2000 reason);
2001 if (*exceptionObject == NULL)
2002 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002003
2004 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2005 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002006 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002007 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002008 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002010 }
2011 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002012 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002013
2014 /* Copy back the bytes variables, which might have been modified by the
2015 callback */
2016 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2017 if (!inputobj)
2018 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002019 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002020 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002021 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002022 *input = PyBytes_AS_STRING(inputobj);
2023 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002024 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002025 /* we can DECREF safely, as the exception has another reference,
2026 so the object won't go away. */
2027 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002028
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002030 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002031 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002032 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2033 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002034 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035
2036 /* need more space? (at least enough for what we
2037 have+the replacement+the rest of the string (starting
2038 at the new input position), so we won't have to check space
2039 when there are no errors in the rest of the string) */
2040 repptr = PyUnicode_AS_UNICODE(repunicode);
2041 repsize = PyUnicode_GET_SIZE(repunicode);
2042 requiredsize = *outpos + repsize + insize-newpos;
2043 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002044 if (requiredsize<2*outsize)
2045 requiredsize = 2*outsize;
2046 if (_PyUnicode_Resize(output, requiredsize) < 0)
2047 goto onError;
2048 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 }
2050 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002051 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002052 Py_UNICODE_COPY(*outptr, repptr, repsize);
2053 *outptr += repsize;
2054 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 /* we made it! */
2057 res = 0;
2058
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 Py_XDECREF(restuple);
2061 return res;
2062}
2063
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that an
 * argument containing an operator of lower precedence than && (such as
 * a conditional expression) is evaluated as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002144
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002145PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002146 Py_ssize_t size,
2147 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002148{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002149 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2150}
2151
Antoine Pitrou244651a2009-05-04 18:56:13 +00002152/* The decoder. The only state we preserve is our read position,
2153 * i.e. how many characters we have consumed. So if we end in the
2154 * middle of a shift sequence we have to back off the read position
2155 * and the output to the beginning of the sequence, otherwise we lose
2156 * all the shift state (seen bits, number of bits seen, high
2157 * surrogate). */
2158
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002159PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002160 Py_ssize_t size,
2161 const char *errors,
2162 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002163{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002164 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002165 Py_ssize_t startinpos;
2166 Py_ssize_t endinpos;
2167 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002168 const char *e;
2169 PyUnicodeObject *unicode;
2170 Py_UNICODE *p;
2171 const char *errmsg = "";
2172 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002173 Py_UNICODE *shiftOutStart;
2174 unsigned int base64bits = 0;
2175 unsigned long base64buffer = 0;
2176 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002177 PyObject *errorHandler = NULL;
2178 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002179
2180 unicode = _PyUnicode_New(size);
2181 if (!unicode)
2182 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002183 if (size == 0) {
2184 if (consumed)
2185 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002186 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002187 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002188
2189 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002190 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002191 e = s + size;
2192
2193 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002194 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002195 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002196 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002197
Antoine Pitrou244651a2009-05-04 18:56:13 +00002198 if (inShift) { /* in a base-64 section */
2199 if (IS_BASE64(ch)) { /* consume a base-64 character */
2200 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2201 base64bits += 6;
2202 s++;
2203 if (base64bits >= 16) {
2204 /* we have enough bits for a UTF-16 value */
2205 Py_UNICODE outCh = (Py_UNICODE)
2206 (base64buffer >> (base64bits-16));
2207 base64bits -= 16;
2208 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2209 if (surrogate) {
2210 /* expecting a second surrogate */
2211 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2212#ifdef Py_UNICODE_WIDE
2213 *p++ = (((surrogate & 0x3FF)<<10)
2214 | (outCh & 0x3FF)) + 0x10000;
2215#else
2216 *p++ = surrogate;
2217 *p++ = outCh;
2218#endif
2219 surrogate = 0;
2220 }
2221 else {
2222 surrogate = 0;
2223 errmsg = "second surrogate missing";
2224 goto utf7Error;
2225 }
2226 }
2227 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2228 /* first surrogate */
2229 surrogate = outCh;
2230 }
2231 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2232 errmsg = "unexpected second surrogate";
2233 goto utf7Error;
2234 }
2235 else {
2236 *p++ = outCh;
2237 }
2238 }
2239 }
2240 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002241 inShift = 0;
2242 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002243 if (surrogate) {
2244 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002245 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002246 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002247 if (base64bits > 0) { /* left-over bits */
2248 if (base64bits >= 6) {
2249 /* We've seen at least one base-64 character */
2250 errmsg = "partial character in shift sequence";
2251 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002252 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002253 else {
2254 /* Some bits remain; they should be zero */
2255 if (base64buffer != 0) {
2256 errmsg = "non-zero padding bits in shift sequence";
2257 goto utf7Error;
2258 }
2259 }
2260 }
2261 if (ch != '-') {
2262 /* '-' is absorbed; other terminating
2263 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002264 *p++ = ch;
2265 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002266 }
2267 }
2268 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002269 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002270 s++; /* consume '+' */
2271 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002272 s++;
2273 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002274 }
2275 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002276 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002277 shiftOutStart = p;
2278 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002279 }
2280 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002281 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002282 *p++ = ch;
2283 s++;
2284 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002285 else {
2286 startinpos = s-starts;
2287 s++;
2288 errmsg = "unexpected special character";
2289 goto utf7Error;
2290 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002291 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002292utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002293 outpos = p-PyUnicode_AS_UNICODE(unicode);
2294 endinpos = s-starts;
2295 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002296 errors, &errorHandler,
2297 "utf7", errmsg,
2298 &starts, &e, &startinpos, &endinpos, &exc, &s,
2299 &unicode, &outpos, &p))
2300 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002301 }
2302
Antoine Pitrou244651a2009-05-04 18:56:13 +00002303 /* end of string */
2304
2305 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2306 /* if we're in an inconsistent state, that's an error */
2307 if (surrogate ||
2308 (base64bits >= 6) ||
2309 (base64bits > 0 && base64buffer != 0)) {
2310 outpos = p-PyUnicode_AS_UNICODE(unicode);
2311 endinpos = size;
2312 if (unicode_decode_call_errorhandler(
2313 errors, &errorHandler,
2314 "utf7", "unterminated shift sequence",
2315 &starts, &e, &startinpos, &endinpos, &exc, &s,
2316 &unicode, &outpos, &p))
2317 goto onError;
2318 if (s < e)
2319 goto restart;
2320 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002321 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002322
2323 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002324 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002325 if (inShift) {
2326 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002327 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002328 }
2329 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002330 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002331 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002332 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002333
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002334 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002335 goto onError;
2336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002337 Py_XDECREF(errorHandler);
2338 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002339 return (PyObject *)unicode;
2340
Benjamin Peterson29060642009-01-31 22:14:21 +00002341 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002342 Py_XDECREF(errorHandler);
2343 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 Py_DECREF(unicode);
2345 return NULL;
2346}
2347
2348
2349PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002350 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002351 int base64SetO,
2352 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002353 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002354{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002355 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002357 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002358 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002359 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002360 unsigned int base64bits = 0;
2361 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362 char * out;
2363 char * start;
2364
2365 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002366 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002367
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002368 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002369 return PyErr_NoMemory();
2370
Antoine Pitrou244651a2009-05-04 18:56:13 +00002371 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002372 if (v == NULL)
2373 return NULL;
2374
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002375 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002376 for (;i < size; ++i) {
2377 Py_UNICODE ch = s[i];
2378
Antoine Pitrou244651a2009-05-04 18:56:13 +00002379 if (inShift) {
2380 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2381 /* shifting out */
2382 if (base64bits) { /* output remaining bits */
2383 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2384 base64buffer = 0;
2385 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002386 }
2387 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002388 /* Characters not in the BASE64 set implicitly unshift the sequence
2389 so no '-' is required, except if the character is itself a '-' */
2390 if (IS_BASE64(ch) || ch == '-') {
2391 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002392 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002393 *out++ = (char) ch;
2394 }
2395 else {
2396 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002397 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002398 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002399 else { /* not in a shift sequence */
2400 if (ch == '+') {
2401 *out++ = '+';
2402 *out++ = '-';
2403 }
2404 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2405 *out++ = (char) ch;
2406 }
2407 else {
2408 *out++ = '+';
2409 inShift = 1;
2410 goto encode_char;
2411 }
2412 }
2413 continue;
2414encode_char:
2415#ifdef Py_UNICODE_WIDE
2416 if (ch >= 0x10000) {
2417 /* code first surrogate */
2418 base64bits += 16;
2419 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2420 while (base64bits >= 6) {
2421 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2422 base64bits -= 6;
2423 }
2424 /* prepare second surrogate */
2425 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2426 }
2427#endif
2428 base64bits += 16;
2429 base64buffer = (base64buffer << 16) | ch;
2430 while (base64bits >= 6) {
2431 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2432 base64bits -= 6;
2433 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002434 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002435 if (base64bits)
2436 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2437 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002438 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002439 if (_PyBytes_Resize(&v, out - start) < 0)
2440 return NULL;
2441 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002442}
2443
Antoine Pitrou244651a2009-05-04 18:56:13 +00002444#undef IS_BASE64
2445#undef FROM_BASE64
2446#undef TO_BASE64
2447#undef DECODE_DIRECT
2448#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002449
Guido van Rossumd57fd912000-03-10 22:53:23 +00002450/* --- UTF-8 Codec -------------------------------------------------------- */
2451
/* Length in bytes of the UTF-8 sequence introduced by each possible lead
   byte.  A zero entry marks a byte that cannot start a sequence.
   See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: continuation bytes, never valid as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are rejected; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are rejected */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
2473
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002475 Py_ssize_t size,
2476 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002477{
Walter Dörwald69652032004-09-07 20:24:22 +00002478 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2479}
2480
Antoine Pitrouab868312009-01-10 15:40:25 +00002481/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2482#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2483
2484/* Mask to quickly check whether a C 'long' contains a
2485 non-ASCII, UTF8-encoded char. */
2486#if (SIZEOF_LONG == 8)
2487# define ASCII_CHAR_MASK 0x8080808080808080L
2488#elif (SIZEOF_LONG == 4)
2489# define ASCII_CHAR_MASK 0x80808080L
2490#else
2491# error C 'long' size should be either 4 or 8!
2492#endif
2493
Walter Dörwald69652032004-09-07 20:24:22 +00002494PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002495 Py_ssize_t size,
2496 const char *errors,
2497 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002498{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002499 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002501 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002502 Py_ssize_t startinpos;
2503 Py_ssize_t endinpos;
2504 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002505 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506 PyUnicodeObject *unicode;
2507 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002508 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002509 PyObject *errorHandler = NULL;
2510 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511
2512 /* Note: size will always be longer than the resulting Unicode
2513 character count */
2514 unicode = _PyUnicode_New(size);
2515 if (!unicode)
2516 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002517 if (size == 0) {
2518 if (consumed)
2519 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522
2523 /* Unpack UTF-8 encoded data */
2524 p = unicode->str;
2525 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002526 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527
2528 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002529 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530
2531 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002532 /* Fast path for runs of ASCII characters. Given that common UTF-8
2533 input will consist of an overwhelming majority of ASCII
2534 characters, we try to optimize for this case by checking
2535 as many characters as a C 'long' can contain.
2536 First, check if we can do an aligned read, as most CPUs have
2537 a penalty for unaligned reads.
2538 */
2539 if (!((size_t) s & LONG_PTR_MASK)) {
2540 /* Help register allocation */
2541 register const char *_s = s;
2542 register Py_UNICODE *_p = p;
2543 while (_s < aligned_end) {
2544 /* Read a whole long at a time (either 4 or 8 bytes),
2545 and do a fast unrolled copy if it only contains ASCII
2546 characters. */
2547 unsigned long data = *(unsigned long *) _s;
2548 if (data & ASCII_CHAR_MASK)
2549 break;
2550 _p[0] = (unsigned char) _s[0];
2551 _p[1] = (unsigned char) _s[1];
2552 _p[2] = (unsigned char) _s[2];
2553 _p[3] = (unsigned char) _s[3];
2554#if (SIZEOF_LONG == 8)
2555 _p[4] = (unsigned char) _s[4];
2556 _p[5] = (unsigned char) _s[5];
2557 _p[6] = (unsigned char) _s[6];
2558 _p[7] = (unsigned char) _s[7];
2559#endif
2560 _s += SIZEOF_LONG;
2561 _p += SIZEOF_LONG;
2562 }
2563 s = _s;
2564 p = _p;
2565 if (s == e)
2566 break;
2567 ch = (unsigned char)*s;
2568 }
2569 }
2570
2571 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002572 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002573 s++;
2574 continue;
2575 }
2576
2577 n = utf8_code_length[ch];
2578
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002579 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002580 if (consumed)
2581 break;
2582 else {
2583 errmsg = "unexpected end of data";
2584 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002585 endinpos = startinpos+1;
2586 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2587 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002588 goto utf8Error;
2589 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591
2592 switch (n) {
2593
2594 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002595 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002596 startinpos = s-starts;
2597 endinpos = startinpos+1;
2598 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599
2600 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002601 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002602 startinpos = s-starts;
2603 endinpos = startinpos+1;
2604 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
2606 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002607 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002608 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002609 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002610 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002611 goto utf8Error;
2612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002614 assert ((ch > 0x007F) && (ch <= 0x07FF));
2615 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 break;
2617
2618 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002619 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2620 will result in surrogates in range d800-dfff. Surrogates are
2621 not valid UTF-8 so they are rejected.
2622 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2623 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002624 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002625 (s[2] & 0xc0) != 0x80 ||
2626 ((unsigned char)s[0] == 0xE0 &&
2627 (unsigned char)s[1] < 0xA0) ||
2628 ((unsigned char)s[0] == 0xED &&
2629 (unsigned char)s[1] > 0x9F)) {
2630 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002631 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002632 endinpos = startinpos + 1;
2633
2634 /* if s[1] first two bits are 1 and 0, then the invalid
2635 continuation byte is s[2], so increment endinpos by 1,
2636 if not, s[1] is invalid and endinpos doesn't need to
2637 be incremented. */
2638 if ((s[1] & 0xC0) == 0x80)
2639 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002640 goto utf8Error;
2641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002643 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2644 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002645 break;
2646
2647 case 4:
2648 if ((s[1] & 0xc0) != 0x80 ||
2649 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002650 (s[3] & 0xc0) != 0x80 ||
2651 ((unsigned char)s[0] == 0xF0 &&
2652 (unsigned char)s[1] < 0x90) ||
2653 ((unsigned char)s[0] == 0xF4 &&
2654 (unsigned char)s[1] > 0x8F)) {
2655 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002656 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002657 endinpos = startinpos + 1;
2658 if ((s[1] & 0xC0) == 0x80) {
2659 endinpos++;
2660 if ((s[2] & 0xC0) == 0x80)
2661 endinpos++;
2662 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002663 goto utf8Error;
2664 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002665 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002666 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2667 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2668
Fredrik Lundh8f455852001-06-27 18:59:43 +00002669#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002670 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002671#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002672 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002673
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002674 /* translate from 10000..10FFFF to 0..FFFF */
2675 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002676
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002677 /* high surrogate = top 10 bits added to D800 */
2678 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002679
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002680 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002681 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002682#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 }
2685 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002687
Benjamin Peterson29060642009-01-31 22:14:21 +00002688 utf8Error:
2689 outpos = p-PyUnicode_AS_UNICODE(unicode);
2690 if (unicode_decode_call_errorhandler(
2691 errors, &errorHandler,
2692 "utf8", errmsg,
2693 &starts, &e, &startinpos, &endinpos, &exc, &s,
2694 &unicode, &outpos, &p))
2695 goto onError;
2696 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 }
Walter Dörwald69652032004-09-07 20:24:22 +00002698 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002699 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700
2701 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002702 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 goto onError;
2704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002705 Py_XDECREF(errorHandler);
2706 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 return (PyObject *)unicode;
2708
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 Py_XDECREF(errorHandler);
2711 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 Py_DECREF(unicode);
2713 return NULL;
2714}
2715
Antoine Pitrouab868312009-01-10 15:40:25 +00002716#undef ASCII_CHAR_MASK
2717
2718
Tim Peters602f7402002-04-27 18:03:26 +00002719/* Allocation strategy: if the string is short, convert into a stack buffer
2720 and allocate exactly as much space needed at the end. Else allocate the
2721 maximum possible needed (4 result bytes per Unicode character), and return
2722 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002723*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002724PyObject *
2725PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002726 Py_ssize_t size,
2727 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728{
Tim Peters602f7402002-04-27 18:03:26 +00002729#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002730
Guido van Rossum98297ee2007-11-06 21:34:58 +00002731 Py_ssize_t i; /* index into s of next input byte */
2732 PyObject *result; /* result string object */
2733 char *p; /* next free byte in output buffer */
2734 Py_ssize_t nallocated; /* number of result bytes allocated */
2735 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002736 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002737 PyObject *errorHandler = NULL;
2738 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002739
Tim Peters602f7402002-04-27 18:03:26 +00002740 assert(s != NULL);
2741 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742
Tim Peters602f7402002-04-27 18:03:26 +00002743 if (size <= MAX_SHORT_UNICHARS) {
2744 /* Write into the stack buffer; nallocated can't overflow.
2745 * At the end, we'll allocate exactly as much heap space as it
2746 * turns out we need.
2747 */
2748 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002749 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002750 p = stackbuf;
2751 }
2752 else {
2753 /* Overallocate on the heap, and give the excess back at the end. */
2754 nallocated = size * 4;
2755 if (nallocated / 4 != size) /* overflow! */
2756 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002757 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002758 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002759 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002760 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002761 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002762
Tim Peters602f7402002-04-27 18:03:26 +00002763 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002764 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002765
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002766 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002767 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002769
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002771 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002772 *p++ = (char)(0xc0 | (ch >> 6));
2773 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002774 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002775#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002776 /* Special case: check for high and low surrogate */
2777 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2778 Py_UCS4 ch2 = s[i];
2779 /* Combine the two surrogates to form a UCS4 value */
2780 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2781 i++;
2782
2783 /* Encode UCS4 Unicode ordinals */
2784 *p++ = (char)(0xf0 | (ch >> 18));
2785 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002786 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2787 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002788 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002789#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002790 Py_ssize_t newpos;
2791 PyObject *rep;
2792 Py_ssize_t repsize, k;
2793 rep = unicode_encode_call_errorhandler
2794 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2795 s, size, &exc, i-1, i, &newpos);
2796 if (!rep)
2797 goto error;
2798
2799 if (PyBytes_Check(rep))
2800 repsize = PyBytes_GET_SIZE(rep);
2801 else
2802 repsize = PyUnicode_GET_SIZE(rep);
2803
2804 if (repsize > 4) {
2805 Py_ssize_t offset;
2806
2807 if (result == NULL)
2808 offset = p - stackbuf;
2809 else
2810 offset = p - PyBytes_AS_STRING(result);
2811
2812 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2813 /* integer overflow */
2814 PyErr_NoMemory();
2815 goto error;
2816 }
2817 nallocated += repsize - 4;
2818 if (result != NULL) {
2819 if (_PyBytes_Resize(&result, nallocated) < 0)
2820 goto error;
2821 } else {
2822 result = PyBytes_FromStringAndSize(NULL, nallocated);
2823 if (result == NULL)
2824 goto error;
2825 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2826 }
2827 p = PyBytes_AS_STRING(result) + offset;
2828 }
2829
2830 if (PyBytes_Check(rep)) {
2831 char *prep = PyBytes_AS_STRING(rep);
2832 for(k = repsize; k > 0; k--)
2833 *p++ = *prep++;
2834 } else /* rep is unicode */ {
2835 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2836 Py_UNICODE c;
2837
2838 for(k=0; k<repsize; k++) {
2839 c = prep[k];
2840 if (0x80 <= c) {
2841 raise_encode_exception(&exc, "utf-8", s, size,
2842 i-1, i, "surrogates not allowed");
2843 goto error;
2844 }
2845 *p++ = (char)prep[k];
2846 }
2847 }
2848 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002849#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002850 }
Victor Stinner445a6232010-04-22 20:01:57 +00002851#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002852 } else if (ch < 0x10000) {
2853 *p++ = (char)(0xe0 | (ch >> 12));
2854 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2855 *p++ = (char)(0x80 | (ch & 0x3f));
2856 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002857 /* Encode UCS4 Unicode ordinals */
2858 *p++ = (char)(0xf0 | (ch >> 18));
2859 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2860 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2861 *p++ = (char)(0x80 | (ch & 0x3f));
2862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002864
Guido van Rossum98297ee2007-11-06 21:34:58 +00002865 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002866 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002867 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002868 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002869 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002870 }
2871 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002872 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002873 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002874 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002875 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002876 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002877 Py_XDECREF(errorHandler);
2878 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002879 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002880 error:
2881 Py_XDECREF(errorHandler);
2882 Py_XDECREF(exc);
2883 Py_XDECREF(result);
2884 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002885
Tim Peters602f7402002-04-27 18:03:26 +00002886#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887}
2888
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2890{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 if (!PyUnicode_Check(unicode)) {
2892 PyErr_BadArgument();
2893 return NULL;
2894 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002895 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002896 PyUnicode_GET_SIZE(unicode),
2897 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898}
2899
Walter Dörwald41980ca2007-08-16 21:55:45 +00002900/* --- UTF-32 Codec ------------------------------------------------------- */
2901
2902PyObject *
2903PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002904 Py_ssize_t size,
2905 const char *errors,
2906 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002907{
2908 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2909}
2910
2911PyObject *
2912PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002913 Py_ssize_t size,
2914 const char *errors,
2915 int *byteorder,
2916 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002917{
2918 const char *starts = s;
2919 Py_ssize_t startinpos;
2920 Py_ssize_t endinpos;
2921 Py_ssize_t outpos;
2922 PyUnicodeObject *unicode;
2923 Py_UNICODE *p;
2924#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002925 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002926 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002927#else
2928 const int pairs = 0;
2929#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002930 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002931 int bo = 0; /* assume native ordering by default */
2932 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002933 /* Offsets from q for retrieving bytes in the right order. */
2934#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2935 int iorder[] = {0, 1, 2, 3};
2936#else
2937 int iorder[] = {3, 2, 1, 0};
2938#endif
2939 PyObject *errorHandler = NULL;
2940 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002941
Walter Dörwald41980ca2007-08-16 21:55:45 +00002942 q = (unsigned char *)s;
2943 e = q + size;
2944
2945 if (byteorder)
2946 bo = *byteorder;
2947
2948 /* Check for BOM marks (U+FEFF) in the input and adjust current
2949 byte order setting accordingly. In native mode, the leading BOM
2950 mark is skipped, in all other modes, it is copied to the output
2951 stream as-is (giving a ZWNBSP character). */
2952 if (bo == 0) {
2953 if (size >= 4) {
2954 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002955 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002956#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002957 if (bom == 0x0000FEFF) {
2958 q += 4;
2959 bo = -1;
2960 }
2961 else if (bom == 0xFFFE0000) {
2962 q += 4;
2963 bo = 1;
2964 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002965#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002966 if (bom == 0x0000FEFF) {
2967 q += 4;
2968 bo = 1;
2969 }
2970 else if (bom == 0xFFFE0000) {
2971 q += 4;
2972 bo = -1;
2973 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002974#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002976 }
2977
2978 if (bo == -1) {
2979 /* force LE */
2980 iorder[0] = 0;
2981 iorder[1] = 1;
2982 iorder[2] = 2;
2983 iorder[3] = 3;
2984 }
2985 else if (bo == 1) {
2986 /* force BE */
2987 iorder[0] = 3;
2988 iorder[1] = 2;
2989 iorder[2] = 1;
2990 iorder[3] = 0;
2991 }
2992
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002993 /* On narrow builds we split characters outside the BMP into two
2994 codepoints => count how much extra space we need. */
2995#ifndef Py_UNICODE_WIDE
2996 for (qq = q; qq < e; qq += 4)
2997 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2998 pairs++;
2999#endif
3000
3001 /* This might be one to much, because of a BOM */
3002 unicode = _PyUnicode_New((size+3)/4+pairs);
3003 if (!unicode)
3004 return NULL;
3005 if (size == 0)
3006 return (PyObject *)unicode;
3007
3008 /* Unpack UTF-32 encoded data */
3009 p = unicode->str;
3010
Walter Dörwald41980ca2007-08-16 21:55:45 +00003011 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003012 Py_UCS4 ch;
3013 /* remaining bytes at the end? (size should be divisible by 4) */
3014 if (e-q<4) {
3015 if (consumed)
3016 break;
3017 errmsg = "truncated data";
3018 startinpos = ((const char *)q)-starts;
3019 endinpos = ((const char *)e)-starts;
3020 goto utf32Error;
3021 /* The remaining input chars are ignored if the callback
3022 chooses to skip the input */
3023 }
3024 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3025 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003026
Benjamin Peterson29060642009-01-31 22:14:21 +00003027 if (ch >= 0x110000)
3028 {
3029 errmsg = "codepoint not in range(0x110000)";
3030 startinpos = ((const char *)q)-starts;
3031 endinpos = startinpos+4;
3032 goto utf32Error;
3033 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003034#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 if (ch >= 0x10000)
3036 {
3037 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3038 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3039 }
3040 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003041#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 *p++ = ch;
3043 q += 4;
3044 continue;
3045 utf32Error:
3046 outpos = p-PyUnicode_AS_UNICODE(unicode);
3047 if (unicode_decode_call_errorhandler(
3048 errors, &errorHandler,
3049 "utf32", errmsg,
3050 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3051 &unicode, &outpos, &p))
3052 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003053 }
3054
3055 if (byteorder)
3056 *byteorder = bo;
3057
3058 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003060
3061 /* Adjust length */
3062 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3063 goto onError;
3064
3065 Py_XDECREF(errorHandler);
3066 Py_XDECREF(exc);
3067 return (PyObject *)unicode;
3068
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003070 Py_DECREF(unicode);
3071 Py_XDECREF(errorHandler);
3072 Py_XDECREF(exc);
3073 return NULL;
3074}
3075
3076PyObject *
3077PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 Py_ssize_t size,
3079 const char *errors,
3080 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003081{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003082 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003083 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003084 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003085#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003086 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003087#else
3088 const int pairs = 0;
3089#endif
3090 /* Offsets from p for storing byte pairs in the right order. */
3091#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3092 int iorder[] = {0, 1, 2, 3};
3093#else
3094 int iorder[] = {3, 2, 1, 0};
3095#endif
3096
Benjamin Peterson29060642009-01-31 22:14:21 +00003097#define STORECHAR(CH) \
3098 do { \
3099 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3100 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3101 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3102 p[iorder[0]] = (CH) & 0xff; \
3103 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003104 } while(0)
3105
3106 /* In narrow builds we can output surrogate pairs as one codepoint,
3107 so we need less space. */
3108#ifndef Py_UNICODE_WIDE
3109 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003110 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3111 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3112 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003113#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003114 nsize = (size - pairs + (byteorder == 0));
3115 bytesize = nsize * 4;
3116 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003117 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003118 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003119 if (v == NULL)
3120 return NULL;
3121
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003122 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003123 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003124 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003125 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003126 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003127
3128 if (byteorder == -1) {
3129 /* force LE */
3130 iorder[0] = 0;
3131 iorder[1] = 1;
3132 iorder[2] = 2;
3133 iorder[3] = 3;
3134 }
3135 else if (byteorder == 1) {
3136 /* force BE */
3137 iorder[0] = 3;
3138 iorder[1] = 2;
3139 iorder[2] = 1;
3140 iorder[3] = 0;
3141 }
3142
3143 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003145#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3147 Py_UCS4 ch2 = *s;
3148 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3149 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3150 s++;
3151 size--;
3152 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003153 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003154#endif
3155 STORECHAR(ch);
3156 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003157
3158 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003159 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003160#undef STORECHAR
3161}
3162
3163PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3164{
3165 if (!PyUnicode_Check(unicode)) {
3166 PyErr_BadArgument();
3167 return NULL;
3168 }
3169 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003170 PyUnicode_GET_SIZE(unicode),
3171 NULL,
3172 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003173}
3174
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175/* --- UTF-16 Codec ------------------------------------------------------- */
3176
Tim Peters772747b2001-08-09 22:21:55 +00003177PyObject *
3178PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 Py_ssize_t size,
3180 const char *errors,
3181 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182{
Walter Dörwald69652032004-09-07 20:24:22 +00003183 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3184}
3185
Antoine Pitrouab868312009-01-10 15:40:25 +00003186/* Two masks for fast checking of whether a C 'long' may contain
3187 UTF16-encoded surrogate characters. This is an efficient heuristic,
3188 assuming that non-surrogate characters with a code point >= 0x8000 are
3189 rare in most input.
3190 FAST_CHAR_MASK is used when the input is in native byte ordering,
3191 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003192*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003193#if (SIZEOF_LONG == 8)
3194# define FAST_CHAR_MASK 0x8000800080008000L
3195# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3196#elif (SIZEOF_LONG == 4)
3197# define FAST_CHAR_MASK 0x80008000L
3198# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3199#else
3200# error C 'long' size should be either 4 or 8!
3201#endif
3202
Walter Dörwald69652032004-09-07 20:24:22 +00003203PyObject *
3204PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 Py_ssize_t size,
3206 const char *errors,
3207 int *byteorder,
3208 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003209{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003210 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003211 Py_ssize_t startinpos;
3212 Py_ssize_t endinpos;
3213 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 PyUnicodeObject *unicode;
3215 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003216 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003217 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003218 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003219 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003220 /* Offsets from q for retrieving byte pairs in the right order. */
3221#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3222 int ihi = 1, ilo = 0;
3223#else
3224 int ihi = 0, ilo = 1;
3225#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 PyObject *errorHandler = NULL;
3227 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228
3229 /* Note: size will always be longer than the resulting Unicode
3230 character count */
3231 unicode = _PyUnicode_New(size);
3232 if (!unicode)
3233 return NULL;
3234 if (size == 0)
3235 return (PyObject *)unicode;
3236
3237 /* Unpack UTF-16 encoded data */
3238 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003239 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003240 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241
3242 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003243 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003245 /* Check for BOM marks (U+FEFF) in the input and adjust current
3246 byte order setting accordingly. In native mode, the leading BOM
3247 mark is skipped, in all other modes, it is copied to the output
3248 stream as-is (giving a ZWNBSP character). */
3249 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003250 if (size >= 2) {
3251 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003252#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003253 if (bom == 0xFEFF) {
3254 q += 2;
3255 bo = -1;
3256 }
3257 else if (bom == 0xFFFE) {
3258 q += 2;
3259 bo = 1;
3260 }
Tim Petersced69f82003-09-16 20:30:58 +00003261#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 if (bom == 0xFEFF) {
3263 q += 2;
3264 bo = 1;
3265 }
3266 else if (bom == 0xFFFE) {
3267 q += 2;
3268 bo = -1;
3269 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003270#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003271 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273
Tim Peters772747b2001-08-09 22:21:55 +00003274 if (bo == -1) {
3275 /* force LE */
3276 ihi = 1;
3277 ilo = 0;
3278 }
3279 else if (bo == 1) {
3280 /* force BE */
3281 ihi = 0;
3282 ilo = 1;
3283 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003284#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3285 native_ordering = ilo < ihi;
3286#else
3287 native_ordering = ilo > ihi;
3288#endif
Tim Peters772747b2001-08-09 22:21:55 +00003289
Antoine Pitrouab868312009-01-10 15:40:25 +00003290 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003291 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003293 /* First check for possible aligned read of a C 'long'. Unaligned
3294 reads are more expensive, better to defer to another iteration. */
3295 if (!((size_t) q & LONG_PTR_MASK)) {
3296 /* Fast path for runs of non-surrogate chars. */
3297 register const unsigned char *_q = q;
3298 Py_UNICODE *_p = p;
3299 if (native_ordering) {
3300 /* Native ordering is simple: as long as the input cannot
3301 possibly contain a surrogate char, do an unrolled copy
3302 of several 16-bit code points to the target object.
3303 The non-surrogate check is done on several input bytes
3304 at a time (as many as a C 'long' can contain). */
3305 while (_q < aligned_end) {
3306 unsigned long data = * (unsigned long *) _q;
3307 if (data & FAST_CHAR_MASK)
3308 break;
3309 _p[0] = ((unsigned short *) _q)[0];
3310 _p[1] = ((unsigned short *) _q)[1];
3311#if (SIZEOF_LONG == 8)
3312 _p[2] = ((unsigned short *) _q)[2];
3313 _p[3] = ((unsigned short *) _q)[3];
3314#endif
3315 _q += SIZEOF_LONG;
3316 _p += SIZEOF_LONG / 2;
3317 }
3318 }
3319 else {
3320 /* Byteswapped ordering is similar, but we must decompose
3321 the copy bytewise, and take care of zero'ing out the
3322 upper bytes if the target object is in 32-bit units
3323 (that is, in UCS-4 builds). */
3324 while (_q < aligned_end) {
3325 unsigned long data = * (unsigned long *) _q;
3326 if (data & SWAPPED_FAST_CHAR_MASK)
3327 break;
3328 /* Zero upper bytes in UCS-4 builds */
3329#if (Py_UNICODE_SIZE > 2)
3330 _p[0] = 0;
3331 _p[1] = 0;
3332#if (SIZEOF_LONG == 8)
3333 _p[2] = 0;
3334 _p[3] = 0;
3335#endif
3336#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003337 /* Issue #4916; UCS-4 builds on big endian machines must
3338 fill the two last bytes of each 4-byte unit. */
3339#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3340# define OFF 2
3341#else
3342# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003343#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003344 ((unsigned char *) _p)[OFF + 1] = _q[0];
3345 ((unsigned char *) _p)[OFF + 0] = _q[1];
3346 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3347 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3348#if (SIZEOF_LONG == 8)
3349 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3350 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3351 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3352 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3353#endif
3354#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003355 _q += SIZEOF_LONG;
3356 _p += SIZEOF_LONG / 2;
3357 }
3358 }
3359 p = _p;
3360 q = _q;
3361 if (q >= e)
3362 break;
3363 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003364 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003365
Benjamin Peterson14339b62009-01-31 16:36:08 +00003366 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003367
3368 if (ch < 0xD800 || ch > 0xDFFF) {
3369 *p++ = ch;
3370 continue;
3371 }
3372
3373 /* UTF-16 code pair: */
3374 if (q > e) {
3375 errmsg = "unexpected end of data";
3376 startinpos = (((const char *)q) - 2) - starts;
3377 endinpos = ((const char *)e) + 1 - starts;
3378 goto utf16Error;
3379 }
3380 if (0xD800 <= ch && ch <= 0xDBFF) {
3381 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3382 q += 2;
3383 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003384#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003385 *p++ = ch;
3386 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003387#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003388 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003389#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003390 continue;
3391 }
3392 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003393 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 startinpos = (((const char *)q)-4)-starts;
3395 endinpos = startinpos+2;
3396 goto utf16Error;
3397 }
3398
Benjamin Peterson14339b62009-01-31 16:36:08 +00003399 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003400 errmsg = "illegal encoding";
3401 startinpos = (((const char *)q)-2)-starts;
3402 endinpos = startinpos+2;
3403 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003404
Benjamin Peterson29060642009-01-31 22:14:21 +00003405 utf16Error:
3406 outpos = p - PyUnicode_AS_UNICODE(unicode);
3407 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003408 errors,
3409 &errorHandler,
3410 "utf16", errmsg,
3411 &starts,
3412 (const char **)&e,
3413 &startinpos,
3414 &endinpos,
3415 &exc,
3416 (const char **)&q,
3417 &unicode,
3418 &outpos,
3419 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003420 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003422 /* remaining byte at the end? (size should be even) */
3423 if (e == q) {
3424 if (!consumed) {
3425 errmsg = "truncated data";
3426 startinpos = ((const char *)q) - starts;
3427 endinpos = ((const char *)e) + 1 - starts;
3428 outpos = p - PyUnicode_AS_UNICODE(unicode);
3429 if (unicode_decode_call_errorhandler(
3430 errors,
3431 &errorHandler,
3432 "utf16", errmsg,
3433 &starts,
3434 (const char **)&e,
3435 &startinpos,
3436 &endinpos,
3437 &exc,
3438 (const char **)&q,
3439 &unicode,
3440 &outpos,
3441 &p))
3442 goto onError;
3443 /* The remaining input chars are ignored if the callback
3444 chooses to skip the input */
3445 }
3446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447
3448 if (byteorder)
3449 *byteorder = bo;
3450
Walter Dörwald69652032004-09-07 20:24:22 +00003451 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003453
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003455 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 goto onError;
3457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458 Py_XDECREF(errorHandler);
3459 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 return (PyObject *)unicode;
3461
Benjamin Peterson29060642009-01-31 22:14:21 +00003462 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 Py_XDECREF(errorHandler);
3465 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 return NULL;
3467}
3468
Antoine Pitrouab868312009-01-10 15:40:25 +00003469#undef FAST_CHAR_MASK
3470#undef SWAPPED_FAST_CHAR_MASK
3471
Tim Peters772747b2001-08-09 22:21:55 +00003472PyObject *
3473PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003474 Py_ssize_t size,
3475 const char *errors,
3476 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003477{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003478 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003479 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003480 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003481#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003482 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003483#else
3484 const int pairs = 0;
3485#endif
Tim Peters772747b2001-08-09 22:21:55 +00003486 /* Offsets from p for storing byte pairs in the right order. */
3487#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3488 int ihi = 1, ilo = 0;
3489#else
3490 int ihi = 0, ilo = 1;
3491#endif
3492
Benjamin Peterson29060642009-01-31 22:14:21 +00003493#define STORECHAR(CH) \
3494 do { \
3495 p[ihi] = ((CH) >> 8) & 0xff; \
3496 p[ilo] = (CH) & 0xff; \
3497 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003498 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003500#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003501 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003502 if (s[i] >= 0x10000)
3503 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003504#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003505 /* 2 * (size + pairs + (byteorder == 0)) */
3506 if (size > PY_SSIZE_T_MAX ||
3507 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003508 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003509 nsize = size + pairs + (byteorder == 0);
3510 bytesize = nsize * 2;
3511 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003512 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003513 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 if (v == NULL)
3515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003517 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003519 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003520 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003521 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003522
3523 if (byteorder == -1) {
3524 /* force LE */
3525 ihi = 1;
3526 ilo = 0;
3527 }
3528 else if (byteorder == 1) {
3529 /* force BE */
3530 ihi = 0;
3531 ilo = 1;
3532 }
3533
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003534 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 Py_UNICODE ch = *s++;
3536 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003537#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003538 if (ch >= 0x10000) {
3539 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3540 ch = 0xD800 | ((ch-0x10000) >> 10);
3541 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003542#endif
Tim Peters772747b2001-08-09 22:21:55 +00003543 STORECHAR(ch);
3544 if (ch2)
3545 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003546 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003547
3548 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003549 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003550#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551}
3552
3553PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3554{
3555 if (!PyUnicode_Check(unicode)) {
3556 PyErr_BadArgument();
3557 return NULL;
3558 }
3559 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 PyUnicode_GET_SIZE(unicode),
3561 NULL,
3562 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563}
3564
3565/* --- Unicode Escape Codec ----------------------------------------------- */
3566
Fredrik Lundh06d12682001-01-24 07:59:11 +00003567static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003568
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003570 Py_ssize_t size,
3571 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003574 Py_ssize_t startinpos;
3575 Py_ssize_t endinpos;
3576 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003579 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003581 char* message;
3582 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 PyObject *errorHandler = NULL;
3584 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003585
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 /* Escaped strings will always be longer than the resulting
3587 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 length after conversion to the true value.
3589 (but if the error callback returns a long replacement string
3590 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 v = _PyUnicode_New(size);
3592 if (v == NULL)
3593 goto onError;
3594 if (size == 0)
3595 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003599
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 while (s < end) {
3601 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003602 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604
3605 /* Non-escape characters are interpreted as Unicode ordinals */
3606 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003607 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 continue;
3609 }
3610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 /* \ - Escapes */
3613 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003614 c = *s++;
3615 if (s > end)
3616 c = '\0'; /* Invalid after \ */
3617 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618
Benjamin Peterson29060642009-01-31 22:14:21 +00003619 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 case '\n': break;
3621 case '\\': *p++ = '\\'; break;
3622 case '\'': *p++ = '\''; break;
3623 case '\"': *p++ = '\"'; break;
3624 case 'b': *p++ = '\b'; break;
3625 case 'f': *p++ = '\014'; break; /* FF */
3626 case 't': *p++ = '\t'; break;
3627 case 'n': *p++ = '\n'; break;
3628 case 'r': *p++ = '\r'; break;
3629 case 'v': *p++ = '\013'; break; /* VT */
3630 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3631
Benjamin Peterson29060642009-01-31 22:14:21 +00003632 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633 case '0': case '1': case '2': case '3':
3634 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003635 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003636 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003637 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003638 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003639 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003641 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642 break;
3643
Benjamin Peterson29060642009-01-31 22:14:21 +00003644 /* hex escapes */
3645 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003647 digits = 2;
3648 message = "truncated \\xXX escape";
3649 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650
Benjamin Peterson29060642009-01-31 22:14:21 +00003651 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003653 digits = 4;
3654 message = "truncated \\uXXXX escape";
3655 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003658 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003659 digits = 8;
3660 message = "truncated \\UXXXXXXXX escape";
3661 hexescape:
3662 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 outpos = p-PyUnicode_AS_UNICODE(v);
3664 if (s+digits>end) {
3665 endinpos = size;
3666 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 errors, &errorHandler,
3668 "unicodeescape", "end of string in escape sequence",
3669 &starts, &end, &startinpos, &endinpos, &exc, &s,
3670 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 goto onError;
3672 goto nextByte;
3673 }
3674 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003675 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003676 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 endinpos = (s+i+1)-starts;
3678 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 errors, &errorHandler,
3680 "unicodeescape", message,
3681 &starts, &end, &startinpos, &endinpos, &exc, &s,
3682 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003685 }
3686 chr = (chr<<4) & ~0xF;
3687 if (c >= '0' && c <= '9')
3688 chr += c - '0';
3689 else if (c >= 'a' && c <= 'f')
3690 chr += 10 + c - 'a';
3691 else
3692 chr += 10 + c - 'A';
3693 }
3694 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003695 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 /* _decoding_error will have already written into the
3697 target buffer. */
3698 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003699 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003700 /* when we get here, chr is a 32-bit unicode character */
3701 if (chr <= 0xffff)
3702 /* UCS-2 character */
3703 *p++ = (Py_UNICODE) chr;
3704 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003705 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003706 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003707#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003708 *p++ = chr;
3709#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003710 chr -= 0x10000L;
3711 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003712 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003713#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003714 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003715 endinpos = s-starts;
3716 outpos = p-PyUnicode_AS_UNICODE(v);
3717 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003718 errors, &errorHandler,
3719 "unicodeescape", "illegal Unicode character",
3720 &starts, &end, &startinpos, &endinpos, &exc, &s,
3721 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003722 goto onError;
3723 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003724 break;
3725
Benjamin Peterson29060642009-01-31 22:14:21 +00003726 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003727 case 'N':
3728 message = "malformed \\N character escape";
3729 if (ucnhash_CAPI == NULL) {
3730 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003731 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003732 if (ucnhash_CAPI == NULL)
3733 goto ucnhashError;
3734 }
3735 if (*s == '{') {
3736 const char *start = s+1;
3737 /* look for the closing brace */
3738 while (*s != '}' && s < end)
3739 s++;
3740 if (s > start && s < end && *s == '}') {
3741 /* found a name. look it up in the unicode database */
3742 message = "unknown Unicode character name";
3743 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003744 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003745 goto store;
3746 }
3747 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 endinpos = s-starts;
3749 outpos = p-PyUnicode_AS_UNICODE(v);
3750 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003751 errors, &errorHandler,
3752 "unicodeescape", message,
3753 &starts, &end, &startinpos, &endinpos, &exc, &s,
3754 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003755 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003756 break;
3757
3758 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003759 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 message = "\\ at end of string";
3761 s--;
3762 endinpos = s-starts;
3763 outpos = p-PyUnicode_AS_UNICODE(v);
3764 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 errors, &errorHandler,
3766 "unicodeescape", message,
3767 &starts, &end, &startinpos, &endinpos, &exc, &s,
3768 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003769 goto onError;
3770 }
3771 else {
3772 *p++ = '\\';
3773 *p++ = (unsigned char)s[-1];
3774 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003775 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003777 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003780 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003782 Py_XDECREF(errorHandler);
3783 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003785
Benjamin Peterson29060642009-01-31 22:14:21 +00003786 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003787 PyErr_SetString(
3788 PyExc_UnicodeError,
3789 "\\N escapes not supported (can't load unicodedata module)"
3790 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003791 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 Py_XDECREF(errorHandler);
3793 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003794 return NULL;
3795
Benjamin Peterson29060642009-01-31 22:14:21 +00003796 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 Py_XDECREF(errorHandler);
3799 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800 return NULL;
3801}
3802
3803/* Return a Unicode-Escape string version of the Unicode object.
3804
3805 If quotes is true, the string is enclosed in u"" or u'' quotes as
3806 appropriate.
3807
3808*/
3809
Thomas Wouters477c8d52006-05-27 19:21:47 +00003810Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003811 Py_ssize_t size,
3812 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003813{
3814 /* like wcschr, but doesn't stop at NULL characters */
3815
3816 while (size-- > 0) {
3817 if (*s == ch)
3818 return s;
3819 s++;
3820 }
3821
3822 return NULL;
3823}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003824
/* Lookup table: index with a nibble value (0..15) to obtain the matching
   lowercase hexadecimal digit.  Used by the escape encoders in this file. */
static const char *hexdigits =
    "0123456789"
    "abcdef";
3826
3827PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003828 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003830 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003833#ifdef Py_UNICODE_WIDE
3834 const Py_ssize_t expandsize = 10;
3835#else
3836 const Py_ssize_t expandsize = 6;
3837#endif
3838
Thomas Wouters89f507f2006-12-13 04:49:30 +00003839 /* XXX(nnorwitz): rather than over-allocating, it would be
3840 better to choose a different scheme. Perhaps scan the
3841 first N-chars of the string and allocate based on that size.
3842 */
3843 /* Initial allocation is based on the longest-possible unichr
3844 escape.
3845
3846 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3847 unichr, so in this case it's the longest unichr escape. In
3848 narrow (UTF-16) builds this is five chars per source unichr
3849 since there are two unichrs in the surrogate pair, so in narrow
3850 (UTF-16) builds it's not the longest unichr escape.
3851
3852 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3853 so in the narrow (UTF-16) build case it's the longest unichr
3854 escape.
3855 */
3856
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003857 if (size == 0)
3858 return PyBytes_FromStringAndSize(NULL, 0);
3859
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003860 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003861 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003862
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003863 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003864 2
3865 + expandsize*size
3866 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867 if (repr == NULL)
3868 return NULL;
3869
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003870 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 while (size-- > 0) {
3873 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003874
Walter Dörwald79e913e2007-05-12 11:08:06 +00003875 /* Escape backslashes */
3876 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877 *p++ = '\\';
3878 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003879 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003880 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003881
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003882#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003883 /* Map 21-bit characters to '\U00xxxxxx' */
3884 else if (ch >= 0x10000) {
3885 *p++ = '\\';
3886 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003887 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3888 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3889 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3890 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3891 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3892 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3893 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3894 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003895 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003896 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003897#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003898 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3899 else if (ch >= 0xD800 && ch < 0xDC00) {
3900 Py_UNICODE ch2;
3901 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003902
Benjamin Peterson29060642009-01-31 22:14:21 +00003903 ch2 = *s++;
3904 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003905 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3907 *p++ = '\\';
3908 *p++ = 'U';
3909 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3910 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3911 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3912 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3913 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3914 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3915 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3916 *p++ = hexdigits[ucs & 0x0000000F];
3917 continue;
3918 }
3919 /* Fall through: isolated surrogates are copied as-is */
3920 s--;
3921 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003922 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003923#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003924
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003926 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 *p++ = '\\';
3928 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003929 *p++ = hexdigits[(ch >> 12) & 0x000F];
3930 *p++ = hexdigits[(ch >> 8) & 0x000F];
3931 *p++ = hexdigits[(ch >> 4) & 0x000F];
3932 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003934
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003935 /* Map special whitespace to '\t', \n', '\r' */
3936 else if (ch == '\t') {
3937 *p++ = '\\';
3938 *p++ = 't';
3939 }
3940 else if (ch == '\n') {
3941 *p++ = '\\';
3942 *p++ = 'n';
3943 }
3944 else if (ch == '\r') {
3945 *p++ = '\\';
3946 *p++ = 'r';
3947 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003948
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003949 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003950 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003952 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003953 *p++ = hexdigits[(ch >> 4) & 0x000F];
3954 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003955 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003956
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 /* Copy everything else as-is */
3958 else
3959 *p++ = (char) ch;
3960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003962 assert(p - PyBytes_AS_STRING(repr) > 0);
3963 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3964 return NULL;
3965 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966}
3967
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003968PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003970 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 if (!PyUnicode_Check(unicode)) {
3972 PyErr_BadArgument();
3973 return NULL;
3974 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003975 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3976 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003977 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978}
3979
3980/* --- Raw Unicode Escape Codec ------------------------------------------- */
3981
3982PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 Py_ssize_t size,
3984 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003987 Py_ssize_t startinpos;
3988 Py_ssize_t endinpos;
3989 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 const char *end;
3993 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 PyObject *errorHandler = NULL;
3995 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003996
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 /* Escaped strings will always be longer than the resulting
3998 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003999 length after conversion to the true value. (But decoding error
4000 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 v = _PyUnicode_New(size);
4002 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004005 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004006 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 end = s + size;
4008 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004009 unsigned char c;
4010 Py_UCS4 x;
4011 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004012 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013
Benjamin Peterson29060642009-01-31 22:14:21 +00004014 /* Non-escape characters are interpreted as Unicode ordinals */
4015 if (*s != '\\') {
4016 *p++ = (unsigned char)*s++;
4017 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004018 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004019 startinpos = s-starts;
4020
4021 /* \u-escapes are only interpreted iff the number of leading
4022 backslashes if odd */
4023 bs = s;
4024 for (;s < end;) {
4025 if (*s != '\\')
4026 break;
4027 *p++ = (unsigned char)*s++;
4028 }
4029 if (((s - bs) & 1) == 0 ||
4030 s >= end ||
4031 (*s != 'u' && *s != 'U')) {
4032 continue;
4033 }
4034 p--;
4035 count = *s=='u' ? 4 : 8;
4036 s++;
4037
4038 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4039 outpos = p-PyUnicode_AS_UNICODE(v);
4040 for (x = 0, i = 0; i < count; ++i, ++s) {
4041 c = (unsigned char)*s;
4042 if (!ISXDIGIT(c)) {
4043 endinpos = s-starts;
4044 if (unicode_decode_call_errorhandler(
4045 errors, &errorHandler,
4046 "rawunicodeescape", "truncated \\uXXXX",
4047 &starts, &end, &startinpos, &endinpos, &exc, &s,
4048 &v, &outpos, &p))
4049 goto onError;
4050 goto nextByte;
4051 }
4052 x = (x<<4) & ~0xF;
4053 if (c >= '0' && c <= '9')
4054 x += c - '0';
4055 else if (c >= 'a' && c <= 'f')
4056 x += 10 + c - 'a';
4057 else
4058 x += 10 + c - 'A';
4059 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004060 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004061 /* UCS-2 character */
4062 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004063 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004064 /* UCS-4 character. Either store directly, or as
4065 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004066#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004068#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 x -= 0x10000L;
4070 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4071 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004072#endif
4073 } else {
4074 endinpos = s-starts;
4075 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004076 if (unicode_decode_call_errorhandler(
4077 errors, &errorHandler,
4078 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 &starts, &end, &startinpos, &endinpos, &exc, &s,
4080 &v, &outpos, &p))
4081 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004083 nextByte:
4084 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004086 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004087 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088 Py_XDECREF(errorHandler);
4089 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004091
Benjamin Peterson29060642009-01-31 22:14:21 +00004092 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094 Py_XDECREF(errorHandler);
4095 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 return NULL;
4097}
4098
4099PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004102 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 char *p;
4104 char *q;
4105
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004106#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004107 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004108#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004109 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004110#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004111
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004112 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004114
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004115 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 if (repr == NULL)
4117 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004118 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004119 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004121 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 while (size-- > 0) {
4123 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004124#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 /* Map 32-bit characters to '\Uxxxxxxxx' */
4126 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004127 *p++ = '\\';
4128 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004129 *p++ = hexdigits[(ch >> 28) & 0xf];
4130 *p++ = hexdigits[(ch >> 24) & 0xf];
4131 *p++ = hexdigits[(ch >> 20) & 0xf];
4132 *p++ = hexdigits[(ch >> 16) & 0xf];
4133 *p++ = hexdigits[(ch >> 12) & 0xf];
4134 *p++ = hexdigits[(ch >> 8) & 0xf];
4135 *p++ = hexdigits[(ch >> 4) & 0xf];
4136 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004137 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004138 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004139#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4141 if (ch >= 0xD800 && ch < 0xDC00) {
4142 Py_UNICODE ch2;
4143 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004144
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 ch2 = *s++;
4146 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004147 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4149 *p++ = '\\';
4150 *p++ = 'U';
4151 *p++ = hexdigits[(ucs >> 28) & 0xf];
4152 *p++ = hexdigits[(ucs >> 24) & 0xf];
4153 *p++ = hexdigits[(ucs >> 20) & 0xf];
4154 *p++ = hexdigits[(ucs >> 16) & 0xf];
4155 *p++ = hexdigits[(ucs >> 12) & 0xf];
4156 *p++ = hexdigits[(ucs >> 8) & 0xf];
4157 *p++ = hexdigits[(ucs >> 4) & 0xf];
4158 *p++ = hexdigits[ucs & 0xf];
4159 continue;
4160 }
4161 /* Fall through: isolated surrogates are copied as-is */
4162 s--;
4163 size++;
4164 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004165#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 /* Map 16-bit characters to '\uxxxx' */
4167 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 *p++ = '\\';
4169 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004170 *p++ = hexdigits[(ch >> 12) & 0xf];
4171 *p++ = hexdigits[(ch >> 8) & 0xf];
4172 *p++ = hexdigits[(ch >> 4) & 0xf];
4173 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 /* Copy everything else as-is */
4176 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 *p++ = (char) ch;
4178 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004179 size = p - q;
4180
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004181 assert(size > 0);
4182 if (_PyBytes_Resize(&repr, size) < 0)
4183 return NULL;
4184 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185}
4186
4187PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4188{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004189 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004191 PyErr_BadArgument();
4192 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004194 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4195 PyUnicode_GET_SIZE(unicode));
4196
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004197 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198}
4199
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004200/* --- Unicode Internal Codec ------------------------------------------- */
4201
4202PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 Py_ssize_t size,
4204 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004205{
4206 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004207 Py_ssize_t startinpos;
4208 Py_ssize_t endinpos;
4209 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004210 PyUnicodeObject *v;
4211 Py_UNICODE *p;
4212 const char *end;
4213 const char *reason;
4214 PyObject *errorHandler = NULL;
4215 PyObject *exc = NULL;
4216
Neal Norwitzd43069c2006-01-08 01:12:10 +00004217#ifdef Py_UNICODE_WIDE
4218 Py_UNICODE unimax = PyUnicode_GetMax();
4219#endif
4220
Thomas Wouters89f507f2006-12-13 04:49:30 +00004221 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004222 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4223 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004225 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004227 p = PyUnicode_AS_UNICODE(v);
4228 end = s + size;
4229
4230 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004231 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004232 /* We have to sanity check the raw data, otherwise doom looms for
4233 some malformed UCS-4 data. */
4234 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004235#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004236 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004237#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004238 end-s < Py_UNICODE_SIZE
4239 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004241 startinpos = s - starts;
4242 if (end-s < Py_UNICODE_SIZE) {
4243 endinpos = end-starts;
4244 reason = "truncated input";
4245 }
4246 else {
4247 endinpos = s - starts + Py_UNICODE_SIZE;
4248 reason = "illegal code point (> 0x10FFFF)";
4249 }
4250 outpos = p - PyUnicode_AS_UNICODE(v);
4251 if (unicode_decode_call_errorhandler(
4252 errors, &errorHandler,
4253 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004254 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004255 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004256 goto onError;
4257 }
4258 }
4259 else {
4260 p++;
4261 s += Py_UNICODE_SIZE;
4262 }
4263 }
4264
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004265 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004266 goto onError;
4267 Py_XDECREF(errorHandler);
4268 Py_XDECREF(exc);
4269 return (PyObject *)v;
4270
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004272 Py_XDECREF(v);
4273 Py_XDECREF(errorHandler);
4274 Py_XDECREF(exc);
4275 return NULL;
4276}
4277
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278/* --- Latin-1 Codec ------------------------------------------------------ */
4279
4280PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 Py_ssize_t size,
4282 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283{
4284 PyUnicodeObject *v;
4285 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004286 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004287
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004289 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 Py_UNICODE r = *(unsigned char*)s;
4291 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004292 }
4293
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294 v = _PyUnicode_New(size);
4295 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004298 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004300 e = s + size;
4301 /* Unrolling the copy makes it much faster by reducing the looping
4302 overhead. This is similar to what many memcpy() implementations do. */
4303 unrolled_end = e - 4;
4304 while (s < unrolled_end) {
4305 p[0] = (unsigned char) s[0];
4306 p[1] = (unsigned char) s[1];
4307 p[2] = (unsigned char) s[2];
4308 p[3] = (unsigned char) s[3];
4309 s += 4;
4310 p += 4;
4311 }
4312 while (s < e)
4313 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004315
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 Py_XDECREF(v);
4318 return NULL;
4319}
4320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321/* create or adjust a UnicodeEncodeError */
4322static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 const char *encoding,
4324 const Py_UNICODE *unicode, Py_ssize_t size,
4325 Py_ssize_t startpos, Py_ssize_t endpos,
4326 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004329 *exceptionObject = PyUnicodeEncodeError_Create(
4330 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 }
4332 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004333 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4334 goto onError;
4335 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4336 goto onError;
4337 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4338 goto onError;
4339 return;
4340 onError:
4341 Py_DECREF(*exceptionObject);
4342 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 }
4344}
4345
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346/* raises a UnicodeEncodeError */
4347static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 const char *encoding,
4349 const Py_UNICODE *unicode, Py_ssize_t size,
4350 Py_ssize_t startpos, Py_ssize_t endpos,
4351 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352{
4353 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357}
4358
4359/* error handling callback helper:
4360 build arguments, call the callback and check the arguments,
4361 put the result into newpos and return the replacement string, which
4362 has to be freed by the caller */
4363static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004364 PyObject **errorHandler,
4365 const char *encoding, const char *reason,
4366 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4367 Py_ssize_t startpos, Py_ssize_t endpos,
4368 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004370 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371
4372 PyObject *restuple;
4373 PyObject *resunicode;
4374
4375 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379 }
4380
4381 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385
4386 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004387 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004391 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 Py_DECREF(restuple);
4393 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004395 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 &resunicode, newpos)) {
4397 Py_DECREF(restuple);
4398 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004400 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4401 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4402 Py_DECREF(restuple);
4403 return NULL;
4404 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004407 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4409 Py_DECREF(restuple);
4410 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004411 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 Py_INCREF(resunicode);
4413 Py_DECREF(restuple);
4414 return resunicode;
4415}
4416
4417static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004418 Py_ssize_t size,
4419 const char *errors,
4420 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421{
4422 /* output object */
4423 PyObject *res;
4424 /* pointers to the beginning and end+1 of input */
4425 const Py_UNICODE *startp = p;
4426 const Py_UNICODE *endp = p + size;
4427 /* pointer to the beginning of the unencodable characters */
4428 /* const Py_UNICODE *badp = NULL; */
4429 /* pointer into the output */
4430 char *str;
4431 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004433 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4434 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 PyObject *errorHandler = NULL;
4436 PyObject *exc = NULL;
4437 /* the following variable is used for caching string comparisons
4438 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4439 int known_errorHandler = -1;
4440
4441 /* allocate enough for a simple encoding without
4442 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004443 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004444 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004445 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004447 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004448 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 ressize = size;
4450
4451 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 /* can we encode this? */
4455 if (c<limit) {
4456 /* no overflow check, because we know that the space is enough */
4457 *str++ = (char)c;
4458 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004459 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 else {
4461 Py_ssize_t unicodepos = p-startp;
4462 Py_ssize_t requiredsize;
4463 PyObject *repunicode;
4464 Py_ssize_t repsize;
4465 Py_ssize_t newpos;
4466 Py_ssize_t respos;
4467 Py_UNICODE *uni2;
4468 /* startpos for collecting unencodable chars */
4469 const Py_UNICODE *collstart = p;
4470 const Py_UNICODE *collend = p;
4471 /* find all unecodable characters */
4472 while ((collend < endp) && ((*collend)>=limit))
4473 ++collend;
4474 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4475 if (known_errorHandler==-1) {
4476 if ((errors==NULL) || (!strcmp(errors, "strict")))
4477 known_errorHandler = 1;
4478 else if (!strcmp(errors, "replace"))
4479 known_errorHandler = 2;
4480 else if (!strcmp(errors, "ignore"))
4481 known_errorHandler = 3;
4482 else if (!strcmp(errors, "xmlcharrefreplace"))
4483 known_errorHandler = 4;
4484 else
4485 known_errorHandler = 0;
4486 }
4487 switch (known_errorHandler) {
4488 case 1: /* strict */
4489 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4490 goto onError;
4491 case 2: /* replace */
4492 while (collstart++<collend)
4493 *str++ = '?'; /* fall through */
4494 case 3: /* ignore */
4495 p = collend;
4496 break;
4497 case 4: /* xmlcharrefreplace */
4498 respos = str - PyBytes_AS_STRING(res);
4499 /* determine replacement size (temporarily (mis)uses p) */
4500 for (p = collstart, repsize = 0; p < collend; ++p) {
4501 if (*p<10)
4502 repsize += 2+1+1;
4503 else if (*p<100)
4504 repsize += 2+2+1;
4505 else if (*p<1000)
4506 repsize += 2+3+1;
4507 else if (*p<10000)
4508 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004509#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004510 else
4511 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004512#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 else if (*p<100000)
4514 repsize += 2+5+1;
4515 else if (*p<1000000)
4516 repsize += 2+6+1;
4517 else
4518 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004519#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 }
4521 requiredsize = respos+repsize+(endp-collend);
4522 if (requiredsize > ressize) {
4523 if (requiredsize<2*ressize)
4524 requiredsize = 2*ressize;
4525 if (_PyBytes_Resize(&res, requiredsize))
4526 goto onError;
4527 str = PyBytes_AS_STRING(res) + respos;
4528 ressize = requiredsize;
4529 }
4530 /* generate replacement (temporarily (mis)uses p) */
4531 for (p = collstart; p < collend; ++p) {
4532 str += sprintf(str, "&#%d;", (int)*p);
4533 }
4534 p = collend;
4535 break;
4536 default:
4537 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4538 encoding, reason, startp, size, &exc,
4539 collstart-startp, collend-startp, &newpos);
4540 if (repunicode == NULL)
4541 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004542 if (PyBytes_Check(repunicode)) {
4543 /* Directly copy bytes result to output. */
4544 repsize = PyBytes_Size(repunicode);
4545 if (repsize > 1) {
4546 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004547 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004548 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4549 Py_DECREF(repunicode);
4550 goto onError;
4551 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004552 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004553 ressize += repsize-1;
4554 }
4555 memcpy(str, PyBytes_AsString(repunicode), repsize);
4556 str += repsize;
4557 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004558 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004559 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004560 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 /* need more space? (at least enough for what we
4562 have+the replacement+the rest of the string, so
4563 we won't have to check space for encodable characters) */
4564 respos = str - PyBytes_AS_STRING(res);
4565 repsize = PyUnicode_GET_SIZE(repunicode);
4566 requiredsize = respos+repsize+(endp-collend);
4567 if (requiredsize > ressize) {
4568 if (requiredsize<2*ressize)
4569 requiredsize = 2*ressize;
4570 if (_PyBytes_Resize(&res, requiredsize)) {
4571 Py_DECREF(repunicode);
4572 goto onError;
4573 }
4574 str = PyBytes_AS_STRING(res) + respos;
4575 ressize = requiredsize;
4576 }
4577 /* check if there is anything unencodable in the replacement
4578 and copy it to the output */
4579 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4580 c = *uni2;
4581 if (c >= limit) {
4582 raise_encode_exception(&exc, encoding, startp, size,
4583 unicodepos, unicodepos+1, reason);
4584 Py_DECREF(repunicode);
4585 goto onError;
4586 }
4587 *str = (char)c;
4588 }
4589 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004590 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004591 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004592 }
4593 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004594 /* Resize if we allocated to much */
4595 size = str - PyBytes_AS_STRING(res);
4596 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004597 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004598 if (_PyBytes_Resize(&res, size) < 0)
4599 goto onError;
4600 }
4601
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 Py_XDECREF(errorHandler);
4603 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004604 return res;
4605
4606 onError:
4607 Py_XDECREF(res);
4608 Py_XDECREF(errorHandler);
4609 Py_XDECREF(exc);
4610 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611}
4612
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 Py_ssize_t size,
4615 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618}
4619
4620PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4621{
4622 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 PyErr_BadArgument();
4624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625 }
4626 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 PyUnicode_GET_SIZE(unicode),
4628 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629}
4630
4631/* --- 7-bit ASCII Codec -------------------------------------------------- */
4632
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 Py_ssize_t size,
4635 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 PyUnicodeObject *v;
4639 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004640 Py_ssize_t startinpos;
4641 Py_ssize_t endinpos;
4642 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 const char *e;
4644 PyObject *errorHandler = NULL;
4645 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004646
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004648 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004649 Py_UNICODE r = *(unsigned char*)s;
4650 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004651 }
Tim Petersced69f82003-09-16 20:30:58 +00004652
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 v = _PyUnicode_New(size);
4654 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 e = s + size;
4660 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004661 register unsigned char c = (unsigned char)*s;
4662 if (c < 128) {
4663 *p++ = c;
4664 ++s;
4665 }
4666 else {
4667 startinpos = s-starts;
4668 endinpos = startinpos + 1;
4669 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4670 if (unicode_decode_call_errorhandler(
4671 errors, &errorHandler,
4672 "ascii", "ordinal not in range(128)",
4673 &starts, &e, &startinpos, &endinpos, &exc, &s,
4674 &v, &outpos, &p))
4675 goto onError;
4676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004678 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004679 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4680 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681 Py_XDECREF(errorHandler);
4682 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004684
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 Py_XDECREF(errorHandler);
4688 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 return NULL;
4690}
4691
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 Py_ssize_t size,
4694 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697}
4698
4699PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4700{
4701 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 PyErr_BadArgument();
4703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 }
4705 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 PyUnicode_GET_SIZE(unicode),
4707 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708}
4709
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004710#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004711
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004712/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004713
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004714#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004715#define NEED_RETRY
4716#endif
4717
4718/* XXX This code is limited to "true" double-byte encodings, as
4719 a) it assumes an incomplete character consists of a single byte, and
4720 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004722
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.  The byte must satisfy IsDBCSLeadByte(), and in
   addition one of the following must hold for the character CharPrev()
   reports before it: there is none (prev == curr), it does not start with
   a lead byte, or it is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4733
4734/*
4735 * Decode MBCS string into unicode object. If 'final' is set, converts
4736 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4737 */
4738static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 const char *s, /* MBCS string */
4740 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004741 int final,
4742 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004743{
4744 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004745 Py_ssize_t n;
4746 DWORD usize;
4747 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004748
4749 assert(size >= 0);
4750
Victor Stinner554f3f02010-06-16 23:33:54 +00004751 /* check and handle 'errors' arg */
4752 if (errors==NULL || strcmp(errors, "strict")==0)
4753 flags = MB_ERR_INVALID_CHARS;
4754 else if (strcmp(errors, "ignore")==0)
4755 flags = 0;
4756 else {
4757 PyErr_Format(PyExc_ValueError,
4758 "mbcs encoding does not support errors='%s'",
4759 errors);
4760 return -1;
4761 }
4762
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004763 /* Skip trailing lead-byte unless 'final' is set */
4764 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004765 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004766
4767 /* First get the size of the result */
4768 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004769 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4770 if (usize==0)
4771 goto mbcs_decode_error;
4772 } else
4773 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004774
4775 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 /* Create unicode object */
4777 *v = _PyUnicode_New(usize);
4778 if (*v == NULL)
4779 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004780 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004781 }
4782 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004783 /* Extend unicode object */
4784 n = PyUnicode_GET_SIZE(*v);
4785 if (_PyUnicode_Resize(v, n + usize) < 0)
4786 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004787 }
4788
4789 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004790 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004792 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4793 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004795 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004796 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004797
4798mbcs_decode_error:
4799 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4800 we raise a UnicodeDecodeError - else it is a 'generic'
4801 windows error
4802 */
4803 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4804 /* Ideally, we should get reason from FormatMessage - this
4805 is the Windows 2000 English version of the message
4806 */
4807 PyObject *exc = NULL;
4808 const char *reason = "No mapping for the Unicode character exists "
4809 "in the target multi-byte code page.";
4810 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4811 if (exc != NULL) {
4812 PyCodec_StrictErrors(exc);
4813 Py_DECREF(exc);
4814 }
4815 } else {
4816 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4817 }
4818 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004819}
4820
4821PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 Py_ssize_t size,
4823 const char *errors,
4824 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004825{
4826 PyUnicodeObject *v = NULL;
4827 int done;
4828
4829 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004830 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004831
4832#ifdef NEED_RETRY
4833 retry:
4834 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004835 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004836 else
4837#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004838 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004839
4840 if (done < 0) {
4841 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004843 }
4844
4845 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004847
4848#ifdef NEED_RETRY
4849 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004850 s += done;
4851 size -= done;
4852 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004853 }
4854#endif
4855
4856 return (PyObject *)v;
4857}
4858
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004859PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 Py_ssize_t size,
4861 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004862{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004863 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4864}
4865
4866/*
4867 * Convert unicode into string object (MBCS).
4868 * Returns 0 if succeed, -1 otherwise.
4869 */
4870static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004872 int size, /* size of unicode */
4873 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004874{
Victor Stinner554f3f02010-06-16 23:33:54 +00004875 BOOL usedDefaultChar = FALSE;
4876 BOOL *pusedDefaultChar;
4877 int mbcssize;
4878 Py_ssize_t n;
4879 PyObject *exc = NULL;
4880 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004881
4882 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004883
Victor Stinner554f3f02010-06-16 23:33:54 +00004884 /* check and handle 'errors' arg */
4885 if (errors==NULL || strcmp(errors, "strict")==0) {
4886 flags = WC_NO_BEST_FIT_CHARS;
4887 pusedDefaultChar = &usedDefaultChar;
4888 } else if (strcmp(errors, "replace")==0) {
4889 flags = 0;
4890 pusedDefaultChar = NULL;
4891 } else {
4892 PyErr_Format(PyExc_ValueError,
4893 "mbcs encoding does not support errors='%s'",
4894 errors);
4895 return -1;
4896 }
4897
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004898 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004899 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004900 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4901 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 if (mbcssize == 0) {
4903 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4904 return -1;
4905 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004906 /* If we used a default char, then we failed! */
4907 if (pusedDefaultChar && *pusedDefaultChar)
4908 goto mbcs_encode_error;
4909 } else {
4910 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004911 }
4912
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004913 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 /* Create string object */
4915 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4916 if (*repr == NULL)
4917 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004918 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004919 }
4920 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 /* Extend string object */
4922 n = PyBytes_Size(*repr);
4923 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4924 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004925 }
4926
4927 /* Do the conversion */
4928 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004930 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4931 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004932 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4933 return -1;
4934 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004935 if (pusedDefaultChar && *pusedDefaultChar)
4936 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004937 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004938 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004939
4940mbcs_encode_error:
4941 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4942 Py_XDECREF(exc);
4943 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004944}
4945
4946PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 Py_ssize_t size,
4948 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004949{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004950 PyObject *repr = NULL;
4951 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004952
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004953#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004955 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004956 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004957 else
4958#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004959 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004960
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004961 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 Py_XDECREF(repr);
4963 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004964 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004965
4966#ifdef NEED_RETRY
4967 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004968 p += INT_MAX;
4969 size -= INT_MAX;
4970 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004971 }
4972#endif
4973
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004974 return repr;
4975}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004976
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004977PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4978{
4979 if (!PyUnicode_Check(unicode)) {
4980 PyErr_BadArgument();
4981 return NULL;
4982 }
4983 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 PyUnicode_GET_SIZE(unicode),
4985 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004986}
4987
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004988#undef NEED_RETRY
4989
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004990#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004991
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992/* --- Character Mapping Codec -------------------------------------------- */
4993
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 Py_ssize_t size,
4996 PyObject *mapping,
4997 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004999 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005000 Py_ssize_t startinpos;
5001 Py_ssize_t endinpos;
5002 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005003 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004 PyUnicodeObject *v;
5005 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005006 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 PyObject *errorHandler = NULL;
5008 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005009 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005010 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005011
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 /* Default to Latin-1 */
5013 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015
5016 v = _PyUnicode_New(size);
5017 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005023 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 mapstring = PyUnicode_AS_UNICODE(mapping);
5025 maplen = PyUnicode_GET_SIZE(mapping);
5026 while (s < e) {
5027 unsigned char ch = *s;
5028 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 if (ch < maplen)
5031 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 if (x == 0xfffe) {
5034 /* undefined mapping */
5035 outpos = p-PyUnicode_AS_UNICODE(v);
5036 startinpos = s-starts;
5037 endinpos = startinpos+1;
5038 if (unicode_decode_call_errorhandler(
5039 errors, &errorHandler,
5040 "charmap", "character maps to <undefined>",
5041 &starts, &e, &startinpos, &endinpos, &exc, &s,
5042 &v, &outpos, &p)) {
5043 goto onError;
5044 }
5045 continue;
5046 }
5047 *p++ = x;
5048 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005049 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005050 }
5051 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 while (s < e) {
5053 unsigned char ch = *s;
5054 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005055
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5057 w = PyLong_FromLong((long)ch);
5058 if (w == NULL)
5059 goto onError;
5060 x = PyObject_GetItem(mapping, w);
5061 Py_DECREF(w);
5062 if (x == NULL) {
5063 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5064 /* No mapping found means: mapping is undefined. */
5065 PyErr_Clear();
5066 x = Py_None;
5067 Py_INCREF(x);
5068 } else
5069 goto onError;
5070 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005071
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 /* Apply mapping */
5073 if (PyLong_Check(x)) {
5074 long value = PyLong_AS_LONG(x);
5075 if (value < 0 || value > 65535) {
5076 PyErr_SetString(PyExc_TypeError,
5077 "character mapping must be in range(65536)");
5078 Py_DECREF(x);
5079 goto onError;
5080 }
5081 *p++ = (Py_UNICODE)value;
5082 }
5083 else if (x == Py_None) {
5084 /* undefined mapping */
5085 outpos = p-PyUnicode_AS_UNICODE(v);
5086 startinpos = s-starts;
5087 endinpos = startinpos+1;
5088 if (unicode_decode_call_errorhandler(
5089 errors, &errorHandler,
5090 "charmap", "character maps to <undefined>",
5091 &starts, &e, &startinpos, &endinpos, &exc, &s,
5092 &v, &outpos, &p)) {
5093 Py_DECREF(x);
5094 goto onError;
5095 }
5096 Py_DECREF(x);
5097 continue;
5098 }
5099 else if (PyUnicode_Check(x)) {
5100 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005101
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 if (targetsize == 1)
5103 /* 1-1 mapping */
5104 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005105
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 else if (targetsize > 1) {
5107 /* 1-n mapping */
5108 if (targetsize > extrachars) {
5109 /* resize first */
5110 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5111 Py_ssize_t needed = (targetsize - extrachars) + \
5112 (targetsize << 2);
5113 extrachars += needed;
5114 /* XXX overflow detection missing */
5115 if (_PyUnicode_Resize(&v,
5116 PyUnicode_GET_SIZE(v) + needed) < 0) {
5117 Py_DECREF(x);
5118 goto onError;
5119 }
5120 p = PyUnicode_AS_UNICODE(v) + oldpos;
5121 }
5122 Py_UNICODE_COPY(p,
5123 PyUnicode_AS_UNICODE(x),
5124 targetsize);
5125 p += targetsize;
5126 extrachars -= targetsize;
5127 }
5128 /* 1-0 mapping: skip the character */
5129 }
5130 else {
5131 /* wrong return value */
5132 PyErr_SetString(PyExc_TypeError,
5133 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005134 Py_DECREF(x);
5135 goto onError;
5136 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 Py_DECREF(x);
5138 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 }
5141 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5143 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005144 Py_XDECREF(errorHandler);
5145 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005147
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149 Py_XDECREF(errorHandler);
5150 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151 Py_XDECREF(v);
5152 return NULL;
5153}
5154
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005155/* Charmap encoding: the lookup table */
5156
5157struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 PyObject_HEAD
5159 unsigned char level1[32];
5160 int count2, count3;
5161 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005162};
5163
5164static PyObject*
5165encoding_map_size(PyObject *obj, PyObject* args)
5166{
5167 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005168 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005169 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005170}
5171
5172static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005173 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 PyDoc_STR("Return the size (in bytes) of this object") },
5175 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005176};
5177
5178static void
5179encoding_map_dealloc(PyObject* o)
5180{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005181 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005182}
5183
5184static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005185 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005186 "EncodingMap", /*tp_name*/
5187 sizeof(struct encoding_map), /*tp_basicsize*/
5188 0, /*tp_itemsize*/
5189 /* methods */
5190 encoding_map_dealloc, /*tp_dealloc*/
5191 0, /*tp_print*/
5192 0, /*tp_getattr*/
5193 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005194 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 0, /*tp_repr*/
5196 0, /*tp_as_number*/
5197 0, /*tp_as_sequence*/
5198 0, /*tp_as_mapping*/
5199 0, /*tp_hash*/
5200 0, /*tp_call*/
5201 0, /*tp_str*/
5202 0, /*tp_getattro*/
5203 0, /*tp_setattro*/
5204 0, /*tp_as_buffer*/
5205 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5206 0, /*tp_doc*/
5207 0, /*tp_traverse*/
5208 0, /*tp_clear*/
5209 0, /*tp_richcompare*/
5210 0, /*tp_weaklistoffset*/
5211 0, /*tp_iter*/
5212 0, /*tp_iternext*/
5213 encoding_map_methods, /*tp_methods*/
5214 0, /*tp_members*/
5215 0, /*tp_getset*/
5216 0, /*tp_base*/
5217 0, /*tp_dict*/
5218 0, /*tp_descr_get*/
5219 0, /*tp_descr_set*/
5220 0, /*tp_dictoffset*/
5221 0, /*tp_init*/
5222 0, /*tp_alloc*/
5223 0, /*tp_new*/
5224 0, /*tp_free*/
5225 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005226};
5227
5228PyObject*
5229PyUnicode_BuildEncodingMap(PyObject* string)
5230{
5231 Py_UNICODE *decode;
5232 PyObject *result;
5233 struct encoding_map *mresult;
5234 int i;
5235 int need_dict = 0;
5236 unsigned char level1[32];
5237 unsigned char level2[512];
5238 unsigned char *mlevel1, *mlevel2, *mlevel3;
5239 int count2 = 0, count3 = 0;
5240
5241 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5242 PyErr_BadArgument();
5243 return NULL;
5244 }
5245 decode = PyUnicode_AS_UNICODE(string);
5246 memset(level1, 0xFF, sizeof level1);
5247 memset(level2, 0xFF, sizeof level2);
5248
5249 /* If there isn't a one-to-one mapping of NULL to \0,
5250 or if there are non-BMP characters, we need to use
5251 a mapping dictionary. */
5252 if (decode[0] != 0)
5253 need_dict = 1;
5254 for (i = 1; i < 256; i++) {
5255 int l1, l2;
5256 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005257#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005258 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005259#endif
5260 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005261 need_dict = 1;
5262 break;
5263 }
5264 if (decode[i] == 0xFFFE)
5265 /* unmapped character */
5266 continue;
5267 l1 = decode[i] >> 11;
5268 l2 = decode[i] >> 7;
5269 if (level1[l1] == 0xFF)
5270 level1[l1] = count2++;
5271 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005272 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005273 }
5274
5275 if (count2 >= 0xFF || count3 >= 0xFF)
5276 need_dict = 1;
5277
5278 if (need_dict) {
5279 PyObject *result = PyDict_New();
5280 PyObject *key, *value;
5281 if (!result)
5282 return NULL;
5283 for (i = 0; i < 256; i++) {
5284 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005285 key = PyLong_FromLong(decode[i]);
5286 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005287 if (!key || !value)
5288 goto failed1;
5289 if (PyDict_SetItem(result, key, value) == -1)
5290 goto failed1;
5291 Py_DECREF(key);
5292 Py_DECREF(value);
5293 }
5294 return result;
5295 failed1:
5296 Py_XDECREF(key);
5297 Py_XDECREF(value);
5298 Py_DECREF(result);
5299 return NULL;
5300 }
5301
5302 /* Create a three-level trie */
5303 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5304 16*count2 + 128*count3 - 1);
5305 if (!result)
5306 return PyErr_NoMemory();
5307 PyObject_Init(result, &EncodingMapType);
5308 mresult = (struct encoding_map*)result;
5309 mresult->count2 = count2;
5310 mresult->count3 = count3;
5311 mlevel1 = mresult->level1;
5312 mlevel2 = mresult->level23;
5313 mlevel3 = mresult->level23 + 16*count2;
5314 memcpy(mlevel1, level1, 32);
5315 memset(mlevel2, 0xFF, 16*count2);
5316 memset(mlevel3, 0, 128*count3);
5317 count3 = 0;
5318 for (i = 1; i < 256; i++) {
5319 int o1, o2, o3, i2, i3;
5320 if (decode[i] == 0xFFFE)
5321 /* unmapped character */
5322 continue;
5323 o1 = decode[i]>>11;
5324 o2 = (decode[i]>>7) & 0xF;
5325 i2 = 16*mlevel1[o1] + o2;
5326 if (mlevel2[i2] == 0xFF)
5327 mlevel2[i2] = count3++;
5328 o3 = decode[i] & 0x7F;
5329 i3 = 128*mlevel2[i2] + o3;
5330 mlevel3[i3] = i;
5331 }
5332 return result;
5333}
5334
5335static int
5336encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5337{
5338 struct encoding_map *map = (struct encoding_map*)mapping;
5339 int l1 = c>>11;
5340 int l2 = (c>>7) & 0xF;
5341 int l3 = c & 0x7F;
5342 int i;
5343
5344#ifdef Py_UNICODE_WIDE
5345 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005347 }
5348#endif
5349 if (c == 0)
5350 return 0;
5351 /* level 1*/
5352 i = map->level1[l1];
5353 if (i == 0xFF) {
5354 return -1;
5355 }
5356 /* level 2*/
5357 i = map->level23[16*i+l2];
5358 if (i == 0xFF) {
5359 return -1;
5360 }
5361 /* level 3 */
5362 i = map->level23[16*map->count2 + 128*i + l3];
5363 if (i == 0) {
5364 return -1;
5365 }
5366 return i;
5367}
5368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369/* Lookup the character ch in the mapping. If the character
5370 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005371 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373{
Christian Heimes217cfd12007-12-02 14:31:20 +00005374 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005375 PyObject *x;
5376
5377 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005379 x = PyObject_GetItem(mapping, w);
5380 Py_DECREF(w);
5381 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5383 /* No mapping found means: mapping is undefined. */
5384 PyErr_Clear();
5385 x = Py_None;
5386 Py_INCREF(x);
5387 return x;
5388 } else
5389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005391 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005393 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 long value = PyLong_AS_LONG(x);
5395 if (value < 0 || value > 255) {
5396 PyErr_SetString(PyExc_TypeError,
5397 "character mapping must be in range(256)");
5398 Py_DECREF(x);
5399 return NULL;
5400 }
5401 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005403 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 /* wrong return value */
5407 PyErr_Format(PyExc_TypeError,
5408 "character mapping must return integer, bytes or None, not %.400s",
5409 x->ob_type->tp_name);
5410 Py_DECREF(x);
5411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 }
5413}
5414
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005415static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005416charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005417{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005418 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5419 /* exponentially overallocate to minimize reallocations */
5420 if (requiredsize < 2*outsize)
5421 requiredsize = 2*outsize;
5422 if (_PyBytes_Resize(outobj, requiredsize))
5423 return -1;
5424 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005425}
5426
/* Outcome of trying to encode one character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005430/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005431 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 space is available. Return a new reference to the object that
5433 was put in the output buffer, or Py_None, if the mapping was undefined
5434 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005435 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005436static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005437charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005440 PyObject *rep;
5441 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005442 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005443
Christian Heimes90aa7642007-12-19 02:45:37 +00005444 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005445 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005447 if (res == -1)
5448 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 if (outsize<requiredsize)
5450 if (charmapencode_resize(outobj, outpos, requiredsize))
5451 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005452 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 outstart[(*outpos)++] = (char)res;
5454 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005455 }
5456
5457 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005460 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 Py_DECREF(rep);
5462 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005463 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 if (PyLong_Check(rep)) {
5465 Py_ssize_t requiredsize = *outpos+1;
5466 if (outsize<requiredsize)
5467 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5468 Py_DECREF(rep);
5469 return enc_EXCEPTION;
5470 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005471 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005473 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 else {
5475 const char *repchars = PyBytes_AS_STRING(rep);
5476 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5477 Py_ssize_t requiredsize = *outpos+repsize;
5478 if (outsize<requiredsize)
5479 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5480 Py_DECREF(rep);
5481 return enc_EXCEPTION;
5482 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005483 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 memcpy(outstart + *outpos, repchars, repsize);
5485 *outpos += repsize;
5486 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005488 Py_DECREF(rep);
5489 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005490}
5491
5492/* handle an error in PyUnicode_EncodeCharmap
5493 Return 0 on success, -1 on error */
5494static
5495int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005496 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005498 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005499 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005500{
5501 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005502 Py_ssize_t repsize;
5503 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504 Py_UNICODE *uni2;
5505 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005506 Py_ssize_t collstartpos = *inpos;
5507 Py_ssize_t collendpos = *inpos+1;
5508 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 char *encoding = "charmap";
5510 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005511 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513 /* find all unencodable characters */
5514 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005515 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005516 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 int res = encoding_map_lookup(p[collendpos], mapping);
5518 if (res != -1)
5519 break;
5520 ++collendpos;
5521 continue;
5522 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005523
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 rep = charmapencode_lookup(p[collendpos], mapping);
5525 if (rep==NULL)
5526 return -1;
5527 else if (rep!=Py_None) {
5528 Py_DECREF(rep);
5529 break;
5530 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005531 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533 }
5534 /* cache callback name lookup
5535 * (if not done yet, i.e. it's the first error) */
5536 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 if ((errors==NULL) || (!strcmp(errors, "strict")))
5538 *known_errorHandler = 1;
5539 else if (!strcmp(errors, "replace"))
5540 *known_errorHandler = 2;
5541 else if (!strcmp(errors, "ignore"))
5542 *known_errorHandler = 3;
5543 else if (!strcmp(errors, "xmlcharrefreplace"))
5544 *known_errorHandler = 4;
5545 else
5546 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547 }
5548 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005549 case 1: /* strict */
5550 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5551 return -1;
5552 case 2: /* replace */
5553 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 x = charmapencode_output('?', mapping, res, respos);
5555 if (x==enc_EXCEPTION) {
5556 return -1;
5557 }
5558 else if (x==enc_FAILED) {
5559 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5560 return -1;
5561 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005562 }
5563 /* fall through */
5564 case 3: /* ignore */
5565 *inpos = collendpos;
5566 break;
5567 case 4: /* xmlcharrefreplace */
5568 /* generate replacement (temporarily (mis)uses p) */
5569 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 char buffer[2+29+1+1];
5571 char *cp;
5572 sprintf(buffer, "&#%d;", (int)p[collpos]);
5573 for (cp = buffer; *cp; ++cp) {
5574 x = charmapencode_output(*cp, mapping, res, respos);
5575 if (x==enc_EXCEPTION)
5576 return -1;
5577 else if (x==enc_FAILED) {
5578 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5579 return -1;
5580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005581 }
5582 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005583 *inpos = collendpos;
5584 break;
5585 default:
5586 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 encoding, reason, p, size, exceptionObject,
5588 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005589 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005591 if (PyBytes_Check(repunicode)) {
5592 /* Directly copy bytes result to output. */
5593 Py_ssize_t outsize = PyBytes_Size(*res);
5594 Py_ssize_t requiredsize;
5595 repsize = PyBytes_Size(repunicode);
5596 requiredsize = *respos + repsize;
5597 if (requiredsize > outsize)
5598 /* Make room for all additional bytes. */
5599 if (charmapencode_resize(res, respos, requiredsize)) {
5600 Py_DECREF(repunicode);
5601 return -1;
5602 }
5603 memcpy(PyBytes_AsString(*res) + *respos,
5604 PyBytes_AsString(repunicode), repsize);
5605 *respos += repsize;
5606 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005607 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005608 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005609 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005610 /* generate replacement */
5611 repsize = PyUnicode_GET_SIZE(repunicode);
5612 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 x = charmapencode_output(*uni2, mapping, res, respos);
5614 if (x==enc_EXCEPTION) {
5615 return -1;
5616 }
5617 else if (x==enc_FAILED) {
5618 Py_DECREF(repunicode);
5619 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5620 return -1;
5621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005622 }
5623 *inpos = newpos;
5624 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 }
5626 return 0;
5627}
5628
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 Py_ssize_t size,
5631 PyObject *mapping,
5632 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 /* output object */
5635 PyObject *res = NULL;
5636 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005637 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005638 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005639 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640 PyObject *errorHandler = NULL;
5641 PyObject *exc = NULL;
5642 /* the following variable is used for caching string comparisons
5643 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5644 * 3=ignore, 4=xmlcharrefreplace */
5645 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646
5647 /* Default to Latin-1 */
5648 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 /* allocate enough for a simple encoding without
5652 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005653 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 if (res == NULL)
5655 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005656 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 /* try to encode it */
5661 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5662 if (x==enc_EXCEPTION) /* error */
5663 goto onError;
5664 if (x==enc_FAILED) { /* unencodable character */
5665 if (charmap_encoding_error(p, size, &inpos, mapping,
5666 &exc,
5667 &known_errorHandler, &errorHandler, errors,
5668 &res, &respos)) {
5669 goto onError;
5670 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005671 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 else
5673 /* done with this character => adjust input position */
5674 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005678 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005679 if (_PyBytes_Resize(&res, respos) < 0)
5680 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005682 Py_XDECREF(exc);
5683 Py_XDECREF(errorHandler);
5684 return res;
5685
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687 Py_XDECREF(res);
5688 Py_XDECREF(exc);
5689 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 return NULL;
5691}
5692
5693PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695{
5696 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 PyErr_BadArgument();
5698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 }
5700 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 PyUnicode_GET_SIZE(unicode),
5702 mapping,
5703 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704}
5705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706/* create or adjust a UnicodeTranslateError */
5707static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 const Py_UNICODE *unicode, Py_ssize_t size,
5709 Py_ssize_t startpos, Py_ssize_t endpos,
5710 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005713 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 }
5716 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5718 goto onError;
5719 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5720 goto onError;
5721 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5722 goto onError;
5723 return;
5724 onError:
5725 Py_DECREF(*exceptionObject);
5726 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 }
5728}
5729
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730/* raises a UnicodeTranslateError */
5731static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005732 const Py_UNICODE *unicode, Py_ssize_t size,
5733 Py_ssize_t startpos, Py_ssize_t endpos,
5734 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735{
5736 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740}
5741
5742/* error handling callback helper:
5743 build arguments, call the callback and check the arguments,
5744 put the result into newpos and return the replacement string, which
5745 has to be freed by the caller */
5746static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 PyObject **errorHandler,
5748 const char *reason,
5749 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5750 Py_ssize_t startpos, Py_ssize_t endpos,
5751 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005753 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005755 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 PyObject *restuple;
5757 PyObject *resunicode;
5758
5759 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763 }
5764
5765 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769
5770 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005775 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 Py_DECREF(restuple);
5777 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 }
5779 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 &resunicode, &i_newpos)) {
5781 Py_DECREF(restuple);
5782 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005784 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005786 else
5787 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005788 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5790 Py_DECREF(restuple);
5791 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005792 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 Py_INCREF(resunicode);
5794 Py_DECREF(restuple);
5795 return resunicode;
5796}
5797
5798/* Lookup the character ch in the mapping and put the result in result,
5799 which must be decrefed by the caller.
5800 Return 0 on success, -1 on error */
5801static
5802int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5803{
Christian Heimes217cfd12007-12-02 14:31:20 +00005804 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 PyObject *x;
5806
5807 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 x = PyObject_GetItem(mapping, w);
5810 Py_DECREF(w);
5811 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5813 /* No mapping found means: use 1:1 mapping. */
5814 PyErr_Clear();
5815 *result = NULL;
5816 return 0;
5817 } else
5818 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005819 }
5820 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 *result = x;
5822 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005824 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 long value = PyLong_AS_LONG(x);
5826 long max = PyUnicode_GetMax();
5827 if (value < 0 || value > max) {
5828 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005829 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 Py_DECREF(x);
5831 return -1;
5832 }
5833 *result = x;
5834 return 0;
5835 }
5836 else if (PyUnicode_Check(x)) {
5837 *result = x;
5838 return 0;
5839 }
5840 else {
5841 /* wrong return value */
5842 PyErr_SetString(PyExc_TypeError,
5843 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005844 Py_DECREF(x);
5845 return -1;
5846 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847}
5848/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 if not reallocate and adjust various state variables.
5850 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851static
Walter Dörwald4894c302003-10-24 14:25:28 +00005852int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005854{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005855 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005856 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 /* remember old output position */
5858 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5859 /* exponentially overallocate to minimize reallocations */
5860 if (requiredsize < 2 * oldsize)
5861 requiredsize = 2 * oldsize;
5862 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5863 return -1;
5864 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005865 }
5866 return 0;
5867}
5868/* lookup the character, put the result in the output string and adjust
5869 various state variables. Return a new reference to the object that
5870 was put in the output buffer in *result, or Py_None, if the mapping was
5871 undefined (in which case no character was written).
5872 The called must decref result.
5873 Return 0 on success, -1 on error. */
5874static
Walter Dörwald4894c302003-10-24 14:25:28 +00005875int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5877 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878{
Walter Dörwald4894c302003-10-24 14:25:28 +00005879 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 /* not found => default to 1:1 mapping */
5883 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 }
5885 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005887 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 /* no overflow check, because we know that the space is enough */
5889 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 }
5891 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5893 if (repsize==1) {
5894 /* no overflow check, because we know that the space is enough */
5895 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5896 }
5897 else if (repsize!=0) {
5898 /* more than one character */
5899 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5900 (insize - (curinp-startinp)) +
5901 repsize - 1;
5902 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5903 return -1;
5904 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5905 *outp += repsize;
5906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 }
5908 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 return 0;
5911}
5912
5913PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 Py_ssize_t size,
5915 PyObject *mapping,
5916 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918 /* output object */
5919 PyObject *res = NULL;
5920 /* pointers to the beginning and end+1 of input */
5921 const Py_UNICODE *startp = p;
5922 const Py_UNICODE *endp = p + size;
5923 /* pointer into the output */
5924 Py_UNICODE *str;
5925 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005926 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 char *reason = "character maps to <undefined>";
5928 PyObject *errorHandler = NULL;
5929 PyObject *exc = NULL;
5930 /* the following variable is used for caching string comparisons
5931 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5932 * 3=ignore, 4=xmlcharrefreplace */
5933 int known_errorHandler = -1;
5934
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 PyErr_BadArgument();
5937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005939
5940 /* allocate enough for a simple 1:1 translation without
5941 replacements, if we need more, we'll resize */
5942 res = PyUnicode_FromUnicode(NULL, size);
5943 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 /* try to encode it */
5951 PyObject *x = NULL;
5952 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5953 Py_XDECREF(x);
5954 goto onError;
5955 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005956 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 if (x!=Py_None) /* it worked => adjust input pointer */
5958 ++p;
5959 else { /* untranslatable character */
5960 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5961 Py_ssize_t repsize;
5962 Py_ssize_t newpos;
5963 Py_UNICODE *uni2;
5964 /* startpos for collecting untranslatable chars */
5965 const Py_UNICODE *collstart = p;
5966 const Py_UNICODE *collend = p+1;
5967 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 /* find all untranslatable characters */
5970 while (collend < endp) {
5971 if (charmaptranslate_lookup(*collend, mapping, &x))
5972 goto onError;
5973 Py_XDECREF(x);
5974 if (x!=Py_None)
5975 break;
5976 ++collend;
5977 }
5978 /* cache callback name lookup
5979 * (if not done yet, i.e. it's the first error) */
5980 if (known_errorHandler==-1) {
5981 if ((errors==NULL) || (!strcmp(errors, "strict")))
5982 known_errorHandler = 1;
5983 else if (!strcmp(errors, "replace"))
5984 known_errorHandler = 2;
5985 else if (!strcmp(errors, "ignore"))
5986 known_errorHandler = 3;
5987 else if (!strcmp(errors, "xmlcharrefreplace"))
5988 known_errorHandler = 4;
5989 else
5990 known_errorHandler = 0;
5991 }
5992 switch (known_errorHandler) {
5993 case 1: /* strict */
5994 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005995 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 case 2: /* replace */
5997 /* No need to check for space, this is a 1:1 replacement */
5998 for (coll = collstart; coll<collend; ++coll)
5999 *str++ = '?';
6000 /* fall through */
6001 case 3: /* ignore */
6002 p = collend;
6003 break;
6004 case 4: /* xmlcharrefreplace */
6005 /* generate replacement (temporarily (mis)uses p) */
6006 for (p = collstart; p < collend; ++p) {
6007 char buffer[2+29+1+1];
6008 char *cp;
6009 sprintf(buffer, "&#%d;", (int)*p);
6010 if (charmaptranslate_makespace(&res, &str,
6011 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6012 goto onError;
6013 for (cp = buffer; *cp; ++cp)
6014 *str++ = *cp;
6015 }
6016 p = collend;
6017 break;
6018 default:
6019 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6020 reason, startp, size, &exc,
6021 collstart-startp, collend-startp, &newpos);
6022 if (repunicode == NULL)
6023 goto onError;
6024 /* generate replacement */
6025 repsize = PyUnicode_GET_SIZE(repunicode);
6026 if (charmaptranslate_makespace(&res, &str,
6027 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6028 Py_DECREF(repunicode);
6029 goto onError;
6030 }
6031 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6032 *str++ = *uni2;
6033 p = startp + newpos;
6034 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006036 }
6037 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 /* Resize if we allocated to much */
6039 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006040 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 if (PyUnicode_Resize(&res, respos) < 0)
6042 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 }
6044 Py_XDECREF(exc);
6045 Py_XDECREF(errorHandler);
6046 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 Py_XDECREF(res);
6050 Py_XDECREF(exc);
6051 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 return NULL;
6053}
6054
6055PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 PyObject *mapping,
6057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058{
6059 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006060
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 str = PyUnicode_FromObject(str);
6062 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 PyUnicode_GET_SIZE(str),
6066 mapping,
6067 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 Py_DECREF(str);
6069 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006070
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 Py_XDECREF(str);
6073 return NULL;
6074}
Tim Petersced69f82003-09-16 20:30:58 +00006075
Guido van Rossum9e896b32000-04-05 20:11:21 +00006076/* --- Decimal Encoder ---------------------------------------------------- */
6077
6078int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 Py_ssize_t length,
6080 char *output,
6081 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006082{
6083 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006084 PyObject *errorHandler = NULL;
6085 PyObject *exc = NULL;
6086 const char *encoding = "decimal";
6087 const char *reason = "invalid decimal Unicode string";
6088 /* the following variable is used for caching string comparisons
6089 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6090 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006091
6092 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 PyErr_BadArgument();
6094 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006095 }
6096
6097 p = s;
6098 end = s + length;
6099 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 register Py_UNICODE ch = *p;
6101 int decimal;
6102 PyObject *repunicode;
6103 Py_ssize_t repsize;
6104 Py_ssize_t newpos;
6105 Py_UNICODE *uni2;
6106 Py_UNICODE *collstart;
6107 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006108
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006110 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 ++p;
6112 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006113 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 decimal = Py_UNICODE_TODECIMAL(ch);
6115 if (decimal >= 0) {
6116 *output++ = '0' + decimal;
6117 ++p;
6118 continue;
6119 }
6120 if (0 < ch && ch < 256) {
6121 *output++ = (char)ch;
6122 ++p;
6123 continue;
6124 }
6125 /* All other characters are considered unencodable */
6126 collstart = p;
6127 collend = p+1;
6128 while (collend < end) {
6129 if ((0 < *collend && *collend < 256) ||
6130 !Py_UNICODE_ISSPACE(*collend) ||
6131 Py_UNICODE_TODECIMAL(*collend))
6132 break;
6133 }
6134 /* cache callback name lookup
6135 * (if not done yet, i.e. it's the first error) */
6136 if (known_errorHandler==-1) {
6137 if ((errors==NULL) || (!strcmp(errors, "strict")))
6138 known_errorHandler = 1;
6139 else if (!strcmp(errors, "replace"))
6140 known_errorHandler = 2;
6141 else if (!strcmp(errors, "ignore"))
6142 known_errorHandler = 3;
6143 else if (!strcmp(errors, "xmlcharrefreplace"))
6144 known_errorHandler = 4;
6145 else
6146 known_errorHandler = 0;
6147 }
6148 switch (known_errorHandler) {
6149 case 1: /* strict */
6150 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6151 goto onError;
6152 case 2: /* replace */
6153 for (p = collstart; p < collend; ++p)
6154 *output++ = '?';
6155 /* fall through */
6156 case 3: /* ignore */
6157 p = collend;
6158 break;
6159 case 4: /* xmlcharrefreplace */
6160 /* generate replacement (temporarily (mis)uses p) */
6161 for (p = collstart; p < collend; ++p)
6162 output += sprintf(output, "&#%d;", (int)*p);
6163 p = collend;
6164 break;
6165 default:
6166 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6167 encoding, reason, s, length, &exc,
6168 collstart-s, collend-s, &newpos);
6169 if (repunicode == NULL)
6170 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006171 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006172 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006173 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6174 Py_DECREF(repunicode);
6175 goto onError;
6176 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 /* generate replacement */
6178 repsize = PyUnicode_GET_SIZE(repunicode);
6179 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6180 Py_UNICODE ch = *uni2;
6181 if (Py_UNICODE_ISSPACE(ch))
6182 *output++ = ' ';
6183 else {
6184 decimal = Py_UNICODE_TODECIMAL(ch);
6185 if (decimal >= 0)
6186 *output++ = '0' + decimal;
6187 else if (0 < ch && ch < 256)
6188 *output++ = (char)ch;
6189 else {
6190 Py_DECREF(repunicode);
6191 raise_encode_exception(&exc, encoding,
6192 s, length, collstart-s, collend-s, reason);
6193 goto onError;
6194 }
6195 }
6196 }
6197 p = s + newpos;
6198 Py_DECREF(repunicode);
6199 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006200 }
6201 /* 0-terminate the output string */
6202 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203 Py_XDECREF(exc);
6204 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006205 return 0;
6206
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 Py_XDECREF(exc);
6209 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006210 return -1;
6211}
6212
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213/* --- Helpers ------------------------------------------------------------ */
6214
Eric Smith8c663262007-08-25 02:26:07 +00006215#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006216#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006217
Thomas Wouters477c8d52006-05-27 19:21:47 +00006218#include "stringlib/count.h"
6219#include "stringlib/find.h"
6220#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006221#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006222
Eric Smith5807c412008-05-11 21:00:57 +00006223#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006224#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006225#include "stringlib/localeutil.h"
6226
/* helper macro to fixup start/end slice values
 *
 * Clamps `end` to `len`; a negative `end` or `start` is taken relative
 * to `len` and clamped to 0 if still negative.  `start` and `end` must
 * be modifiable lvalues; they are updated in place.
 *
 * Wrapped in do { } while (0) so that the expansion is a single
 * statement and can be used safely in an unbraced if/else.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006241
Martin v. Löwis18e16552006-02-15 17:27:45 +00006242Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006243 PyObject *substr,
6244 Py_ssize_t start,
6245 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006247 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006248 PyUnicodeObject* str_obj;
6249 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006250
Thomas Wouters477c8d52006-05-27 19:21:47 +00006251 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6252 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006254 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6255 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 Py_DECREF(str_obj);
6257 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 }
Tim Petersced69f82003-09-16 20:30:58 +00006259
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006260 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006261 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006262 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6263 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006264 );
6265
6266 Py_DECREF(sub_obj);
6267 Py_DECREF(str_obj);
6268
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 return result;
6270}
6271
Martin v. Löwis18e16552006-02-15 17:27:45 +00006272Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006273 PyObject *sub,
6274 Py_ssize_t start,
6275 Py_ssize_t end,
6276 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006278 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006279
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006281 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006283 sub = PyUnicode_FromObject(sub);
6284 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 Py_DECREF(str);
6286 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 }
Tim Petersced69f82003-09-16 20:30:58 +00006288
Thomas Wouters477c8d52006-05-27 19:21:47 +00006289 if (direction > 0)
6290 result = stringlib_find_slice(
6291 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6292 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6293 start, end
6294 );
6295 else
6296 result = stringlib_rfind_slice(
6297 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6298 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6299 start, end
6300 );
6301
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006303 Py_DECREF(sub);
6304
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 return result;
6306}
6307
Tim Petersced69f82003-09-16 20:30:58 +00006308static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 PyUnicodeObject *substring,
6311 Py_ssize_t start,
6312 Py_ssize_t end,
6313 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 if (substring->length == 0)
6316 return 1;
6317
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006318 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 end -= substring->length;
6320 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322
6323 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 if (Py_UNICODE_MATCH(self, end, substring))
6325 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326 } else {
6327 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 }
6330
6331 return 0;
6332}
6333
Martin v. Löwis18e16552006-02-15 17:27:45 +00006334Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 PyObject *substr,
6336 Py_ssize_t start,
6337 Py_ssize_t end,
6338 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006340 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006341
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 str = PyUnicode_FromObject(str);
6343 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 substr = PyUnicode_FromObject(substr);
6346 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 Py_DECREF(str);
6348 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 }
Tim Petersced69f82003-09-16 20:30:58 +00006350
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 (PyUnicodeObject *)substr,
6353 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 Py_DECREF(str);
6355 Py_DECREF(substr);
6356 return result;
6357}
6358
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359/* Apply fixfct filter to the Unicode object self and return a
6360 reference to the modified object */
6361
Tim Petersced69f82003-09-16 20:30:58 +00006362static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365{
6366
6367 PyUnicodeObject *u;
6368
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006369 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006372
6373 Py_UNICODE_COPY(u->str, self->str, self->length);
6374
Tim Peters7a29bd52001-09-12 03:03:31 +00006375 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 /* fixfct should return TRUE if it modified the buffer. If
6377 FALSE, return a reference to the original buffer instead
6378 (to save space, not time) */
6379 Py_INCREF(self);
6380 Py_DECREF(u);
6381 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 }
6383 return (PyObject*) u;
6384}
6385
Tim Petersced69f82003-09-16 20:30:58 +00006386static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387int fixupper(PyUnicodeObject *self)
6388{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006389 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 Py_UNICODE *s = self->str;
6391 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006392
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006395
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 ch = Py_UNICODE_TOUPPER(*s);
6397 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 *s = ch;
6400 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 s++;
6402 }
6403
6404 return status;
6405}
6406
Tim Petersced69f82003-09-16 20:30:58 +00006407static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408int fixlower(PyUnicodeObject *self)
6409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006410 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 Py_UNICODE *s = self->str;
6412 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006413
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006416
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 ch = Py_UNICODE_TOLOWER(*s);
6418 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 *s = ch;
6421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 s++;
6423 }
6424
6425 return status;
6426}
6427
Tim Petersced69f82003-09-16 20:30:58 +00006428static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429int fixswapcase(PyUnicodeObject *self)
6430{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006431 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 Py_UNICODE *s = self->str;
6433 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006434
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 while (len-- > 0) {
6436 if (Py_UNICODE_ISUPPER(*s)) {
6437 *s = Py_UNICODE_TOLOWER(*s);
6438 status = 1;
6439 } else if (Py_UNICODE_ISLOWER(*s)) {
6440 *s = Py_UNICODE_TOUPPER(*s);
6441 status = 1;
6442 }
6443 s++;
6444 }
6445
6446 return status;
6447}
6448
Tim Petersced69f82003-09-16 20:30:58 +00006449static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450int fixcapitalize(PyUnicodeObject *self)
6451{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006452 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006453 Py_UNICODE *s = self->str;
6454 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006455
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006456 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006458 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 *s = Py_UNICODE_TOUPPER(*s);
6460 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006462 s++;
6463 while (--len > 0) {
6464 if (Py_UNICODE_ISUPPER(*s)) {
6465 *s = Py_UNICODE_TOLOWER(*s);
6466 status = 1;
6467 }
6468 s++;
6469 }
6470 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471}
6472
6473static
6474int fixtitle(PyUnicodeObject *self)
6475{
6476 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6477 register Py_UNICODE *e;
6478 int previous_is_cased;
6479
6480 /* Shortcut for single character strings */
6481 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6483 if (*p != ch) {
6484 *p = ch;
6485 return 1;
6486 }
6487 else
6488 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 }
Tim Petersced69f82003-09-16 20:30:58 +00006490
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 e = p + PyUnicode_GET_SIZE(self);
6492 previous_is_cased = 0;
6493 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006495
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 if (previous_is_cased)
6497 *p = Py_UNICODE_TOLOWER(ch);
6498 else
6499 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006500
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 if (Py_UNICODE_ISLOWER(ch) ||
6502 Py_UNICODE_ISUPPER(ch) ||
6503 Py_UNICODE_ISTITLE(ch))
6504 previous_is_cased = 1;
6505 else
6506 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 }
6508 return 1;
6509}
6510
Tim Peters8ce9f162004-08-27 01:49:32 +00006511PyObject *
6512PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513{
Skip Montanaro6543b452004-09-16 03:28:13 +00006514 const Py_UNICODE blank = ' ';
6515 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006517 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006518 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6519 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006520 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6521 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006522 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006523 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
Tim Peters05eba1f2004-08-27 21:32:02 +00006525 fseq = PySequence_Fast(seq, "");
6526 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006527 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006528 }
6529
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006530 /* NOTE: the following code can't call back into Python code,
6531 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006532 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006533
Tim Peters05eba1f2004-08-27 21:32:02 +00006534 seqlen = PySequence_Fast_GET_SIZE(fseq);
6535 /* If empty sequence, return u"". */
6536 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006537 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6538 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006539 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006540 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006541 /* If singleton sequence with an exact Unicode, return that. */
6542 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 item = items[0];
6544 if (PyUnicode_CheckExact(item)) {
6545 Py_INCREF(item);
6546 res = (PyUnicodeObject *)item;
6547 goto Done;
6548 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006549 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006550 else {
6551 /* Set up sep and seplen */
6552 if (separator == NULL) {
6553 sep = &blank;
6554 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006555 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006556 else {
6557 if (!PyUnicode_Check(separator)) {
6558 PyErr_Format(PyExc_TypeError,
6559 "separator: expected str instance,"
6560 " %.80s found",
6561 Py_TYPE(separator)->tp_name);
6562 goto onError;
6563 }
6564 sep = PyUnicode_AS_UNICODE(separator);
6565 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006566 }
6567 }
6568
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006569 /* There are at least two things to join, or else we have a subclass
6570 * of str in the sequence.
6571 * Do a pre-pass to figure out the total amount of space we'll
6572 * need (sz), and see whether all argument are strings.
6573 */
6574 sz = 0;
6575 for (i = 0; i < seqlen; i++) {
6576 const Py_ssize_t old_sz = sz;
6577 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 if (!PyUnicode_Check(item)) {
6579 PyErr_Format(PyExc_TypeError,
6580 "sequence item %zd: expected str instance,"
6581 " %.80s found",
6582 i, Py_TYPE(item)->tp_name);
6583 goto onError;
6584 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006585 sz += PyUnicode_GET_SIZE(item);
6586 if (i != 0)
6587 sz += seplen;
6588 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6589 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006591 goto onError;
6592 }
6593 }
Tim Petersced69f82003-09-16 20:30:58 +00006594
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006595 res = _PyUnicode_New(sz);
6596 if (res == NULL)
6597 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006598
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006599 /* Catenate everything. */
6600 res_p = PyUnicode_AS_UNICODE(res);
6601 for (i = 0; i < seqlen; ++i) {
6602 Py_ssize_t itemlen;
6603 item = items[i];
6604 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 /* Copy item, and maybe the separator. */
6606 if (i) {
6607 Py_UNICODE_COPY(res_p, sep, seplen);
6608 res_p += seplen;
6609 }
6610 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6611 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006612 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006613
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006615 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 return (PyObject *)res;
6617
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006619 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006620 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 return NULL;
6622}
6623
Tim Petersced69f82003-09-16 20:30:58 +00006624static
6625PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 Py_ssize_t left,
6627 Py_ssize_t right,
6628 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
6630 PyUnicodeObject *u;
6631
6632 if (left < 0)
6633 left = 0;
6634 if (right < 0)
6635 right = 0;
6636
Tim Peters7a29bd52001-09-12 03:03:31 +00006637 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 Py_INCREF(self);
6639 return self;
6640 }
6641
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006642 if (left > PY_SSIZE_T_MAX - self->length ||
6643 right > PY_SSIZE_T_MAX - (left + self->length)) {
6644 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6645 return NULL;
6646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 u = _PyUnicode_New(left + self->length + right);
6648 if (u) {
6649 if (left)
6650 Py_UNICODE_FILL(u->str, fill, left);
6651 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6652 if (right)
6653 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6654 }
6655
6656 return u;
6657}
6658
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006659PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662
6663 string = PyUnicode_FromObject(string);
6664 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006667 list = stringlib_splitlines(
6668 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6669 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670
6671 Py_DECREF(string);
6672 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673}
6674
Tim Petersced69f82003-09-16 20:30:58 +00006675static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 PyUnicodeObject *substring,
6678 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006681 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006684 return stringlib_split_whitespace(
6685 (PyObject*) self, self->str, self->length, maxcount
6686 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006688 return stringlib_split(
6689 (PyObject*) self, self->str, self->length,
6690 substring->str, substring->length,
6691 maxcount
6692 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693}
6694
Tim Petersced69f82003-09-16 20:30:58 +00006695static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006696PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 PyUnicodeObject *substring,
6698 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006699{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006700 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006701 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006702
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006703 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006704 return stringlib_rsplit_whitespace(
6705 (PyObject*) self, self->str, self->length, maxcount
6706 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006707
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006708 return stringlib_rsplit(
6709 (PyObject*) self, self->str, self->length,
6710 substring->str, substring->length,
6711 maxcount
6712 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006713}
6714
6715static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 PyUnicodeObject *str1,
6718 PyUnicodeObject *str2,
6719 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
6721 PyUnicodeObject *u;
6722
6723 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006725 else if (maxcount == 0 || self->length == 0)
6726 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727
Thomas Wouters477c8d52006-05-27 19:21:47 +00006728 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006729 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006730 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006731 if (str1->length == 0)
6732 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006733 if (str1->length == 1) {
6734 /* replace characters */
6735 Py_UNICODE u1, u2;
6736 if (!findchar(self->str, self->length, str1->str[0]))
6737 goto nothing;
6738 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6739 if (!u)
6740 return NULL;
6741 Py_UNICODE_COPY(u->str, self->str, self->length);
6742 u1 = str1->str[0];
6743 u2 = str2->str[0];
6744 for (i = 0; i < u->length; i++)
6745 if (u->str[i] == u1) {
6746 if (--maxcount < 0)
6747 break;
6748 u->str[i] = u2;
6749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006751 i = stringlib_find(
6752 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006754 if (i < 0)
6755 goto nothing;
6756 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6757 if (!u)
6758 return NULL;
6759 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006760
6761 /* change everything in-place, starting with this one */
6762 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6763 i += str1->length;
6764
6765 while ( --maxcount > 0) {
6766 i = stringlib_find(self->str+i, self->length-i,
6767 str1->str, str1->length,
6768 i);
6769 if (i == -1)
6770 break;
6771 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6772 i += str1->length;
6773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006776
6777 Py_ssize_t n, i, j, e;
6778 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 Py_UNICODE *p;
6780
6781 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006782 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6783 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006784 if (n == 0)
6785 goto nothing;
6786 /* new_size = self->length + n * (str2->length - str1->length)); */
6787 delta = (str2->length - str1->length);
6788 if (delta == 0) {
6789 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006791 product = n * (str2->length - str1->length);
6792 if ((product / (str2->length - str1->length)) != n) {
6793 PyErr_SetString(PyExc_OverflowError,
6794 "replace string is too long");
6795 return NULL;
6796 }
6797 new_size = self->length + product;
6798 if (new_size < 0) {
6799 PyErr_SetString(PyExc_OverflowError,
6800 "replace string is too long");
6801 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 }
6803 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006804 u = _PyUnicode_New(new_size);
6805 if (!u)
6806 return NULL;
6807 i = 0;
6808 p = u->str;
6809 e = self->length - str1->length;
6810 if (str1->length > 0) {
6811 while (n-- > 0) {
6812 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006813 j = stringlib_find(self->str+i, self->length-i,
6814 str1->str, str1->length,
6815 i);
6816 if (j == -1)
6817 break;
6818 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006819 /* copy unchanged part [i:j] */
6820 Py_UNICODE_COPY(p, self->str+i, j-i);
6821 p += j - i;
6822 }
6823 /* copy substitution string */
6824 if (str2->length > 0) {
6825 Py_UNICODE_COPY(p, str2->str, str2->length);
6826 p += str2->length;
6827 }
6828 i = j + str1->length;
6829 }
6830 if (i < self->length)
6831 /* copy tail [i:] */
6832 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6833 } else {
6834 /* interleave */
6835 while (n > 0) {
6836 Py_UNICODE_COPY(p, str2->str, str2->length);
6837 p += str2->length;
6838 if (--n <= 0)
6839 break;
6840 *p++ = self->str[i++];
6841 }
6842 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006846
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006848 /* nothing to replace; return original string (when possible) */
6849 if (PyUnicode_CheckExact(self)) {
6850 Py_INCREF(self);
6851 return (PyObject *) self;
6852 }
6853 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854}
6855
6856/* --- Unicode Object Methods --------------------------------------------- */
6857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006858PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860\n\
6861Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006862characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863
6864static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006865unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 return fixup(self, fixtitle);
6868}
6869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006870PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872\n\
6873Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006874have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875
6876static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006877unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 return fixup(self, fixcapitalize);
6880}
6881
#if 0
/* Disabled: kept for reference only, this block is not compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self with split(self, NULL, -1), replace every list item by its
   fixcapitalize fixup, and join the items with PyUnicode_Join(NULL, ...).
   Returns NULL if the split or any fixup fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *old = PyList_GET_ITEM(words, k);

        result = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(old);
        PyList_SET_ITEM(words, k, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* result is NULL here when a fixup failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6919
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006920/* Argument converter. Coerces to a single unicode character */
6921
6922static int
6923convert_uc(PyObject *obj, void *addr)
6924{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006925 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6926 PyObject *uniobj;
6927 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006928
Benjamin Peterson14339b62009-01-31 16:36:08 +00006929 uniobj = PyUnicode_FromObject(obj);
6930 if (uniobj == NULL) {
6931 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006933 return 0;
6934 }
6935 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6936 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006938 Py_DECREF(uniobj);
6939 return 0;
6940 }
6941 unistr = PyUnicode_AS_UNICODE(uniobj);
6942 *fillcharloc = unistr[0];
6943 Py_DECREF(uniobj);
6944 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006945}
6946
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006947PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006950Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006951done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952
6953static PyObject *
6954unicode_center(PyUnicodeObject *self, PyObject *args)
6955{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006956 Py_ssize_t marg, left;
6957 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006958 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959
Thomas Woutersde017742006-02-16 19:34:37 +00006960 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 return NULL;
6962
Tim Peters7a29bd52001-09-12 03:03:31 +00006963 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 Py_INCREF(self);
6965 return (PyObject*) self;
6966 }
6967
6968 marg = width - self->length;
6969 left = marg / 2 + (marg & width & 1);
6970
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006971 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972}
6973
Marc-André Lemburge5034372000-08-08 08:04:29 +00006974#if 0
6975
6976/* This code should go into some future Unicode collation support
6977 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006978 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006979
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006980/* speedy UTF-16 code point order comparison */
6981/* gleaned from: */
6982/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6983
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006984static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006985{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006986 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006987 0, 0, 0, 0, 0, 0, 0, 0,
6988 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006989 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006990};
6991
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992static int
6993unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6994{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006995 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006996
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 Py_UNICODE *s1 = str1->str;
6998 Py_UNICODE *s2 = str2->str;
6999
7000 len1 = str1->length;
7001 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007002
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007004 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007005
7006 c1 = *s1++;
7007 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007008
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 if (c1 > (1<<11) * 26)
7010 c1 += utf16Fixup[c1>>11];
7011 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007012 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007013 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007014
7015 if (c1 != c2)
7016 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007017
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007018 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 }
7020
7021 return (len1 < len2) ? -1 : (len1 != len2);
7022}
7023
Marc-André Lemburge5034372000-08-08 08:04:29 +00007024#else
7025
7026static int
7027unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7028{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007029 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007030
7031 Py_UNICODE *s1 = str1->str;
7032 Py_UNICODE *s2 = str2->str;
7033
7034 len1 = str1->length;
7035 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007036
Marc-André Lemburge5034372000-08-08 08:04:29 +00007037 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007038 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007039
Fredrik Lundh45714e92001-06-26 16:39:36 +00007040 c1 = *s1++;
7041 c2 = *s2++;
7042
7043 if (c1 != c2)
7044 return (c1 < c2) ? -1 : 1;
7045
Marc-André Lemburge5034372000-08-08 08:04:29 +00007046 len1--; len2--;
7047 }
7048
7049 return (len1 < len2) ? -1 : (len1 != len2);
7050}
7051
7052#endif
7053
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007057 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7058 return unicode_compare((PyUnicodeObject *)left,
7059 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007060 PyErr_Format(PyExc_TypeError,
7061 "Can't compare %.100s and %.100s",
7062 left->ob_type->tp_name,
7063 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 return -1;
7065}
7066
Martin v. Löwis5b222132007-06-10 09:51:05 +00007067int
7068PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7069{
7070 int i;
7071 Py_UNICODE *id;
7072 assert(PyUnicode_Check(uni));
7073 id = PyUnicode_AS_UNICODE(uni);
7074 /* Compare Unicode string and source character set string */
7075 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 if (id[i] != str[i])
7077 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007078 /* This check keeps Python strings that end in '\0' from comparing equal
7079 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007080 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007082 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007084 return 0;
7085}
7086
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007087
Benjamin Peterson29060642009-01-31 22:14:21 +00007088#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007089 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007090
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007091PyObject *PyUnicode_RichCompare(PyObject *left,
7092 PyObject *right,
7093 int op)
7094{
7095 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007096
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007097 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7098 PyObject *v;
7099 if (((PyUnicodeObject *) left)->length !=
7100 ((PyUnicodeObject *) right)->length) {
7101 if (op == Py_EQ) {
7102 Py_INCREF(Py_False);
7103 return Py_False;
7104 }
7105 if (op == Py_NE) {
7106 Py_INCREF(Py_True);
7107 return Py_True;
7108 }
7109 }
7110 if (left == right)
7111 result = 0;
7112 else
7113 result = unicode_compare((PyUnicodeObject *)left,
7114 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007115
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007116 /* Convert the return value to a Boolean */
7117 switch (op) {
7118 case Py_EQ:
7119 v = TEST_COND(result == 0);
7120 break;
7121 case Py_NE:
7122 v = TEST_COND(result != 0);
7123 break;
7124 case Py_LE:
7125 v = TEST_COND(result <= 0);
7126 break;
7127 case Py_GE:
7128 v = TEST_COND(result >= 0);
7129 break;
7130 case Py_LT:
7131 v = TEST_COND(result == -1);
7132 break;
7133 case Py_GT:
7134 v = TEST_COND(result == 1);
7135 break;
7136 default:
7137 PyErr_BadArgument();
7138 return NULL;
7139 }
7140 Py_INCREF(v);
7141 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007142 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007143
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007144 Py_INCREF(Py_NotImplemented);
7145 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007146}
7147
Guido van Rossum403d68b2000-03-13 15:55:09 +00007148int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007150{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007151 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007152 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007153
7154 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007155 sub = PyUnicode_FromObject(element);
7156 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007157 PyErr_Format(PyExc_TypeError,
7158 "'in <string>' requires string as left operand, not %s",
7159 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007160 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007161 }
7162
Thomas Wouters477c8d52006-05-27 19:21:47 +00007163 str = PyUnicode_FromObject(container);
7164 if (!str) {
7165 Py_DECREF(sub);
7166 return -1;
7167 }
7168
7169 result = stringlib_contains_obj(str, sub);
7170
7171 Py_DECREF(str);
7172 Py_DECREF(sub);
7173
Guido van Rossum403d68b2000-03-13 15:55:09 +00007174 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007175}
7176
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177/* Concat to string or Unicode object giving a new Unicode object. */
7178
7179PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181{
7182 PyUnicodeObject *u = NULL, *v = NULL, *w;
7183
7184 /* Coerce the two arguments */
7185 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7186 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7189 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191
7192 /* Shortcuts */
7193 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 Py_DECREF(v);
7195 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196 }
7197 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 Py_DECREF(u);
7199 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200 }
7201
7202 /* Concat the two Unicode strings */
7203 w = _PyUnicode_New(u->length + v->length);
7204 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 Py_UNICODE_COPY(w->str, u->str, u->length);
7207 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7208
7209 Py_DECREF(u);
7210 Py_DECREF(v);
7211 return (PyObject *)w;
7212
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 Py_XDECREF(u);
7215 Py_XDECREF(v);
7216 return NULL;
7217}
7218
Walter Dörwald1ab83302007-05-18 17:15:44 +00007219void
7220PyUnicode_Append(PyObject **pleft, PyObject *right)
7221{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007222 PyObject *new;
7223 if (*pleft == NULL)
7224 return;
7225 if (right == NULL || !PyUnicode_Check(*pleft)) {
7226 Py_DECREF(*pleft);
7227 *pleft = NULL;
7228 return;
7229 }
7230 new = PyUnicode_Concat(*pleft, right);
7231 Py_DECREF(*pleft);
7232 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007233}
7234
7235void
7236PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7237{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007238 PyUnicode_Append(pleft, right);
7239 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007240}
7241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007242PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007245Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007246string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007247interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
7249static PyObject *
7250unicode_count(PyUnicodeObject *self, PyObject *args)
7251{
7252 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007253 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007254 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 PyObject *result;
7256
Guido van Rossumb8872e62000-05-09 14:14:27 +00007257 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259 return NULL;
7260
7261 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007262 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007265
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007266 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007267 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007268 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007269 substring->str, substring->length,
7270 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007271 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272
7273 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007274
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 return result;
7276}
7277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007278PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007281Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007282to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007283handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7285'xmlcharrefreplace' as well as any other name registered with\n\
7286codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287
7288static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007289unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007291 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 char *encoding = NULL;
7293 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007294 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007295
Benjamin Peterson308d6372009-09-18 21:42:35 +00007296 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7297 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007299 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007300 if (v == NULL)
7301 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007302 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007303 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007304 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007305 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007306 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007307 Py_DECREF(v);
7308 return NULL;
7309 }
7310 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007311
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007313 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007314}
7315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007316PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318\n\
7319Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007320If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321
7322static PyObject*
7323unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7324{
7325 Py_UNICODE *e;
7326 Py_UNICODE *p;
7327 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007328 Py_UNICODE *qe;
7329 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 PyUnicodeObject *u;
7331 int tabsize = 8;
7332
7333 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335
Thomas Wouters7e474022000-07-16 12:04:32 +00007336 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007337 i = 0; /* chars up to and including most recent \n or \r */
7338 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7339 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 for (p = self->str; p < e; p++)
7341 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 if (tabsize > 0) {
7343 incr = tabsize - (j % tabsize); /* cannot overflow */
7344 if (j > PY_SSIZE_T_MAX - incr)
7345 goto overflow1;
7346 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007347 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 if (j > PY_SSIZE_T_MAX - 1)
7351 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 j++;
7353 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007354 if (i > PY_SSIZE_T_MAX - j)
7355 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007357 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 }
7359 }
7360
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007361 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007363
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364 /* Second pass: create output string and fill it */
7365 u = _PyUnicode_New(i + j);
7366 if (!u)
7367 return NULL;
7368
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007369 j = 0; /* same as in first pass */
7370 q = u->str; /* next output char */
7371 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372
7373 for (p = self->str; p < e; p++)
7374 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 if (tabsize > 0) {
7376 i = tabsize - (j % tabsize);
7377 j += i;
7378 while (i--) {
7379 if (q >= qe)
7380 goto overflow2;
7381 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007382 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007384 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 else {
7386 if (q >= qe)
7387 goto overflow2;
7388 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007389 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 if (*p == '\n' || *p == '\r')
7391 j = 0;
7392 }
7393
7394 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007395
7396 overflow2:
7397 Py_DECREF(u);
7398 overflow1:
7399 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401}
7402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007403PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405\n\
7406Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007407such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408arguments start and end are interpreted as in slice notation.\n\
7409\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007410Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
7412static PyObject *
7413unicode_find(PyUnicodeObject *self, PyObject *args)
7414{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007415 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007416 Py_ssize_t start;
7417 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007418 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419
Christian Heimes9cd17752007-11-18 19:35:23 +00007420 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422
Thomas Wouters477c8d52006-05-27 19:21:47 +00007423 result = stringlib_find_slice(
7424 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7425 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7426 start, end
7427 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428
7429 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007430
Christian Heimes217cfd12007-12-02 14:31:20 +00007431 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432}
7433
7434static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007435unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436{
7437 if (index < 0 || index >= self->length) {
7438 PyErr_SetString(PyExc_IndexError, "string index out of range");
7439 return NULL;
7440 }
7441
7442 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7443}
7444
Guido van Rossumc2504932007-09-18 19:42:40 +00007445/* Believe it or not, this produces the same value for ASCII strings
7446 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007447static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007448unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
Guido van Rossumc2504932007-09-18 19:42:40 +00007450 Py_ssize_t len;
7451 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007452 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007453
7454 if (self->hash != -1)
7455 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007456 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007457 p = self->str;
7458 x = *p << 7;
7459 while (--len >= 0)
7460 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007461 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007462 if (x == -1)
7463 x = -2;
7464 self->hash = x;
7465 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466}
7467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007468PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007471Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472
7473static PyObject *
7474unicode_index(PyUnicodeObject *self, PyObject *args)
7475{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007476 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007477 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007478 Py_ssize_t start;
7479 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480
Christian Heimes9cd17752007-11-18 19:35:23 +00007481 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483
Thomas Wouters477c8d52006-05-27 19:21:47 +00007484 result = stringlib_find_slice(
7485 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7486 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7487 start, end
7488 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489
7490 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007491
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 if (result < 0) {
7493 PyErr_SetString(PyExc_ValueError, "substring not found");
7494 return NULL;
7495 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007496
Christian Heimes217cfd12007-12-02 14:31:20 +00007497 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498}
7499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007500PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007503Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007504at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505
7506static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007507unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508{
7509 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7510 register const Py_UNICODE *e;
7511 int cased;
7512
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 /* Shortcut for single character strings */
7514 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007517 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007518 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007520
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521 e = p + PyUnicode_GET_SIZE(self);
7522 cased = 0;
7523 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007525
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7527 return PyBool_FromLong(0);
7528 else if (!cased && Py_UNICODE_ISLOWER(ch))
7529 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007531 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532}
7533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007534PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007537Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007538at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539
7540static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007541unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542{
7543 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7544 register const Py_UNICODE *e;
7545 int cased;
7546
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 /* Shortcut for single character strings */
7548 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007551 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007552 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007554
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555 e = p + PyUnicode_GET_SIZE(self);
7556 cased = 0;
7557 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007559
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7561 return PyBool_FromLong(0);
7562 else if (!cased && Py_UNICODE_ISUPPER(ch))
7563 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007565 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566}
7567
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007568PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007569 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007571Return True if S is a titlecased string and there is at least one\n\
7572character in S, i.e. upper- and titlecase characters may only\n\
7573follow uncased characters and lowercase characters only cased ones.\n\
7574Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575
7576static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007577unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578{
7579 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7580 register const Py_UNICODE *e;
7581 int cased, previous_is_cased;
7582
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583 /* Shortcut for single character strings */
7584 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7586 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007588 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007589 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007591
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 e = p + PyUnicode_GET_SIZE(self);
7593 cased = 0;
7594 previous_is_cased = 0;
7595 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007597
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7599 if (previous_is_cased)
7600 return PyBool_FromLong(0);
7601 previous_is_cased = 1;
7602 cased = 1;
7603 }
7604 else if (Py_UNICODE_ISLOWER(ch)) {
7605 if (!previous_is_cased)
7606 return PyBool_FromLong(0);
7607 previous_is_cased = 1;
7608 cased = 1;
7609 }
7610 else
7611 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007613 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614}
7615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007616PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007619Return True if all characters in S are whitespace\n\
7620and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621
7622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007623unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624{
7625 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7626 register const Py_UNICODE *e;
7627
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 /* Shortcut for single character strings */
7629 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 Py_UNICODE_ISSPACE(*p))
7631 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007633 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007634 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007636
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 e = p + PyUnicode_GET_SIZE(self);
7638 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 if (!Py_UNICODE_ISSPACE(*p))
7640 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007642 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007645PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007647\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007648Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007650
7651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007652unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007653{
7654 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7655 register const Py_UNICODE *e;
7656
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007657 /* Shortcut for single character strings */
7658 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 Py_UNICODE_ISALPHA(*p))
7660 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007661
7662 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007663 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007665
7666 e = p + PyUnicode_GET_SIZE(self);
7667 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 if (!Py_UNICODE_ISALPHA(*p))
7669 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007670 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007671 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007672}
7673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007674PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007676\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007677Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007678and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007679
7680static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007681unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007682{
7683 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7684 register const Py_UNICODE *e;
7685
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007686 /* Shortcut for single character strings */
7687 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 Py_UNICODE_ISALNUM(*p))
7689 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007690
7691 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007692 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007694
7695 e = p + PyUnicode_GET_SIZE(self);
7696 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 if (!Py_UNICODE_ISALNUM(*p))
7698 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007699 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007700 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007701}
7702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007703PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007706Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007707False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708
7709static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007710unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711{
7712 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7713 register const Py_UNICODE *e;
7714
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 /* Shortcut for single character strings */
7716 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 Py_UNICODE_ISDECIMAL(*p))
7718 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007720 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007721 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007723
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 e = p + PyUnicode_GET_SIZE(self);
7725 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 if (!Py_UNICODE_ISDECIMAL(*p))
7727 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007729 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730}
7731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007732PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007735Return True if all characters in S are digits\n\
7736and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
7738static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007739unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740{
7741 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7742 register const Py_UNICODE *e;
7743
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 /* Shortcut for single character strings */
7745 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 Py_UNICODE_ISDIGIT(*p))
7747 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007749 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007750 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007752
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 e = p + PyUnicode_GET_SIZE(self);
7754 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 if (!Py_UNICODE_ISDIGIT(*p))
7756 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007758 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759}
7760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007761PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007764Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007765False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766
7767static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007768unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769{
7770 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7771 register const Py_UNICODE *e;
7772
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 /* Shortcut for single character strings */
7774 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 Py_UNICODE_ISNUMERIC(*p))
7776 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007778 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007779 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007781
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 e = p + PyUnicode_GET_SIZE(self);
7783 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 if (!Py_UNICODE_ISNUMERIC(*p))
7785 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007787 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788}
7789
Martin v. Löwis47383402007-08-15 07:32:56 +00007790int
7791PyUnicode_IsIdentifier(PyObject *self)
7792{
7793 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7794 register const Py_UNICODE *e;
7795
7796 /* Special case for empty strings */
7797 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007799
7800 /* PEP 3131 says that the first character must be in
7801 XID_Start and subsequent characters in XID_Continue,
7802 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007803 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007804 letters, digits, underscore). However, given the current
7805 definition of XID_Start and XID_Continue, it is sufficient
7806 to check just for these, except that _ must be allowed
7807 as starting an identifier. */
7808 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7809 return 0;
7810
7811 e = p + PyUnicode_GET_SIZE(self);
7812 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 if (!_PyUnicode_IsXidContinue(*p))
7814 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007815 }
7816 return 1;
7817}
7818
7819PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007821\n\
7822Return True if S is a valid identifier according\n\
7823to the language definition.");
7824
7825static PyObject*
7826unicode_isidentifier(PyObject *self)
7827{
7828 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7829}
7830
Georg Brandl559e5d72008-06-11 18:37:52 +00007831PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007833\n\
7834Return True if all characters in S are considered\n\
7835printable in repr() or S is empty, False otherwise.");
7836
7837static PyObject*
7838unicode_isprintable(PyObject *self)
7839{
7840 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7841 register const Py_UNICODE *e;
7842
7843 /* Shortcut for single character strings */
7844 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7845 Py_RETURN_TRUE;
7846 }
7847
7848 e = p + PyUnicode_GET_SIZE(self);
7849 for (; p < e; p++) {
7850 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7851 Py_RETURN_FALSE;
7852 }
7853 }
7854 Py_RETURN_TRUE;
7855}
7856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007857PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007858 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859\n\
7860Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007861iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862
7863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007864unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007866 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867}
7868
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870unicode_length(PyUnicodeObject *self)
7871{
7872 return self->length;
7873}
7874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007875PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007878Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007879done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880
7881static PyObject *
7882unicode_ljust(PyUnicodeObject *self, PyObject *args)
7883{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007884 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007885 Py_UNICODE fillchar = ' ';
7886
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007887 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 return NULL;
7889
Tim Peters7a29bd52001-09-12 03:03:31 +00007890 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 Py_INCREF(self);
7892 return (PyObject*) self;
7893 }
7894
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007895 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896}
7897
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007898PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007901Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902
7903static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007904unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906 return fixup(self, fixlower);
7907}
7908
/* Strip types: which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip types above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip type i: the format string past its 3-char "|O:" */
#define STRIPNAME(i) (&stripformat[i][3])
7917
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007918/* externally visible for str.strip(unicode) */
7919PyObject *
7920_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7921{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007922 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7923 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7924 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7925 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7926 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007927
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007929
Benjamin Peterson14339b62009-01-31 16:36:08 +00007930 i = 0;
7931 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7933 i++;
7934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007935 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007936
Benjamin Peterson14339b62009-01-31 16:36:08 +00007937 j = len;
7938 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 do {
7940 j--;
7941 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7942 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007943 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007944
Benjamin Peterson14339b62009-01-31 16:36:08 +00007945 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 Py_INCREF(self);
7947 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007948 }
7949 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007951}
7952
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953
7954static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007955do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007957 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7958 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007959
Benjamin Peterson14339b62009-01-31 16:36:08 +00007960 i = 0;
7961 if (striptype != RIGHTSTRIP) {
7962 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7963 i++;
7964 }
7965 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007966
Benjamin Peterson14339b62009-01-31 16:36:08 +00007967 j = len;
7968 if (striptype != LEFTSTRIP) {
7969 do {
7970 j--;
7971 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7972 j++;
7973 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007974
Benjamin Peterson14339b62009-01-31 16:36:08 +00007975 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7976 Py_INCREF(self);
7977 return (PyObject*)self;
7978 }
7979 else
7980 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981}
7982
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007983
7984static PyObject *
7985do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7986{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007987 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007988
Benjamin Peterson14339b62009-01-31 16:36:08 +00007989 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7990 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007991
Benjamin Peterson14339b62009-01-31 16:36:08 +00007992 if (sep != NULL && sep != Py_None) {
7993 if (PyUnicode_Check(sep))
7994 return _PyUnicode_XStrip(self, striptype, sep);
7995 else {
7996 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 "%s arg must be None or str",
7998 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007999 return NULL;
8000 }
8001 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008002
Benjamin Peterson14339b62009-01-31 16:36:08 +00008003 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008004}
8005
8006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008009\n\
8010Return a copy of the string S with leading and trailing\n\
8011whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008012If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008013
8014static PyObject *
8015unicode_strip(PyUnicodeObject *self, PyObject *args)
8016{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008017 if (PyTuple_GET_SIZE(args) == 0)
8018 return do_strip(self, BOTHSTRIP); /* Common case */
8019 else
8020 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008021}
8022
8023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008024PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008026\n\
8027Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008028If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008029
8030static PyObject *
8031unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8032{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008033 if (PyTuple_GET_SIZE(args) == 0)
8034 return do_strip(self, LEFTSTRIP); /* Common case */
8035 else
8036 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008037}
8038
8039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008040PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008042\n\
8043Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008044If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008045
8046static PyObject *
8047unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8048{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008049 if (PyTuple_GET_SIZE(args) == 0)
8050 return do_strip(self, RIGHTSTRIP); /* Common case */
8051 else
8052 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008053}
8054
8055
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008057unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058{
8059 PyUnicodeObject *u;
8060 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008061 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008062 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063
Georg Brandl222de0f2009-04-12 12:01:50 +00008064 if (len < 1) {
8065 Py_INCREF(unicode_empty);
8066 return (PyObject *)unicode_empty;
8067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068
Tim Peters7a29bd52001-09-12 03:03:31 +00008069 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 /* no repeat, return original string */
8071 Py_INCREF(str);
8072 return (PyObject*) str;
8073 }
Tim Peters8f422462000-09-09 06:13:41 +00008074
8075 /* ensure # of chars needed doesn't overflow int and # of bytes
8076 * needed doesn't overflow size_t
8077 */
8078 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008079 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008080 PyErr_SetString(PyExc_OverflowError,
8081 "repeated string is too long");
8082 return NULL;
8083 }
8084 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8085 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8086 PyErr_SetString(PyExc_OverflowError,
8087 "repeated string is too long");
8088 return NULL;
8089 }
8090 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091 if (!u)
8092 return NULL;
8093
8094 p = u->str;
8095
Georg Brandl222de0f2009-04-12 12:01:50 +00008096 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008097 Py_UNICODE_FILL(p, str->str[0], len);
8098 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008099 Py_ssize_t done = str->length; /* number of characters copied this far */
8100 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008102 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008103 Py_UNICODE_COPY(p+done, p, n);
8104 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 }
8107
8108 return (PyObject*) u;
8109}
8110
8111PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 PyObject *subobj,
8113 PyObject *replobj,
8114 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115{
8116 PyObject *self;
8117 PyObject *str1;
8118 PyObject *str2;
8119 PyObject *result;
8120
8121 self = PyUnicode_FromObject(obj);
8122 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 str1 = PyUnicode_FromObject(subobj);
8125 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 Py_DECREF(self);
8127 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 }
8129 str2 = PyUnicode_FromObject(replobj);
8130 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 Py_DECREF(self);
8132 Py_DECREF(str1);
8133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 }
Tim Petersced69f82003-09-16 20:30:58 +00008135 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 (PyUnicodeObject *)str1,
8137 (PyUnicodeObject *)str2,
8138 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 Py_DECREF(self);
8140 Py_DECREF(str1);
8141 Py_DECREF(str2);
8142 return result;
8143}
8144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008145PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008146 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147\n\
8148Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008149old replaced by new. If the optional argument count is\n\
8150given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151
8152static PyObject*
8153unicode_replace(PyUnicodeObject *self, PyObject *args)
8154{
8155 PyUnicodeObject *str1;
8156 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008157 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 PyObject *result;
8159
Martin v. Löwis18e16552006-02-15 17:27:45 +00008160 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 return NULL;
8162 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8163 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008165 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008166 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 Py_DECREF(str1);
8168 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170
8171 result = replace(self, str1, str2, maxcount);
8172
8173 Py_DECREF(str1);
8174 Py_DECREF(str2);
8175 return result;
8176}
8177
8178static
8179PyObject *unicode_repr(PyObject *unicode)
8180{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008181 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008182 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008183 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8184 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8185
8186 /* XXX(nnorwitz): rather than over-allocating, it would be
8187 better to choose a different scheme. Perhaps scan the
8188 first N-chars of the string and allocate based on that size.
8189 */
8190 /* Initial allocation is based on the longest-possible unichr
8191 escape.
8192
8193 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8194 unichr, so in this case it's the longest unichr escape. In
8195 narrow (UTF-16) builds this is five chars per source unichr
8196 since there are two unichrs in the surrogate pair, so in narrow
8197 (UTF-16) builds it's not the longest unichr escape.
8198
8199 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8200 so in the narrow (UTF-16) build case it's the longest unichr
8201 escape.
8202 */
8203
Walter Dörwald1ab83302007-05-18 17:15:44 +00008204 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008206#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008207 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008208#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008210#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008212 if (repr == NULL)
8213 return NULL;
8214
Walter Dörwald1ab83302007-05-18 17:15:44 +00008215 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008216
8217 /* Add quote */
8218 *p++ = (findchar(s, size, '\'') &&
8219 !findchar(s, size, '"')) ? '"' : '\'';
8220 while (size-- > 0) {
8221 Py_UNICODE ch = *s++;
8222
8223 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008224 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008225 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008226 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008227 continue;
8228 }
8229
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008231 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008232 *p++ = '\\';
8233 *p++ = 't';
8234 }
8235 else if (ch == '\n') {
8236 *p++ = '\\';
8237 *p++ = 'n';
8238 }
8239 else if (ch == '\r') {
8240 *p++ = '\\';
8241 *p++ = 'r';
8242 }
8243
8244 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008245 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008246 *p++ = '\\';
8247 *p++ = 'x';
8248 *p++ = hexdigits[(ch >> 4) & 0x000F];
8249 *p++ = hexdigits[ch & 0x000F];
8250 }
8251
Georg Brandl559e5d72008-06-11 18:37:52 +00008252 /* Copy ASCII characters as-is */
8253 else if (ch < 0x7F) {
8254 *p++ = ch;
8255 }
8256
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008258 else {
8259 Py_UCS4 ucs = ch;
8260
8261#ifndef Py_UNICODE_WIDE
8262 Py_UNICODE ch2 = 0;
8263 /* Get code point from surrogate pair */
8264 if (size > 0) {
8265 ch2 = *s;
8266 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008268 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008270 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008271 size--;
8272 }
8273 }
8274#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008275 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008276 (categories Z* and C* except ASCII space)
8277 */
8278 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8279 /* Map 8-bit characters to '\xhh' */
8280 if (ucs <= 0xff) {
8281 *p++ = '\\';
8282 *p++ = 'x';
8283 *p++ = hexdigits[(ch >> 4) & 0x000F];
8284 *p++ = hexdigits[ch & 0x000F];
8285 }
8286 /* Map 21-bit characters to '\U00xxxxxx' */
8287 else if (ucs >= 0x10000) {
8288 *p++ = '\\';
8289 *p++ = 'U';
8290 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8291 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8292 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8293 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8294 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8295 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8296 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8297 *p++ = hexdigits[ucs & 0x0000000F];
8298 }
8299 /* Map 16-bit characters to '\uxxxx' */
8300 else {
8301 *p++ = '\\';
8302 *p++ = 'u';
8303 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8304 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8305 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8306 *p++ = hexdigits[ucs & 0x000F];
8307 }
8308 }
8309 /* Copy characters as-is */
8310 else {
8311 *p++ = ch;
8312#ifndef Py_UNICODE_WIDE
8313 if (ucs >= 0x10000)
8314 *p++ = ch2;
8315#endif
8316 }
8317 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008318 }
8319 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008320 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008321
8322 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008323 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008324 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325}
8326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008327PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329\n\
8330Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008331such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332arguments start and end are interpreted as in slice notation.\n\
8333\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008334Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335
8336static PyObject *
8337unicode_rfind(PyUnicodeObject *self, PyObject *args)
8338{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008339 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008340 Py_ssize_t start;
8341 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008342 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343
Christian Heimes9cd17752007-11-18 19:35:23 +00008344 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008345 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346
Thomas Wouters477c8d52006-05-27 19:21:47 +00008347 result = stringlib_rfind_slice(
8348 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8349 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8350 start, end
8351 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352
8353 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008354
Christian Heimes217cfd12007-12-02 14:31:20 +00008355 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356}
8357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008358PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008361Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362
8363static PyObject *
8364unicode_rindex(PyUnicodeObject *self, PyObject *args)
8365{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008366 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008367 Py_ssize_t start;
8368 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008369 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370
Christian Heimes9cd17752007-11-18 19:35:23 +00008371 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373
Thomas Wouters477c8d52006-05-27 19:21:47 +00008374 result = stringlib_rfind_slice(
8375 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8376 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8377 start, end
8378 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379
8380 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008381
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 if (result < 0) {
8383 PyErr_SetString(PyExc_ValueError, "substring not found");
8384 return NULL;
8385 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008386 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387}
8388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008389PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008392Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008393done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394
8395static PyObject *
8396unicode_rjust(PyUnicodeObject *self, PyObject *args)
8397{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008398 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008399 Py_UNICODE fillchar = ' ';
8400
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008401 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 return NULL;
8403
Tim Peters7a29bd52001-09-12 03:03:31 +00008404 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 Py_INCREF(self);
8406 return (PyObject*) self;
8407 }
8408
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008409 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410}
8411
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 PyObject *sep,
8414 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415{
8416 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008417
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418 s = PyUnicode_FromObject(s);
8419 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008420 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 if (sep != NULL) {
8422 sep = PyUnicode_FromObject(sep);
8423 if (sep == NULL) {
8424 Py_DECREF(s);
8425 return NULL;
8426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 }
8428
8429 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8430
8431 Py_DECREF(s);
8432 Py_XDECREF(sep);
8433 return result;
8434}
8435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008436PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438\n\
8439Return a list of the words in S, using sep as the\n\
8440delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008441splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008442whitespace string is a separator and empty strings are\n\
8443removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444
8445static PyObject*
8446unicode_split(PyUnicodeObject *self, PyObject *args)
8447{
8448 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008449 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450
Martin v. Löwis18e16552006-02-15 17:27:45 +00008451 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 return NULL;
8453
8454 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460}
8461
Thomas Wouters477c8d52006-05-27 19:21:47 +00008462PyObject *
8463PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8464{
8465 PyObject* str_obj;
8466 PyObject* sep_obj;
8467 PyObject* out;
8468
8469 str_obj = PyUnicode_FromObject(str_in);
8470 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008472 sep_obj = PyUnicode_FromObject(sep_in);
8473 if (!sep_obj) {
8474 Py_DECREF(str_obj);
8475 return NULL;
8476 }
8477
8478 out = stringlib_partition(
8479 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8480 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8481 );
8482
8483 Py_DECREF(sep_obj);
8484 Py_DECREF(str_obj);
8485
8486 return out;
8487}
8488
8489
8490PyObject *
8491PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8492{
8493 PyObject* str_obj;
8494 PyObject* sep_obj;
8495 PyObject* out;
8496
8497 str_obj = PyUnicode_FromObject(str_in);
8498 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008500 sep_obj = PyUnicode_FromObject(sep_in);
8501 if (!sep_obj) {
8502 Py_DECREF(str_obj);
8503 return NULL;
8504 }
8505
8506 out = stringlib_rpartition(
8507 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8508 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8509 );
8510
8511 Py_DECREF(sep_obj);
8512 Py_DECREF(str_obj);
8513
8514 return out;
8515}
8516
8517PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008519\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008520Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008521the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008522found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008523
8524static PyObject*
8525unicode_partition(PyUnicodeObject *self, PyObject *separator)
8526{
8527 return PyUnicode_Partition((PyObject *)self, separator);
8528}
8529
8530PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008531 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008532\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008533Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008534the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008535separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008536
8537static PyObject*
8538unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8539{
8540 return PyUnicode_RPartition((PyObject *)self, separator);
8541}
8542
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008543PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 PyObject *sep,
8545 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008546{
8547 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008548
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008549 s = PyUnicode_FromObject(s);
8550 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008551 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 if (sep != NULL) {
8553 sep = PyUnicode_FromObject(sep);
8554 if (sep == NULL) {
8555 Py_DECREF(s);
8556 return NULL;
8557 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008558 }
8559
8560 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8561
8562 Py_DECREF(s);
8563 Py_XDECREF(sep);
8564 return result;
8565}
8566
8567PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008569\n\
8570Return a list of the words in S, using sep as the\n\
8571delimiter string, starting at the end of the string and\n\
8572working to the front. If maxsplit is given, at most maxsplit\n\
8573splits are done. If sep is not specified, any whitespace string\n\
8574is a separator.");
8575
8576static PyObject*
8577unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8578{
8579 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008580 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008581
Martin v. Löwis18e16552006-02-15 17:27:45 +00008582 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008583 return NULL;
8584
8585 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008587 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008589 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008591}
8592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008593PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595\n\
8596Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008597Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008598is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599
8600static PyObject*
8601unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8602{
Guido van Rossum86662912000-04-11 15:38:46 +00008603 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604
Guido van Rossum86662912000-04-11 15:38:46 +00008605 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 return NULL;
8607
Guido van Rossum86662912000-04-11 15:38:46 +00008608 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609}
8610
8611static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008612PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613{
Walter Dörwald346737f2007-05-31 10:44:43 +00008614 if (PyUnicode_CheckExact(self)) {
8615 Py_INCREF(self);
8616 return self;
8617 } else
8618 /* Subtype -- return genuine unicode string with the same value. */
8619 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8620 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621}
8622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008623PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625\n\
8626Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008627and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628
8629static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008630unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 return fixup(self, fixswapcase);
8633}
8634
Georg Brandlceee0772007-11-27 23:48:05 +00008635PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008637\n\
8638Return a translation table usable for str.translate().\n\
8639If there is only one argument, it must be a dictionary mapping Unicode\n\
8640ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008641Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008642If there are two arguments, they must be strings of equal length, and\n\
8643in the resulting dictionary, each character in x will be mapped to the\n\
8644character at the same position in y. If there is a third argument, it\n\
8645must be a string, whose characters will be mapped to None in the result.");
8646
8647static PyObject*
8648unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8649{
8650 PyObject *x, *y = NULL, *z = NULL;
8651 PyObject *new = NULL, *key, *value;
8652 Py_ssize_t i = 0;
8653 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008654
Georg Brandlceee0772007-11-27 23:48:05 +00008655 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8656 return NULL;
8657 new = PyDict_New();
8658 if (!new)
8659 return NULL;
8660 if (y != NULL) {
8661 /* x must be a string too, of equal length */
8662 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8663 if (!PyUnicode_Check(x)) {
8664 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8665 "be a string if there is a second argument");
8666 goto err;
8667 }
8668 if (PyUnicode_GET_SIZE(x) != ylen) {
8669 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8670 "arguments must have equal length");
8671 goto err;
8672 }
8673 /* create entries for translating chars in x to those in y */
8674 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008675 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8676 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008677 if (!key || !value)
8678 goto err;
8679 res = PyDict_SetItem(new, key, value);
8680 Py_DECREF(key);
8681 Py_DECREF(value);
8682 if (res < 0)
8683 goto err;
8684 }
8685 /* create entries for deleting chars in z */
8686 if (z != NULL) {
8687 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008688 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008689 if (!key)
8690 goto err;
8691 res = PyDict_SetItem(new, key, Py_None);
8692 Py_DECREF(key);
8693 if (res < 0)
8694 goto err;
8695 }
8696 }
8697 } else {
8698 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008699 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008700 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8701 "to maketrans it must be a dict");
8702 goto err;
8703 }
8704 /* copy entries into the new dict, converting string keys to int keys */
8705 while (PyDict_Next(x, &i, &key, &value)) {
8706 if (PyUnicode_Check(key)) {
8707 /* convert string keys to integer keys */
8708 PyObject *newkey;
8709 if (PyUnicode_GET_SIZE(key) != 1) {
8710 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8711 "table must be of length 1");
8712 goto err;
8713 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008714 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008715 if (!newkey)
8716 goto err;
8717 res = PyDict_SetItem(new, newkey, value);
8718 Py_DECREF(newkey);
8719 if (res < 0)
8720 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008721 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008722 /* just keep integer keys */
8723 if (PyDict_SetItem(new, key, value) < 0)
8724 goto err;
8725 } else {
8726 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8727 "be strings or integers");
8728 goto err;
8729 }
8730 }
8731 }
8732 return new;
8733 err:
8734 Py_DECREF(new);
8735 return NULL;
8736}
8737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008738PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740\n\
8741Return a copy of the string S, where all characters have been mapped\n\
8742through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008743Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008744Unmapped characters are left untouched. Characters mapped to None\n\
8745are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746
8747static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008748unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749{
Georg Brandlceee0772007-11-27 23:48:05 +00008750 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751}
8752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008753PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008756Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757
8758static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008759unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761 return fixup(self, fixupper);
8762}
8763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008764PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008767Pad a numeric string S with zeros on the left, to fill a field\n\
8768of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769
8770static PyObject *
8771unicode_zfill(PyUnicodeObject *self, PyObject *args)
8772{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008773 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 PyUnicodeObject *u;
8775
Martin v. Löwis18e16552006-02-15 17:27:45 +00008776 Py_ssize_t width;
8777 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 return NULL;
8779
8780 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008781 if (PyUnicode_CheckExact(self)) {
8782 Py_INCREF(self);
8783 return (PyObject*) self;
8784 }
8785 else
8786 return PyUnicode_FromUnicode(
8787 PyUnicode_AS_UNICODE(self),
8788 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790 }
8791
8792 fill = width - self->length;
8793
8794 u = pad(self, fill, 0, '0');
8795
Walter Dörwald068325e2002-04-15 13:36:47 +00008796 if (u == NULL)
8797 return NULL;
8798
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 if (u->str[fill] == '+' || u->str[fill] == '-') {
8800 /* move sign to beginning of string */
8801 u->str[0] = u->str[fill];
8802 u->str[fill] = '0';
8803 }
8804
8805 return (PyObject*) u;
8806}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807
#if 0
/* Disabled debugging helper: report the current value of numfree as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long cached = numfree;

    return PyLong_FromLong(cached);
}
#endif
8815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008816PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008819Return True if S starts with the specified prefix, False otherwise.\n\
8820With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008821With optional end, stop comparing S at that position.\n\
8822prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823
8824static PyObject *
8825unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008828 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008829 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008830 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008831 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008832 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008834 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8836 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008837 if (PyTuple_Check(subobj)) {
8838 Py_ssize_t i;
8839 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8840 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008842 if (substring == NULL)
8843 return NULL;
8844 result = tailmatch(self, substring, start, end, -1);
8845 Py_DECREF(substring);
8846 if (result) {
8847 Py_RETURN_TRUE;
8848 }
8849 }
8850 /* nothing matched */
8851 Py_RETURN_FALSE;
8852 }
8853 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008856 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008858 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859}
8860
8861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008862PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008865Return True if S ends with the specified suffix, False otherwise.\n\
8866With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008867With optional end, stop comparing S at that position.\n\
8868suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869
8870static PyObject *
8871unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008874 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008876 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008877 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008878 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008880 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8882 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008883 if (PyTuple_Check(subobj)) {
8884 Py_ssize_t i;
8885 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8886 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008888 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008890 result = tailmatch(self, substring, start, end, +1);
8891 Py_DECREF(substring);
8892 if (result) {
8893 Py_RETURN_TRUE;
8894 }
8895 }
8896 Py_RETURN_FALSE;
8897 }
8898 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008902 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008904 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905}
8906
Eric Smith8c663262007-08-25 02:26:07 +00008907#include "stringlib/string_format.h"
8908
8909PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008911\n\
8912");
8913
Eric Smith4a7d76d2008-05-30 18:10:19 +00008914static PyObject *
8915unicode__format__(PyObject* self, PyObject* args)
8916{
8917 PyObject *format_spec;
8918
8919 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8920 return NULL;
8921
8922 return _PyUnicode_FormatAdvanced(self,
8923 PyUnicode_AS_UNICODE(format_spec),
8924 PyUnicode_GET_SIZE(format_spec));
8925}
8926
Eric Smith8c663262007-08-25 02:26:07 +00008927PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008928 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008929\n\
8930");
8931
8932static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008933unicode__sizeof__(PyUnicodeObject *v)
8934{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008935 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8936 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008937}
8938
8939PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008941
8942static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008943unicode_getnewargs(PyUnicodeObject *v)
8944{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008945 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008946}
8947
8948
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949static PyMethodDef unicode_methods[] = {
8950
8951 /* Order is according to common usage: often used methods should
8952 appear first, since lookup is done sequentially. */
8953
Benjamin Peterson308d6372009-09-18 21:42:35 +00008954 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008955 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8956 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008957 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008958 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8959 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8960 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8961 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8962 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8963 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8964 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008965 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008966 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8967 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8968 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008969 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008970 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8971 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8972 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008973 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008974 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008975 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008976 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008977 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8978 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8979 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8980 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8981 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8982 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8983 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8984 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8985 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8986 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8987 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8988 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8989 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8990 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008991 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008992 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008993 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008994 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008995 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00008996 {"maketrans", (PyCFunction) unicode_maketrans,
8997 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008998 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008999#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009000 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001#endif
9002
9003#if 0
9004 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009005 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006#endif
9007
Benjamin Peterson14339b62009-01-31 16:36:08 +00009008 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 {NULL, NULL}
9010};
9011
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009012static PyObject *
9013unicode_mod(PyObject *v, PyObject *w)
9014{
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 if (!PyUnicode_Check(v)) {
9016 Py_INCREF(Py_NotImplemented);
9017 return Py_NotImplemented;
9018 }
9019 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009020}
9021
9022static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009023 0, /*nb_add*/
9024 0, /*nb_subtract*/
9025 0, /*nb_multiply*/
9026 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009027};
9028
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009030 (lenfunc) unicode_length, /* sq_length */
9031 PyUnicode_Concat, /* sq_concat */
9032 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9033 (ssizeargfunc) unicode_getitem, /* sq_item */
9034 0, /* sq_slice */
9035 0, /* sq_ass_item */
9036 0, /* sq_ass_slice */
9037 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038};
9039
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009040static PyObject*
9041unicode_subscript(PyUnicodeObject* self, PyObject* item)
9042{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009043 if (PyIndex_Check(item)) {
9044 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009045 if (i == -1 && PyErr_Occurred())
9046 return NULL;
9047 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009048 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009049 return unicode_getitem(self, i);
9050 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009051 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009052 Py_UNICODE* source_buf;
9053 Py_UNICODE* result_buf;
9054 PyObject* result;
9055
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009056 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009058 return NULL;
9059 }
9060
9061 if (slicelength <= 0) {
9062 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009063 } else if (start == 0 && step == 1 && slicelength == self->length &&
9064 PyUnicode_CheckExact(self)) {
9065 Py_INCREF(self);
9066 return (PyObject *)self;
9067 } else if (step == 1) {
9068 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009069 } else {
9070 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009071 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9072 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009073
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 if (result_buf == NULL)
9075 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009076
9077 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9078 result_buf[i] = source_buf[cur];
9079 }
Tim Petersced69f82003-09-16 20:30:58 +00009080
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009081 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009082 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009083 return result;
9084 }
9085 } else {
9086 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9087 return NULL;
9088 }
9089}
9090
9091static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009092 (lenfunc)unicode_length, /* mp_length */
9093 (binaryfunc)unicode_subscript, /* mp_subscript */
9094 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009095};
9096
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098/* Helpers for PyUnicode_Format() */
9099
9100static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009101getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009103 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 (*p_argidx)++;
9106 if (arglen < 0)
9107 return args;
9108 else
9109 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 }
9111 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009112 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 return NULL;
9114}
9115
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009116/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009118static PyObject *
9119formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009121 char *p;
9122 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009124
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 x = PyFloat_AsDouble(v);
9126 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009127 return NULL;
9128
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009131
Eric Smith0923d1d2009-04-16 20:16:10 +00009132 p = PyOS_double_to_string(x, type, prec,
9133 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009134 if (p == NULL)
9135 return NULL;
9136 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009137 PyMem_Free(p);
9138 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139}
9140
Tim Peters38fd5b62000-09-21 05:43:11 +00009141static PyObject*
9142formatlong(PyObject *val, int flags, int prec, int type)
9143{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009144 char *buf;
9145 int len;
9146 PyObject *str; /* temporary string object. */
9147 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009148
Benjamin Peterson14339b62009-01-31 16:36:08 +00009149 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9150 if (!str)
9151 return NULL;
9152 result = PyUnicode_FromStringAndSize(buf, len);
9153 Py_DECREF(str);
9154 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009155}
9156
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157static int
9158formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009159 size_t buflen,
9160 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009162 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009163 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 if (PyUnicode_GET_SIZE(v) == 1) {
9165 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9166 buf[1] = '\0';
9167 return 1;
9168 }
9169#ifndef Py_UNICODE_WIDE
9170 if (PyUnicode_GET_SIZE(v) == 2) {
9171 /* Decode a valid surrogate pair */
9172 int c0 = PyUnicode_AS_UNICODE(v)[0];
9173 int c1 = PyUnicode_AS_UNICODE(v)[1];
9174 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9175 0xDC00 <= c1 && c1 <= 0xDFFF) {
9176 buf[0] = c0;
9177 buf[1] = c1;
9178 buf[2] = '\0';
9179 return 2;
9180 }
9181 }
9182#endif
9183 goto onError;
9184 }
9185 else {
9186 /* Integer input truncated to a character */
9187 long x;
9188 x = PyLong_AsLong(v);
9189 if (x == -1 && PyErr_Occurred())
9190 goto onError;
9191
9192 if (x < 0 || x > 0x10ffff) {
9193 PyErr_SetString(PyExc_OverflowError,
9194 "%c arg not in range(0x110000)");
9195 return -1;
9196 }
9197
9198#ifndef Py_UNICODE_WIDE
9199 if (x > 0xffff) {
9200 x -= 0x10000;
9201 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9202 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9203 return 2;
9204 }
9205#endif
9206 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009207 buf[1] = '\0';
9208 return 1;
9209 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009210
Benjamin Peterson29060642009-01-31 22:14:21 +00009211 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009212 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009213 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009214 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215}
9216
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so the macro acts as a single
   operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009221
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009223 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224{
9225 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009226 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227 int args_owned = 0;
9228 PyUnicodeObject *result = NULL;
9229 PyObject *dict = NULL;
9230 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009231
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 PyErr_BadInternalCall();
9234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 }
9236 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009237 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239 fmt = PyUnicode_AS_UNICODE(uformat);
9240 fmtcnt = PyUnicode_GET_SIZE(uformat);
9241
9242 reslen = rescnt = fmtcnt + 100;
9243 result = _PyUnicode_New(reslen);
9244 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246 res = PyUnicode_AS_UNICODE(result);
9247
9248 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009249 arglen = PyTuple_Size(args);
9250 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251 }
9252 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009253 arglen = -1;
9254 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009256 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009257 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009258 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259
9260 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009261 if (*fmt != '%') {
9262 if (--rescnt < 0) {
9263 rescnt = fmtcnt + 100;
9264 reslen += rescnt;
9265 if (_PyUnicode_Resize(&result, reslen) < 0)
9266 goto onError;
9267 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9268 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009269 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009270 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009271 }
9272 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009273 /* Got a format specifier */
9274 int flags = 0;
9275 Py_ssize_t width = -1;
9276 int prec = -1;
9277 Py_UNICODE c = '\0';
9278 Py_UNICODE fill;
9279 int isnumok;
9280 PyObject *v = NULL;
9281 PyObject *temp = NULL;
9282 Py_UNICODE *pbuf;
9283 Py_UNICODE sign;
9284 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009285 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286
Benjamin Peterson29060642009-01-31 22:14:21 +00009287 fmt++;
9288 if (*fmt == '(') {
9289 Py_UNICODE *keystart;
9290 Py_ssize_t keylen;
9291 PyObject *key;
9292 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009293
Benjamin Peterson29060642009-01-31 22:14:21 +00009294 if (dict == NULL) {
9295 PyErr_SetString(PyExc_TypeError,
9296 "format requires a mapping");
9297 goto onError;
9298 }
9299 ++fmt;
9300 --fmtcnt;
9301 keystart = fmt;
9302 /* Skip over balanced parentheses */
9303 while (pcount > 0 && --fmtcnt >= 0) {
9304 if (*fmt == ')')
9305 --pcount;
9306 else if (*fmt == '(')
9307 ++pcount;
9308 fmt++;
9309 }
9310 keylen = fmt - keystart - 1;
9311 if (fmtcnt < 0 || pcount > 0) {
9312 PyErr_SetString(PyExc_ValueError,
9313 "incomplete format key");
9314 goto onError;
9315 }
9316#if 0
9317 /* keys are converted to strings using UTF-8 and
9318 then looked up since Python uses strings to hold
9319 variables names etc. in its namespaces and we
9320 wouldn't want to break common idioms. */
9321 key = PyUnicode_EncodeUTF8(keystart,
9322 keylen,
9323 NULL);
9324#else
9325 key = PyUnicode_FromUnicode(keystart, keylen);
9326#endif
9327 if (key == NULL)
9328 goto onError;
9329 if (args_owned) {
9330 Py_DECREF(args);
9331 args_owned = 0;
9332 }
9333 args = PyObject_GetItem(dict, key);
9334 Py_DECREF(key);
9335 if (args == NULL) {
9336 goto onError;
9337 }
9338 args_owned = 1;
9339 arglen = -1;
9340 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009341 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 while (--fmtcnt >= 0) {
9343 switch (c = *fmt++) {
9344 case '-': flags |= F_LJUST; continue;
9345 case '+': flags |= F_SIGN; continue;
9346 case ' ': flags |= F_BLANK; continue;
9347 case '#': flags |= F_ALT; continue;
9348 case '0': flags |= F_ZERO; continue;
9349 }
9350 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009351 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 if (c == '*') {
9353 v = getnextarg(args, arglen, &argidx);
9354 if (v == NULL)
9355 goto onError;
9356 if (!PyLong_Check(v)) {
9357 PyErr_SetString(PyExc_TypeError,
9358 "* wants int");
9359 goto onError;
9360 }
9361 width = PyLong_AsLong(v);
9362 if (width == -1 && PyErr_Occurred())
9363 goto onError;
9364 if (width < 0) {
9365 flags |= F_LJUST;
9366 width = -width;
9367 }
9368 if (--fmtcnt >= 0)
9369 c = *fmt++;
9370 }
9371 else if (c >= '0' && c <= '9') {
9372 width = c - '0';
9373 while (--fmtcnt >= 0) {
9374 c = *fmt++;
9375 if (c < '0' || c > '9')
9376 break;
9377 if ((width*10) / 10 != width) {
9378 PyErr_SetString(PyExc_ValueError,
9379 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009380 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 }
9382 width = width*10 + (c - '0');
9383 }
9384 }
9385 if (c == '.') {
9386 prec = 0;
9387 if (--fmtcnt >= 0)
9388 c = *fmt++;
9389 if (c == '*') {
9390 v = getnextarg(args, arglen, &argidx);
9391 if (v == NULL)
9392 goto onError;
9393 if (!PyLong_Check(v)) {
9394 PyErr_SetString(PyExc_TypeError,
9395 "* wants int");
9396 goto onError;
9397 }
9398 prec = PyLong_AsLong(v);
9399 if (prec == -1 && PyErr_Occurred())
9400 goto onError;
9401 if (prec < 0)
9402 prec = 0;
9403 if (--fmtcnt >= 0)
9404 c = *fmt++;
9405 }
9406 else if (c >= '0' && c <= '9') {
9407 prec = c - '0';
9408 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009409 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 if (c < '0' || c > '9')
9411 break;
9412 if ((prec*10) / 10 != prec) {
9413 PyErr_SetString(PyExc_ValueError,
9414 "prec too big");
9415 goto onError;
9416 }
9417 prec = prec*10 + (c - '0');
9418 }
9419 }
9420 } /* prec */
9421 if (fmtcnt >= 0) {
9422 if (c == 'h' || c == 'l' || c == 'L') {
9423 if (--fmtcnt >= 0)
9424 c = *fmt++;
9425 }
9426 }
9427 if (fmtcnt < 0) {
9428 PyErr_SetString(PyExc_ValueError,
9429 "incomplete format");
9430 goto onError;
9431 }
9432 if (c != '%') {
9433 v = getnextarg(args, arglen, &argidx);
9434 if (v == NULL)
9435 goto onError;
9436 }
9437 sign = 0;
9438 fill = ' ';
9439 switch (c) {
9440
9441 case '%':
9442 pbuf = formatbuf;
9443 /* presume that buffer length is at least 1 */
9444 pbuf[0] = '%';
9445 len = 1;
9446 break;
9447
9448 case 's':
9449 case 'r':
9450 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009451 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009452 temp = v;
9453 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009454 }
9455 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 if (c == 's')
9457 temp = PyObject_Str(v);
9458 else if (c == 'r')
9459 temp = PyObject_Repr(v);
9460 else
9461 temp = PyObject_ASCII(v);
9462 if (temp == NULL)
9463 goto onError;
9464 if (PyUnicode_Check(temp))
9465 /* nothing to do */;
9466 else {
9467 Py_DECREF(temp);
9468 PyErr_SetString(PyExc_TypeError,
9469 "%s argument has non-string str()");
9470 goto onError;
9471 }
9472 }
9473 pbuf = PyUnicode_AS_UNICODE(temp);
9474 len = PyUnicode_GET_SIZE(temp);
9475 if (prec >= 0 && len > prec)
9476 len = prec;
9477 break;
9478
9479 case 'i':
9480 case 'd':
9481 case 'u':
9482 case 'o':
9483 case 'x':
9484 case 'X':
9485 if (c == 'i')
9486 c = 'd';
9487 isnumok = 0;
9488 if (PyNumber_Check(v)) {
9489 PyObject *iobj=NULL;
9490
9491 if (PyLong_Check(v)) {
9492 iobj = v;
9493 Py_INCREF(iobj);
9494 }
9495 else {
9496 iobj = PyNumber_Long(v);
9497 }
9498 if (iobj!=NULL) {
9499 if (PyLong_Check(iobj)) {
9500 isnumok = 1;
9501 temp = formatlong(iobj, flags, prec, c);
9502 Py_DECREF(iobj);
9503 if (!temp)
9504 goto onError;
9505 pbuf = PyUnicode_AS_UNICODE(temp);
9506 len = PyUnicode_GET_SIZE(temp);
9507 sign = 1;
9508 }
9509 else {
9510 Py_DECREF(iobj);
9511 }
9512 }
9513 }
9514 if (!isnumok) {
9515 PyErr_Format(PyExc_TypeError,
9516 "%%%c format: a number is required, "
9517 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9518 goto onError;
9519 }
9520 if (flags & F_ZERO)
9521 fill = '0';
9522 break;
9523
9524 case 'e':
9525 case 'E':
9526 case 'f':
9527 case 'F':
9528 case 'g':
9529 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009530 temp = formatfloat(v, flags, prec, c);
9531 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009533 pbuf = PyUnicode_AS_UNICODE(temp);
9534 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 sign = 1;
9536 if (flags & F_ZERO)
9537 fill = '0';
9538 break;
9539
9540 case 'c':
9541 pbuf = formatbuf;
9542 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9543 if (len < 0)
9544 goto onError;
9545 break;
9546
9547 default:
9548 PyErr_Format(PyExc_ValueError,
9549 "unsupported format character '%c' (0x%x) "
9550 "at index %zd",
9551 (31<=c && c<=126) ? (char)c : '?',
9552 (int)c,
9553 (Py_ssize_t)(fmt - 1 -
9554 PyUnicode_AS_UNICODE(uformat)));
9555 goto onError;
9556 }
9557 if (sign) {
9558 if (*pbuf == '-' || *pbuf == '+') {
9559 sign = *pbuf++;
9560 len--;
9561 }
9562 else if (flags & F_SIGN)
9563 sign = '+';
9564 else if (flags & F_BLANK)
9565 sign = ' ';
9566 else
9567 sign = 0;
9568 }
9569 if (width < len)
9570 width = len;
9571 if (rescnt - (sign != 0) < width) {
9572 reslen -= rescnt;
9573 rescnt = width + fmtcnt + 100;
9574 reslen += rescnt;
9575 if (reslen < 0) {
9576 Py_XDECREF(temp);
9577 PyErr_NoMemory();
9578 goto onError;
9579 }
9580 if (_PyUnicode_Resize(&result, reslen) < 0) {
9581 Py_XDECREF(temp);
9582 goto onError;
9583 }
9584 res = PyUnicode_AS_UNICODE(result)
9585 + reslen - rescnt;
9586 }
9587 if (sign) {
9588 if (fill != ' ')
9589 *res++ = sign;
9590 rescnt--;
9591 if (width > len)
9592 width--;
9593 }
9594 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9595 assert(pbuf[0] == '0');
9596 assert(pbuf[1] == c);
9597 if (fill != ' ') {
9598 *res++ = *pbuf++;
9599 *res++ = *pbuf++;
9600 }
9601 rescnt -= 2;
9602 width -= 2;
9603 if (width < 0)
9604 width = 0;
9605 len -= 2;
9606 }
9607 if (width > len && !(flags & F_LJUST)) {
9608 do {
9609 --rescnt;
9610 *res++ = fill;
9611 } while (--width > len);
9612 }
9613 if (fill == ' ') {
9614 if (sign)
9615 *res++ = sign;
9616 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9617 assert(pbuf[0] == '0');
9618 assert(pbuf[1] == c);
9619 *res++ = *pbuf++;
9620 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009621 }
9622 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009623 Py_UNICODE_COPY(res, pbuf, len);
9624 res += len;
9625 rescnt -= len;
9626 while (--width >= len) {
9627 --rescnt;
9628 *res++ = ' ';
9629 }
9630 if (dict && (argidx < arglen) && c != '%') {
9631 PyErr_SetString(PyExc_TypeError,
9632 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009633 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009634 goto onError;
9635 }
9636 Py_XDECREF(temp);
9637 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 } /* until end */
9639 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009640 PyErr_SetString(PyExc_TypeError,
9641 "not all arguments converted during string formatting");
9642 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643 }
9644
Thomas Woutersa96affe2006-03-12 00:29:36 +00009645 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009646 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009648 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649 }
9650 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651 return (PyObject *)result;
9652
Benjamin Peterson29060642009-01-31 22:14:21 +00009653 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654 Py_XDECREF(result);
9655 Py_DECREF(uformat);
9656 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009657 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658 }
9659 return NULL;
9660}
9661
Jeremy Hylton938ace62002-07-17 16:30:39 +00009662static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009663unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9664
Tim Peters6d6c1a32001-08-02 04:15:00 +00009665static PyObject *
9666unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9667{
Benjamin Peterson29060642009-01-31 22:14:21 +00009668 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009669 static char *kwlist[] = {"object", "encoding", "errors", 0};
9670 char *encoding = NULL;
9671 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009672
Benjamin Peterson14339b62009-01-31 16:36:08 +00009673 if (type != &PyUnicode_Type)
9674 return unicode_subtype_new(type, args, kwds);
9675 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009676 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009677 return NULL;
9678 if (x == NULL)
9679 return (PyObject *)_PyUnicode_New(0);
9680 if (encoding == NULL && errors == NULL)
9681 return PyObject_Str(x);
9682 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009683 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009684}
9685
Guido van Rossume023fe02001-08-30 03:12:59 +00009686static PyObject *
9687unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9688{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009689 PyUnicodeObject *tmp, *pnew;
9690 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009691
Benjamin Peterson14339b62009-01-31 16:36:08 +00009692 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9693 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9694 if (tmp == NULL)
9695 return NULL;
9696 assert(PyUnicode_Check(tmp));
9697 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9698 if (pnew == NULL) {
9699 Py_DECREF(tmp);
9700 return NULL;
9701 }
9702 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9703 if (pnew->str == NULL) {
9704 _Py_ForgetReference((PyObject *)pnew);
9705 PyObject_Del(pnew);
9706 Py_DECREF(tmp);
9707 return PyErr_NoMemory();
9708 }
9709 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9710 pnew->length = n;
9711 pnew->hash = tmp->hash;
9712 Py_DECREF(tmp);
9713 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009714}
9715
/* Docstring for the str type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009722
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009723static PyObject *unicode_iter(PyObject *seq);
9724
/* Type object for str.  Slots are listed positionally in PyTypeObject
   order; a 0 entry leaves the slot unset. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                      /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_reserved */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash*/
    0,                          /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    unicode_iter,               /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
9768
9769/* Initialize the Unicode implementation */
9770
Thomas Wouters78890102000-07-22 19:25:51 +00009771void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009773 int i;
9774
Thomas Wouters477c8d52006-05-27 19:21:47 +00009775 /* XXX - move this array to unicodectype.c ? */
9776 Py_UNICODE linebreak[] = {
9777 0x000A, /* LINE FEED */
9778 0x000D, /* CARRIAGE RETURN */
9779 0x001C, /* FILE SEPARATOR */
9780 0x001D, /* GROUP SEPARATOR */
9781 0x001E, /* RECORD SEPARATOR */
9782 0x0085, /* NEXT LINE */
9783 0x2028, /* LINE SEPARATOR */
9784 0x2029, /* PARAGRAPH SEPARATOR */
9785 };
9786
Fred Drakee4315f52000-05-09 19:53:39 +00009787 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009788 free_list = NULL;
9789 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009791 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009792 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009793
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009794 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009795 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009796 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009797 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009798
9799 /* initialize the linebreak bloom filter */
9800 bloom_linebreak = make_bloom_mask(
9801 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9802 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009803
9804 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805}
9806
9807/* Finalize the Unicode implementation */
9808
Christian Heimesa156e092008-02-16 07:38:31 +00009809int
9810PyUnicode_ClearFreeList(void)
9811{
9812 int freelist_size = numfree;
9813 PyUnicodeObject *u;
9814
9815 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009816 PyUnicodeObject *v = u;
9817 u = *(PyUnicodeObject **)u;
9818 if (v->str)
9819 PyObject_DEL(v->str);
9820 Py_XDECREF(v->defenc);
9821 PyObject_Del(v);
9822 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009823 }
9824 free_list = NULL;
9825 assert(numfree == 0);
9826 return freelist_size;
9827}
9828
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829void
Thomas Wouters78890102000-07-22 19:25:51 +00009830_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009832 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009834 Py_XDECREF(unicode_empty);
9835 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009836
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009837 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009838 if (unicode_latin1[i]) {
9839 Py_DECREF(unicode_latin1[i]);
9840 unicode_latin1[i] = NULL;
9841 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009842 }
Christian Heimesa156e092008-02-16 07:38:31 +00009843 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009845
Walter Dörwald16807132007-05-25 13:52:07 +00009846void
9847PyUnicode_InternInPlace(PyObject **p)
9848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009849 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9850 PyObject *t;
9851 if (s == NULL || !PyUnicode_Check(s))
9852 Py_FatalError(
9853 "PyUnicode_InternInPlace: unicode strings only please!");
9854 /* If it's a subclass, we don't really know what putting
9855 it in the interned dict might do. */
9856 if (!PyUnicode_CheckExact(s))
9857 return;
9858 if (PyUnicode_CHECK_INTERNED(s))
9859 return;
9860 if (interned == NULL) {
9861 interned = PyDict_New();
9862 if (interned == NULL) {
9863 PyErr_Clear(); /* Don't leave an exception */
9864 return;
9865 }
9866 }
9867 /* It might be that the GetItem call fails even
9868 though the key is present in the dictionary,
9869 namely when this happens during a stack overflow. */
9870 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009872 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009873
Benjamin Peterson29060642009-01-31 22:14:21 +00009874 if (t) {
9875 Py_INCREF(t);
9876 Py_DECREF(*p);
9877 *p = t;
9878 return;
9879 }
Walter Dörwald16807132007-05-25 13:52:07 +00009880
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881 PyThreadState_GET()->recursion_critical = 1;
9882 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9883 PyErr_Clear();
9884 PyThreadState_GET()->recursion_critical = 0;
9885 return;
9886 }
9887 PyThreadState_GET()->recursion_critical = 0;
9888 /* The two references in interned are not counted by refcnt.
9889 The deallocator will take care of this */
9890 Py_REFCNT(s) -= 2;
9891 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009892}
9893
9894void
9895PyUnicode_InternImmortal(PyObject **p)
9896{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009897 PyUnicode_InternInPlace(p);
9898 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9899 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9900 Py_INCREF(*p);
9901 }
Walter Dörwald16807132007-05-25 13:52:07 +00009902}
9903
9904PyObject *
9905PyUnicode_InternFromString(const char *cp)
9906{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009907 PyObject *s = PyUnicode_FromString(cp);
9908 if (s == NULL)
9909 return NULL;
9910 PyUnicode_InternInPlace(&s);
9911 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009912}
9913
9914void _Py_ReleaseInternedUnicodeStrings(void)
9915{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009916 PyObject *keys;
9917 PyUnicodeObject *s;
9918 Py_ssize_t i, n;
9919 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009920
Benjamin Peterson14339b62009-01-31 16:36:08 +00009921 if (interned == NULL || !PyDict_Check(interned))
9922 return;
9923 keys = PyDict_Keys(interned);
9924 if (keys == NULL || !PyList_Check(keys)) {
9925 PyErr_Clear();
9926 return;
9927 }
Walter Dörwald16807132007-05-25 13:52:07 +00009928
Benjamin Peterson14339b62009-01-31 16:36:08 +00009929 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9930 detector, interned unicode strings are not forcibly deallocated;
9931 rather, we give them their stolen references back, and then clear
9932 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009933
Benjamin Peterson14339b62009-01-31 16:36:08 +00009934 n = PyList_GET_SIZE(keys);
9935 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009937 for (i = 0; i < n; i++) {
9938 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9939 switch (s->state) {
9940 case SSTATE_NOT_INTERNED:
9941 /* XXX Shouldn't happen */
9942 break;
9943 case SSTATE_INTERNED_IMMORTAL:
9944 Py_REFCNT(s) += 1;
9945 immortal_size += s->length;
9946 break;
9947 case SSTATE_INTERNED_MORTAL:
9948 Py_REFCNT(s) += 2;
9949 mortal_size += s->length;
9950 break;
9951 default:
9952 Py_FatalError("Inconsistent interned string state.");
9953 }
9954 s->state = SSTATE_NOT_INTERNED;
9955 }
9956 fprintf(stderr, "total size of all interned strings: "
9957 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9958 "mortal/immortal\n", mortal_size, immortal_size);
9959 Py_DECREF(keys);
9960 PyDict_Clear(interned);
9961 Py_DECREF(interned);
9962 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009963}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009964
9965
9966/********************* Unicode Iterator **************************/
9967
/* State of a str iterator: the string being walked and the index of the
   next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9973
9974static void
9975unicodeiter_dealloc(unicodeiterobject *it)
9976{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009977 _PyObject_GC_UNTRACK(it);
9978 Py_XDECREF(it->it_seq);
9979 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009980}
9981
9982static int
9983unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9984{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009985 Py_VISIT(it->it_seq);
9986 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009987}
9988
9989static PyObject *
9990unicodeiter_next(unicodeiterobject *it)
9991{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009992 PyUnicodeObject *seq;
9993 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009994
Benjamin Peterson14339b62009-01-31 16:36:08 +00009995 assert(it != NULL);
9996 seq = it->it_seq;
9997 if (seq == NULL)
9998 return NULL;
9999 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010000
Benjamin Peterson14339b62009-01-31 16:36:08 +000010001 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10002 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010004 if (item != NULL)
10005 ++it->it_index;
10006 return item;
10007 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010008
Benjamin Peterson14339b62009-01-31 16:36:08 +000010009 Py_DECREF(seq);
10010 it->it_seq = NULL;
10011 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010012}
10013
10014static PyObject *
10015unicodeiter_len(unicodeiterobject *it)
10016{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010017 Py_ssize_t len = 0;
10018 if (it->it_seq)
10019 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10020 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010021}
10022
/* Docstring shared by the __length_hint__ entry below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table for str iterators; referenced from the tp_methods slot of
   PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
10030
/* Type object for str iterators.  Slots are listed positionally in
   PyTypeObject order; slots after the last one listed are left
   zero-initialised. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
10063
10064static PyObject *
10065unicode_iter(PyObject *seq)
10066{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010067 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010068
Benjamin Peterson14339b62009-01-31 16:36:08 +000010069 if (!PyUnicode_Check(seq)) {
10070 PyErr_BadInternalCall();
10071 return NULL;
10072 }
10073 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10074 if (it == NULL)
10075 return NULL;
10076 it->it_index = 0;
10077 Py_INCREF(seq);
10078 it->it_seq = (PyUnicodeObject *)seq;
10079 _PyObject_GC_TRACK(it);
10080 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010081}
10082
Martin v. Löwis5b222132007-06-10 09:51:05 +000010083size_t
10084Py_UNICODE_strlen(const Py_UNICODE *u)
10085{
10086 int res = 0;
10087 while(*u++)
10088 res++;
10089 return res;
10090}
10091
10092Py_UNICODE*
10093Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10094{
10095 Py_UNICODE *u = s1;
10096 while ((*u++ = *s2++));
10097 return s1;
10098}
10099
10100Py_UNICODE*
10101Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10102{
10103 Py_UNICODE *u = s1;
10104 while ((*u++ = *s2++))
10105 if (n-- == 0)
10106 break;
10107 return s1;
10108}
10109
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010110Py_UNICODE*
10111Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10112{
10113 Py_UNICODE *u1 = s1;
10114 u1 += Py_UNICODE_strlen(u1);
10115 Py_UNICODE_strcpy(u1, s2);
10116 return s1;
10117}
10118
Martin v. Löwis5b222132007-06-10 09:51:05 +000010119int
10120Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10121{
10122 while (*s1 && *s2 && *s1 == *s2)
10123 s1++, s2++;
10124 if (*s1 && *s2)
10125 return (*s1 < *s2) ? -1 : +1;
10126 if (*s1)
10127 return 1;
10128 if (*s2)
10129 return -1;
10130 return 0;
10131}
10132
Victor Stinneref8d95c2010-08-16 22:03:11 +000010133int
10134Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10135{
10136 register Py_UNICODE u1, u2;
10137 for (; n != 0; n--) {
10138 u1 = *s1;
10139 u2 = *s2;
10140 if (u1 != u2)
10141 return (u1 < u2) ? -1 : +1;
10142 if (u1 == '\0')
10143 return 0;
10144 s1++;
10145 s2++;
10146 }
10147 return 0;
10148}
10149
Martin v. Löwis5b222132007-06-10 09:51:05 +000010150Py_UNICODE*
10151Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10152{
10153 const Py_UNICODE *p;
10154 for (p = s; *p; p++)
10155 if (*p == c)
10156 return (Py_UNICODE*)p;
10157 return NULL;
10158}
10159
Victor Stinner331ea922010-08-10 16:37:20 +000010160Py_UNICODE*
10161Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10162{
10163 const Py_UNICODE *p;
10164 p = s + Py_UNICODE_strlen(s);
10165 while (p != s) {
10166 p--;
10167 if (*p == c)
10168 return (Py_UNICODE*)p;
10169 }
10170 return NULL;
10171}
10172
Victor Stinner71133ff2010-09-01 23:43:53 +000010173Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010174PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010175{
10176 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10177 Py_UNICODE *copy;
10178 Py_ssize_t size;
10179
10180 /* Ensure we won't overflow the size. */
10181 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10182 PyErr_NoMemory();
10183 return NULL;
10184 }
10185 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10186 size *= sizeof(Py_UNICODE);
10187 copy = PyMem_Malloc(size);
10188 if (copy == NULL) {
10189 PyErr_NoMemory();
10190 return NULL;
10191 }
10192 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10193 return copy;
10194}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010195
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions exposed by the _string module.  Both entries use METH_O, i.e.
   each takes exactly one argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
10206
/* Module definition for _string, passed to PyModule_Create() by
   PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* module name */
    PyDoc_STR("string helper module"),      /* module docstring */
    0,                                      /* per-module state size */
    _string_methods,                        /* method table */
    NULL,                                   /* remaining optional slots unused */
    NULL,
    NULL,
    NULL
};
10218
10219PyMODINIT_FUNC
10220PyInit__string(void)
10221{
10222 return PyModule_Create(&_string_module);
10223}
10224
10225
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010226#ifdef __cplusplus
10227}
10228#endif