blob: 98427e3c2c74d1d15342ce99a964fc665cb10bf1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Fast detection of the most frequent whitespace characters.
   Entry c is 1 when ASCII code c is treated as whitespace and 0 otherwise.
   Each row below covers 16 consecutive codes. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F
       0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
       0x000B LINE TABULATION, 0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
147
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149 PyObject **errorHandler,const char *encoding, const char *reason,
150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
Victor Stinner31be90b2010-04-22 19:38:16 +0000153static void raise_encode_exception(PyObject **exceptionObject,
154 const char *encoding,
155 const Py_UNICODE *unicode, Py_ssize_t size,
156 Py_ssize_t startpos, Py_ssize_t endpos,
157 const char *reason);
158
/* Same kind of lookup table for linebreaks: entry c is 1 when ASCII code c
   is treated as a line break and 0 otherwise.  Each row below covers 16
   consecutive codes. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F
       0x000A LINE FEED, 0x000B LINE TABULATION,
       0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrouf068f942010-01-13 14:19:12 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
Antoine Pitrouf068f942010-01-13 14:19:12 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
Benjamin Peterson29060642009-01-31 22:14:21 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252}
253
/* Non-zero when chr passes the bloom mask test AND really occurs in
   set[0..setlen).  The whole expansion is parenthesized so the macro can be
   negated or combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000278 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000309 Ux0000 terminated; some code (e.g. new_identifier)
310 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000313 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314
315*/
316
317static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319{
320 register PyUnicodeObject *unicode;
321
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 if (length == 0 && unicode_empty != NULL) {
324 Py_INCREF(unicode_empty);
325 return unicode_empty;
326 }
327
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000328 /* Ensure we won't overflow the size. */
329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330 return (PyUnicodeObject *)PyErr_NoMemory();
331 }
332
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000334 if (free_list) {
335 unicode = free_list;
336 free_list = *(PyUnicodeObject **)unicode;
337 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000338 if (unicode->str) {
339 /* Keep-Alive optimization: we only upsize the buffer,
340 never downsize it. */
341 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000342 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000343 PyObject_DEL(unicode->str);
344 unicode->str = NULL;
345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000346 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000350 }
351 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 if (unicode == NULL)
357 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 }
361
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 PyErr_NoMemory();
364 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000365 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000366 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000367 * the caller fails before initializing str -- unicode_resize()
368 * reads str[0], and the Keep-Alive optimization can keep memory
369 * allocated for str alive across a call to unicode_dealloc(unicode).
370 * We don't want unicode_resize to read uninitialized memory in
371 * that case.
372 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000373 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000377 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000378 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380
Benjamin Peterson29060642009-01-31 22:14:21 +0000381 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000382 /* XXX UNREF/NEWREF interface should be more symmetrical */
383 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000385 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389static
Guido van Rossum9475a232001-10-05 20:51:39 +0000390void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391{
Walter Dörwald16807132007-05-25 13:52:07 +0000392 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_NOT_INTERNED:
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_MORTAL:
397 /* revive dead object temporarily for DelItem */
398 Py_REFCNT(unicode) = 3;
399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400 Py_FatalError(
401 "deletion of interned string failed");
402 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 case SSTATE_INTERNED_IMMORTAL:
405 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406
Benjamin Peterson29060642009-01-31 22:14:21 +0000407 default:
408 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000409 }
410
Guido van Rossum604ddf82001-12-06 20:03:56 +0000411 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000413 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415 PyObject_DEL(unicode->str);
416 unicode->str = NULL;
417 unicode->length = 0;
418 }
419 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000420 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 }
422 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000423 *(PyUnicodeObject **)unicode = free_list;
424 free_list = unicode;
425 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426 }
427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyObject_DEL(unicode->str);
429 Py_XDECREF(unicode->defenc);
430 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432}
433
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436{
437 register PyUnicodeObject *v;
438
439 /* Argument checks */
440 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 PyErr_BadInternalCall();
442 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
449
450 /* Resizing unicode_empty and single character objects is not
451 possible since these are being shared. We simply return a fresh
452 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000453 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000454 (v == unicode_empty || v->length == 1)) {
455 PyUnicodeObject *w = _PyUnicode_New(length);
456 if (w == NULL)
457 return -1;
458 Py_UNICODE_COPY(w->str, v->str,
459 length < v->length ? length : v->length);
460 Py_DECREF(*unicode);
461 *unicode = w;
462 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 }
464
465 /* Note that we don't have to modify *unicode for unshared Unicode
466 objects, since we can modify them in-place. */
467 return unicode_resize(v, length);
468}
469
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477{
478 PyUnicodeObject *unicode;
479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects. */
482 if (u != NULL) {
483
Benjamin Peterson29060642009-01-31 22:14:21 +0000484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000489
490 /* Single character Unicode objects in the Latin-1 range are
491 shared when using this constructor */
492 if (size == 1 && *u < 256) {
493 unicode = unicode_latin1[*u];
494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = *u;
499 unicode_latin1[*u] = unicode;
500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 /* Copy the Unicode data into the new object */
511 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000512 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 return (PyObject *)unicode;
515}
516
Walter Dörwaldd2034312007-05-18 16:29:38 +0000517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518{
519 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000520
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 if (size < 0) {
522 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 return NULL;
525 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000526
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000528 some optimizations which share commonly used objects.
529 Also, this means the input must be UTF-8, so fall back to the
530 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (u != NULL) {
532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533 /* Optimization for empty strings */
534 if (size == 0 && unicode_empty != NULL) {
535 Py_INCREF(unicode_empty);
536 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000537 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000538
539 /* Single characters are shared when using this constructor.
540 Restrict to ASCII, since the input must be UTF-8. */
541 if (size == 1 && Py_CHARMASK(*u) < 128) {
542 unicode = unicode_latin1[Py_CHARMASK(*u)];
543 if (!unicode) {
544 unicode = _PyUnicode_New(1);
545 if (!unicode)
546 return NULL;
547 unicode->str[0] = Py_CHARMASK(*u);
548 unicode_latin1[Py_CHARMASK(*u)] = unicode;
549 }
550 Py_INCREF(unicode);
551 return (PyObject *)unicode;
552 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000553
554 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 }
556
Walter Dörwald55507312007-05-18 13:12:10 +0000557 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 if (!unicode)
559 return NULL;
560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 return (PyObject *)unicode;
562}
563
Walter Dörwaldd2034312007-05-18 16:29:38 +0000564PyObject *PyUnicode_FromString(const char *u)
565{
566 size_t size = strlen(u);
567 if (size > PY_SSIZE_T_MAX) {
568 PyErr_SetString(PyExc_OverflowError, "input too long");
569 return NULL;
570 }
571
572 return PyUnicode_FromStringAndSize(u, size);
573}
574
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575#ifdef HAVE_WCHAR_H
576
Mark Dickinson081dfee2009-03-18 14:47:41 +0000577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584 to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
588{
589 PyUnicodeObject *unicode;
590 register Py_ssize_t i;
591 Py_ssize_t alloc;
592 const wchar_t *orig_w;
593
594 if (w == NULL) {
595 if (size == 0)
596 return PyUnicode_FromStringAndSize(NULL, 0);
597 PyErr_BadInternalCall();
598 return NULL;
599 }
600
601 if (size == -1) {
602 size = wcslen(w);
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000643 if (size == 0)
644 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_BadInternalCall();
646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 }
648
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == -1) {
650 size = wcslen(w);
651 }
652
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 unicode = _PyUnicode_New(size);
654 if (!unicode)
655 return NULL;
656
657 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000660#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000662 register Py_UNICODE *u;
663 register Py_ssize_t i;
664 u = PyUnicode_AS_UNICODE(unicode);
665 for (i = size; i > 0; i--)
666 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668#endif
669
670 return (PyObject *)unicode;
671}
672
Mark Dickinson081dfee2009-03-18 14:47:41 +0000673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
Walter Dörwald346737f2007-05-31 10:44:43 +0000677static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000680{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000681 *fmt++ = '%';
682 if (width) {
683 if (zeropad)
684 *fmt++ = '0';
685 fmt += sprintf(fmt, "%d", width);
686 }
687 if (precision)
688 fmt += sprintf(fmt, ".%d", precision);
689 if (longflag)
690 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000691 else if (longlongflag) {
692 /* longlongflag should only ever be nonzero on machines with
693 HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695 char *f = PY_FORMAT_LONG_LONG;
696 while (*f)
697 *fmt++ = *f++;
698#else
699 /* we shouldn't ever get here */
700 assert(0);
701 *fmt++ = 'l';
702#endif
703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000704 else if (size_tflag) {
705 char *f = PY_FORMAT_SIZE_T;
706 while (*f)
707 *fmt++ = *f++;
708 }
709 *fmt++ = c;
710 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000711}
712
Walter Dörwaldd2034312007-05-18 16:29:38 +0000713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
714
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000715/* size of fixed-size buffer for formatting single arguments */
716#define ITEM_BUFFER_LEN 21
717/* maximum number of characters required for output of %ld. 21 characters
718 allows for 64-bit integers (in decimal) and an optional sign. */
719#define MAX_LONG_CHARS 21
720/* maximum number of characters required for output of %lld.
721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000728 va_list count;
729 Py_ssize_t callcount = 0;
730 PyObject **callresults = NULL;
731 PyObject **callresult = NULL;
732 Py_ssize_t n = 0;
733 int width = 0;
734 int precision = 0;
735 int zeropad;
736 const char* f;
737 Py_UNICODE *s;
738 PyObject *string;
739 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000740 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000741 /* use abuffer instead of buffer, if we need more space
742 * (which can happen if there's a format specifier with width). */
743 char *abuffer = NULL;
744 char *realbuffer;
745 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000749 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000767 else if (128 <= (unsigned char)*f) {
768 PyErr_Format(PyExc_ValueError,
769 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000770 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000771 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000772 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 }
775 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000776 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000777 if (callcount) {
778 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
779 if (!callresults) {
780 PyErr_NoMemory();
781 return NULL;
782 }
783 callresult = callresults;
784 }
785 /* step 3: figure out how large a buffer we need */
786 for (f = format; *f; f++) {
787 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000788#ifdef HAVE_LONG_LONG
789 int longlongflag = 0;
790#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 const char* p = f;
792 width = 0;
793 while (ISDIGIT((unsigned)*f))
794 width = (width*10) + *f++ - '0';
795 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
796 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
799 * they don't affect the amount of space we reserve.
800 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000801 if (*f == 'l') {
802 if (f[1] == 'd' || f[1] == 'u') {
803 ++f;
804 }
805#ifdef HAVE_LONG_LONG
806 else if (f[1] == 'l' &&
807 (f[2] == 'd' || f[2] == 'u')) {
808 longlongflag = 1;
809 f += 2;
810 }
811#endif
812 }
813 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000814 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
Benjamin Peterson14339b62009-01-31 16:36:08 +0000817 switch (*f) {
818 case 'c':
819 (void)va_arg(count, int);
820 /* fall through... */
821 case '%':
822 n++;
823 break;
824 case 'd': case 'u': case 'i': case 'x':
825 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000826#ifdef HAVE_LONG_LONG
827 if (longlongflag) {
828 if (width < MAX_LONG_LONG_CHARS)
829 width = MAX_LONG_LONG_CHARS;
830 }
831 else
832#endif
833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
834 including sign. Decimal takes the most space. This
835 isn't enough for octal. If a width is specified we
836 need more (which we allocate later). */
837 if (width < MAX_LONG_CHARS)
838 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000839 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000840 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 if (abuffersize < width)
842 abuffersize = width;
843 break;
844 case 's':
845 {
846 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000847 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000848 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
849 if (!str)
850 goto fail;
851 n += PyUnicode_GET_SIZE(str);
852 /* Remember the str and switch to the next slot */
853 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000854 break;
855 }
856 case 'U':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 assert(obj && PyUnicode_Check(obj));
860 n += PyUnicode_GET_SIZE(obj);
861 break;
862 }
863 case 'V':
864 {
865 PyObject *obj = va_arg(count, PyObject *);
866 const char *str = va_arg(count, const char *);
867 assert(obj || str);
868 assert(!obj || PyUnicode_Check(obj));
869 if (obj)
870 n += PyUnicode_GET_SIZE(obj);
871 else
872 n += strlen(str);
873 break;
874 }
875 case 'S':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 PyObject *str;
879 assert(obj);
880 str = PyObject_Str(obj);
881 if (!str)
882 goto fail;
883 n += PyUnicode_GET_SIZE(str);
884 /* Remember the str and switch to the next slot */
885 *callresult++ = str;
886 break;
887 }
888 case 'R':
889 {
890 PyObject *obj = va_arg(count, PyObject *);
891 PyObject *repr;
892 assert(obj);
893 repr = PyObject_Repr(obj);
894 if (!repr)
895 goto fail;
896 n += PyUnicode_GET_SIZE(repr);
897 /* Remember the repr and switch to the next slot */
898 *callresult++ = repr;
899 break;
900 }
901 case 'A':
902 {
903 PyObject *obj = va_arg(count, PyObject *);
904 PyObject *ascii;
905 assert(obj);
906 ascii = PyObject_ASCII(obj);
907 if (!ascii)
908 goto fail;
909 n += PyUnicode_GET_SIZE(ascii);
910 /* Remember the repr and switch to the next slot */
911 *callresult++ = ascii;
912 break;
913 }
914 case 'p':
915 (void) va_arg(count, int);
916 /* maximum 64-bit pointer representation:
917 * 0xffffffffffffffff
918 * so 19 characters is enough.
919 * XXX I count 18 -- what's the extra for?
920 */
921 n += 19;
922 break;
923 default:
924 /* if we stumble upon an unknown
925 formatting code, copy the rest of
926 the format string to the output
927 string. (we cannot just skip the
928 code, since there's no way to know
929 what's in the argument list) */
930 n += strlen(p);
931 goto expand;
932 }
933 } else
934 n++;
935 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000937 if (abuffersize > ITEM_BUFFER_LEN) {
938 /* add 1 for sprintf's trailing null byte */
939 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000940 if (!abuffer) {
941 PyErr_NoMemory();
942 goto fail;
943 }
944 realbuffer = abuffer;
945 }
946 else
947 realbuffer = buffer;
948 /* step 4: fill the buffer */
949 /* Since we've analyzed how much space we need for the worst case,
950 we don't have to resize the string.
951 There can be no errors beyond this point. */
952 string = PyUnicode_FromUnicode(NULL, n);
953 if (!string)
954 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000955
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 s = PyUnicode_AS_UNICODE(string);
957 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000958
Benjamin Peterson14339b62009-01-31 16:36:08 +0000959 for (f = format; *f; f++) {
960 if (*f == '%') {
961 const char* p = f++;
962 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000963 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 int size_tflag = 0;
965 zeropad = (*f == '0');
966 /* parse the width.precision part */
967 width = 0;
968 while (ISDIGIT((unsigned)*f))
969 width = (width*10) + *f++ - '0';
970 precision = 0;
971 if (*f == '.') {
972 f++;
973 while (ISDIGIT((unsigned)*f))
974 precision = (precision*10) + *f++ - '0';
975 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000976 /* Handle %ld, %lu, %lld and %llu. */
977 if (*f == 'l') {
978 if (f[1] == 'd' || f[1] == 'u') {
979 longflag = 1;
980 ++f;
981 }
982#ifdef HAVE_LONG_LONG
983 else if (f[1] == 'l' &&
984 (f[2] == 'd' || f[2] == 'u')) {
985 longlongflag = 1;
986 f += 2;
987 }
988#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 }
990 /* handle the size_t flag. */
991 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
992 size_tflag = 1;
993 ++f;
994 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 switch (*f) {
997 case 'c':
998 *s++ = va_arg(vargs, int);
999 break;
1000 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1002 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001003 if (longflag)
1004 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001005#ifdef HAVE_LONG_LONG
1006 else if (longlongflag)
1007 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1008#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001009 else if (size_tflag)
1010 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1011 else
1012 sprintf(realbuffer, fmt, va_arg(vargs, int));
1013 appendstring(realbuffer);
1014 break;
1015 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001016 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1017 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 if (longflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001020#ifdef HAVE_LONG_LONG
1021 else if (longlongflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs,
1023 unsigned PY_LONG_LONG));
1024#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001025 else if (size_tflag)
1026 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1027 else
1028 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1029 appendstring(realbuffer);
1030 break;
1031 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 sprintf(realbuffer, fmt, va_arg(vargs, int));
1034 appendstring(realbuffer);
1035 break;
1036 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001037 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 's':
1042 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001043 /* unused, since we already have the result */
1044 (void) va_arg(vargs, char *);
1045 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1046 PyUnicode_GET_SIZE(*callresult));
1047 s += PyUnicode_GET_SIZE(*callresult);
1048 /* We're done with the unicode()/repr() => forget it */
1049 Py_DECREF(*callresult);
1050 /* switch to next unicode()/repr() result */
1051 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001052 break;
1053 }
1054 case 'U':
1055 {
1056 PyObject *obj = va_arg(vargs, PyObject *);
1057 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1058 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1059 s += size;
1060 break;
1061 }
1062 case 'V':
1063 {
1064 PyObject *obj = va_arg(vargs, PyObject *);
1065 const char *str = va_arg(vargs, const char *);
1066 if (obj) {
1067 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1068 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1069 s += size;
1070 } else {
1071 appendstring(str);
1072 }
1073 break;
1074 }
1075 case 'S':
1076 case 'R':
1077 {
1078 Py_UNICODE *ucopy;
1079 Py_ssize_t usize;
1080 Py_ssize_t upos;
1081 /* unused, since we already have the result */
1082 (void) va_arg(vargs, PyObject *);
1083 ucopy = PyUnicode_AS_UNICODE(*callresult);
1084 usize = PyUnicode_GET_SIZE(*callresult);
1085 for (upos = 0; upos<usize;)
1086 *s++ = ucopy[upos++];
1087 /* We're done with the unicode()/repr() => forget it */
1088 Py_DECREF(*callresult);
1089 /* switch to next unicode()/repr() result */
1090 ++callresult;
1091 break;
1092 }
1093 case 'p':
1094 sprintf(buffer, "%p", va_arg(vargs, void*));
1095 /* %p is ill-defined: ensure leading 0x. */
1096 if (buffer[1] == 'X')
1097 buffer[1] = 'x';
1098 else if (buffer[1] != 'x') {
1099 memmove(buffer+2, buffer, strlen(buffer)+1);
1100 buffer[0] = '0';
1101 buffer[1] = 'x';
1102 }
1103 appendstring(buffer);
1104 break;
1105 case '%':
1106 *s++ = '%';
1107 break;
1108 default:
1109 appendstring(p);
1110 goto end;
1111 }
Victor Stinner1205f272010-09-11 00:54:47 +00001112 }
Victor Stinner1205f272010-09-11 00:54:47 +00001113 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001114 *s++ = *f;
1115 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 if (callresults)
1119 PyObject_Free(callresults);
1120 if (abuffer)
1121 PyObject_Free(abuffer);
1122 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1123 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 if (callresults) {
1126 PyObject **callresult2 = callresults;
1127 while (callresult2 < callresult) {
1128 Py_DECREF(*callresult2);
1129 ++callresult2;
1130 }
1131 PyObject_Free(callresults);
1132 }
1133 if (abuffer)
1134 PyObject_Free(abuffer);
1135 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001136}
1137
1138#undef appendstring
1139
1140PyObject *
1141PyUnicode_FromFormat(const char *format, ...)
1142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 PyObject* ret;
1144 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001145
1146#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001147 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001150#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 ret = PyUnicode_FromFormatV(format, vargs);
1152 va_end(vargs);
1153 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001154}
1155
Victor Stinner5593d8a2010-10-02 11:11:27 +00001156/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1157 convert a Unicode object to a wide character string.
1158
1159 - If w is NULL: return the number of wide characters (including the nul
1160 character) required to convert the unicode object. Ignore size argument.
1161
1162 - Otherwise: return the number of wide characters (excluding the nul
1163 character) written into w. Write at most size wide characters (including
1164 the nul character). */
1165static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001166unicode_aswidechar(PyUnicodeObject *unicode,
1167 wchar_t *w,
1168 Py_ssize_t size)
1169{
1170#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001171 Py_ssize_t res;
1172 if (w != NULL) {
1173 res = PyUnicode_GET_SIZE(unicode);
1174 if (size > res)
1175 size = res + 1;
1176 else
1177 res = size;
1178 memcpy(w, unicode->str, size * sizeof(wchar_t));
1179 return res;
1180 }
1181 else
1182 return PyUnicode_GET_SIZE(unicode) + 1;
1183#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1184 register const Py_UNICODE *u;
1185 const Py_UNICODE *uend;
1186 const wchar_t *worig, *wend;
1187 Py_ssize_t nchar;
1188
Victor Stinner137c34c2010-09-29 10:25:54 +00001189 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001190 uend = u + PyUnicode_GET_SIZE(unicode);
1191 if (w != NULL) {
1192 worig = w;
1193 wend = w + size;
1194 while (u != uend && w != wend) {
1195 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1196 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1197 {
1198 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1199 u += 2;
1200 }
1201 else {
1202 *w = *u;
1203 u++;
1204 }
1205 w++;
1206 }
1207 if (w != wend)
1208 *w = L'\0';
1209 return w - worig;
1210 }
1211 else {
1212 nchar = 1; /* nul character at the end */
1213 while (u != uend) {
1214 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216 u += 2;
1217 else
1218 u++;
1219 nchar++;
1220 }
1221 }
1222 return nchar;
1223#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1224 register Py_UNICODE *u, *uend, ordinal;
1225 register Py_ssize_t i;
1226 wchar_t *worig, *wend;
1227 Py_ssize_t nchar;
1228
1229 u = PyUnicode_AS_UNICODE(unicode);
1230 uend = u + PyUnicode_GET_SIZE(u);
1231 if (w != NULL) {
1232 worig = w;
1233 wend = w + size;
1234 while (u != uend && w != wend) {
1235 ordinal = *u;
1236 if (ordinal > 0xffff) {
1237 ordinal -= 0x10000;
1238 *w++ = 0xD800 | (ordinal >> 10);
1239 *w++ = 0xDC00 | (ordinal & 0x3FF);
1240 }
1241 else
1242 *w++ = ordinal;
1243 u++;
1244 }
1245 if (w != wend)
1246 *w = 0;
1247 return w - worig;
1248 }
1249 else {
1250 nchar = 1; /* nul character */
1251 while (u != uend) {
1252 if (*u > 0xffff)
1253 nchar += 2;
1254 else
1255 nchar++;
1256 u++;
1257 }
1258 return nchar;
1259 }
1260#else
1261# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001262#endif
1263}
1264
1265Py_ssize_t
1266PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1267 wchar_t *w,
1268 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269{
1270 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001271 PyErr_BadInternalCall();
1272 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00001274 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275}
1276
Victor Stinner137c34c2010-09-29 10:25:54 +00001277wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001278PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001279 Py_ssize_t *size)
1280{
1281 wchar_t* buffer;
1282 Py_ssize_t buflen;
1283
1284 if (unicode == NULL) {
1285 PyErr_BadInternalCall();
1286 return NULL;
1287 }
1288
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001289 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001290 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001291 PyErr_NoMemory();
1292 return NULL;
1293 }
1294
Victor Stinner137c34c2010-09-29 10:25:54 +00001295 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1296 if (buffer == NULL) {
1297 PyErr_NoMemory();
1298 return NULL;
1299 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001300 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001301 if (size != NULL)
1302 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001303 return buffer;
1304}
1305
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306#endif
1307
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001308PyObject *PyUnicode_FromOrdinal(int ordinal)
1309{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001310 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001311
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001312 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001313 PyErr_SetString(PyExc_ValueError,
1314 "chr() arg not in range(0x110000)");
1315 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001316 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001317
1318#ifndef Py_UNICODE_WIDE
1319 if (ordinal > 0xffff) {
1320 ordinal -= 0x10000;
1321 s[0] = 0xD800 | (ordinal >> 10);
1322 s[1] = 0xDC00 | (ordinal & 0x3FF);
1323 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001324 }
1325#endif
1326
Hye-Shik Chang40574832004-04-06 07:24:51 +00001327 s[0] = (Py_UNICODE)ordinal;
1328 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001329}
1330
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331PyObject *PyUnicode_FromObject(register PyObject *obj)
1332{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001333 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001334 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001335 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001336 Py_INCREF(obj);
1337 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001338 }
1339 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 /* For a Unicode subtype that's not a Unicode object,
1341 return a true Unicode object with the same data. */
1342 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1343 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001344 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001345 PyErr_Format(PyExc_TypeError,
1346 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001347 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001348 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001349}
1350
1351PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001352 const char *encoding,
1353 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001354{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001355 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001356 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001357
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 PyErr_BadInternalCall();
1360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001362
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001363 /* Decoding bytes objects is the most common case and should be fast */
1364 if (PyBytes_Check(obj)) {
1365 if (PyBytes_GET_SIZE(obj) == 0) {
1366 Py_INCREF(unicode_empty);
1367 v = (PyObject *) unicode_empty;
1368 }
1369 else {
1370 v = PyUnicode_Decode(
1371 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1372 encoding, errors);
1373 }
1374 return v;
1375 }
1376
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001377 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001378 PyErr_SetString(PyExc_TypeError,
1379 "decoding str is not supported");
1380 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001381 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001382
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001383 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1384 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1385 PyErr_Format(PyExc_TypeError,
1386 "coercing to str: need bytes, bytearray "
1387 "or buffer-like object, %.80s found",
1388 Py_TYPE(obj)->tp_name);
1389 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001390 }
Tim Petersced69f82003-09-16 20:30:58 +00001391
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001392 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001393 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001394 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395 }
Tim Petersced69f82003-09-16 20:30:58 +00001396 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001397 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001398
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001399 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001400 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401}
1402
/* Copy `encoding` into `lower`, converting upper case letters to lower
   case and replacing '_' with '-' (so that e.g. UTF_8 is recognised).
   `lower_len` is the size of the `lower` buffer.  Return 1 on success and
   0 if the encoding name, plus its terminating nul, does not fit. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating nul */
    char *const limit = lower + (lower_len - 1);

    for (; *src; src++, dst++) {
        if (dst == limit)
            return 0;
        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
1435
1436PyObject *PyUnicode_Decode(const char *s,
1437 Py_ssize_t size,
1438 const char *encoding,
1439 const char *errors)
1440{
1441 PyObject *buffer = NULL, *unicode;
1442 Py_buffer info;
1443 char lower[11]; /* Enough for any encoding shortcut */
1444
1445 if (encoding == NULL)
1446 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001447
1448 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001449 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1450 if (strcmp(lower, "utf-8") == 0)
1451 return PyUnicode_DecodeUTF8(s, size, errors);
1452 else if ((strcmp(lower, "latin-1") == 0) ||
1453 (strcmp(lower, "iso-8859-1") == 0))
1454 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001455#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001456 else if (strcmp(lower, "mbcs") == 0)
1457 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001458#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001459 else if (strcmp(lower, "ascii") == 0)
1460 return PyUnicode_DecodeASCII(s, size, errors);
1461 else if (strcmp(lower, "utf-16") == 0)
1462 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1463 else if (strcmp(lower, "utf-32") == 0)
1464 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466
1467 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001468 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001469 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001470 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001471 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 if (buffer == NULL)
1473 goto onError;
1474 unicode = PyCodec_Decode(buffer, encoding, errors);
1475 if (unicode == NULL)
1476 goto onError;
1477 if (!PyUnicode_Check(unicode)) {
1478 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001479 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001480 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 Py_DECREF(unicode);
1482 goto onError;
1483 }
1484 Py_DECREF(buffer);
1485 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001486
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488 Py_XDECREF(buffer);
1489 return NULL;
1490}
1491
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001492PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1493 const char *encoding,
1494 const char *errors)
1495{
1496 PyObject *v;
1497
1498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 goto onError;
1501 }
1502
1503 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001505
1506 /* Decode via the codec registry */
1507 v = PyCodec_Decode(unicode, encoding, errors);
1508 if (v == NULL)
1509 goto onError;
1510 return v;
1511
Benjamin Peterson29060642009-01-31 22:14:21 +00001512 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001513 return NULL;
1514}
1515
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001516PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1517 const char *encoding,
1518 const char *errors)
1519{
1520 PyObject *v;
1521
1522 if (!PyUnicode_Check(unicode)) {
1523 PyErr_BadArgument();
1524 goto onError;
1525 }
1526
1527 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001528 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001529
1530 /* Decode via the codec registry */
1531 v = PyCodec_Decode(unicode, encoding, errors);
1532 if (v == NULL)
1533 goto onError;
1534 if (!PyUnicode_Check(v)) {
1535 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001536 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001537 Py_TYPE(v)->tp_name);
1538 Py_DECREF(v);
1539 goto onError;
1540 }
1541 return v;
1542
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001544 return NULL;
1545}
1546
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001548 Py_ssize_t size,
1549 const char *encoding,
1550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001551{
1552 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001553
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554 unicode = PyUnicode_FromUnicode(s, size);
1555 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1558 Py_DECREF(unicode);
1559 return v;
1560}
1561
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001562PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1563 const char *encoding,
1564 const char *errors)
1565{
1566 PyObject *v;
1567
1568 if (!PyUnicode_Check(unicode)) {
1569 PyErr_BadArgument();
1570 goto onError;
1571 }
1572
1573 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001575
1576 /* Encode via the codec registry */
1577 v = PyCodec_Encode(unicode, encoding, errors);
1578 if (v == NULL)
1579 goto onError;
1580 return v;
1581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001583 return NULL;
1584}
1585
Victor Stinnerae6265f2010-05-15 16:27:27 +00001586PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1587{
Victor Stinner313a1202010-06-11 23:56:51 +00001588 if (Py_FileSystemDefaultEncoding) {
1589#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1590 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1591 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1592 PyUnicode_GET_SIZE(unicode),
1593 NULL);
1594#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001595 return PyUnicode_AsEncodedString(unicode,
1596 Py_FileSystemDefaultEncoding,
1597 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001598 }
1599 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001600 /* locale encoding with surrogateescape */
1601 wchar_t *wchar;
1602 char *bytes;
1603 PyObject *bytes_obj;
1604
1605 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1606 if (wchar == NULL)
1607 return NULL;
1608 bytes = _Py_wchar2char(wchar);
1609 PyMem_Free(wchar);
1610 if (bytes == NULL)
1611 return NULL;
1612
1613 bytes_obj = PyBytes_FromString(bytes);
1614 PyMem_Free(bytes);
1615 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001616 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00001617}
1618
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1620 const char *encoding,
1621 const char *errors)
1622{
1623 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001624 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001625
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 }
Fred Drakee4315f52000-05-09 19:53:39 +00001630
Tim Petersced69f82003-09-16 20:30:58 +00001631 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001633
1634 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001635 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1636 if (strcmp(lower, "utf-8") == 0)
1637 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1638 PyUnicode_GET_SIZE(unicode),
1639 errors);
1640 else if ((strcmp(lower, "latin-1") == 0) ||
1641 (strcmp(lower, "iso-8859-1") == 0))
1642 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1643 PyUnicode_GET_SIZE(unicode),
1644 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001645#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001646 else if (strcmp(lower, "mbcs") == 0)
1647 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1648 PyUnicode_GET_SIZE(unicode),
1649 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001650#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001651 else if (strcmp(lower, "ascii") == 0)
1652 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 errors);
1655 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001656 /* During bootstrap, we may need to find the encodings
1657 package, to load the file system encoding, and require the
1658 file system encoding in order to load the encodings
1659 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001660
Victor Stinner59e62db2010-05-15 13:14:32 +00001661 Break out of this dependency by assuming that the path to
1662 the encodings module is ASCII-only. XXX could try wcstombs
1663 instead, if the file system encoding is the locale's
1664 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001665 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001666 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1667 !PyThreadState_GET()->interp->codecs_initialized)
1668 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1669 PyUnicode_GET_SIZE(unicode),
1670 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671
1672 /* Encode via the codec registry */
1673 v = PyCodec_Encode(unicode, encoding, errors);
1674 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001675 return NULL;
1676
1677 /* The normal path */
1678 if (PyBytes_Check(v))
1679 return v;
1680
1681 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001682 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001683 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001684 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001685
1686 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1687 "encoder %s returned bytearray instead of bytes",
1688 encoding);
1689 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001690 Py_DECREF(v);
1691 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001692 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001693
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001694 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1695 Py_DECREF(v);
1696 return b;
1697 }
1698
1699 PyErr_Format(PyExc_TypeError,
1700 "encoder did not return a bytes object (type=%.400s)",
1701 Py_TYPE(v)->tp_name);
1702 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001703 return NULL;
1704}
1705
1706PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1707 const char *encoding,
1708 const char *errors)
1709{
1710 PyObject *v;
1711
1712 if (!PyUnicode_Check(unicode)) {
1713 PyErr_BadArgument();
1714 goto onError;
1715 }
1716
1717 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001718 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001719
1720 /* Encode via the codec registry */
1721 v = PyCodec_Encode(unicode, encoding, errors);
1722 if (v == NULL)
1723 goto onError;
1724 if (!PyUnicode_Check(v)) {
1725 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001726 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001727 Py_TYPE(v)->tp_name);
1728 Py_DECREF(v);
1729 goto onError;
1730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001732
Benjamin Peterson29060642009-01-31 22:14:21 +00001733 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 return NULL;
1735}
1736
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001737PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001738 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001739{
1740 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001741 if (v)
1742 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001743 if (errors != NULL)
1744 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001745 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001746 PyUnicode_GET_SIZE(unicode),
1747 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001748 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001749 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001750 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001751 return v;
1752}
1753
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001754PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001755PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001756 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001757 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1758}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001759
Christian Heimes5894ba72007-11-04 11:43:14 +00001760PyObject*
1761PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1762{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001763 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1764 can be undefined. If it is case, decode using UTF-8. The following assumes
1765 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1766 bootstrapping process where the codecs aren't ready yet.
1767 */
1768 if (Py_FileSystemDefaultEncoding) {
1769#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001770 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001771 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001772 }
1773#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001774 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001775 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001776 }
1777#endif
1778 return PyUnicode_Decode(s, size,
1779 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001780 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001781 }
1782 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001783 /* locale encoding with surrogateescape */
1784 wchar_t *wchar;
1785 PyObject *unicode;
1786
1787 if (s[size] != '\0' || size != strlen(s)) {
1788 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1789 return NULL;
1790 }
1791
1792 wchar = _Py_char2wchar(s);
1793 if (wchar == NULL)
1794 return NULL;
1795
1796 unicode = PyUnicode_FromWideChar(wchar, -1);
1797 PyMem_Free(wchar);
1798 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001799 }
1800}
1801
Martin v. Löwis011e8422009-05-05 04:43:17 +00001802
1803int
1804PyUnicode_FSConverter(PyObject* arg, void* addr)
1805{
1806 PyObject *output = NULL;
1807 Py_ssize_t size;
1808 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001809 if (arg == NULL) {
1810 Py_DECREF(*(PyObject**)addr);
1811 return 1;
1812 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001813 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001814 output = arg;
1815 Py_INCREF(output);
1816 }
1817 else {
1818 arg = PyUnicode_FromObject(arg);
1819 if (!arg)
1820 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001821 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001822 Py_DECREF(arg);
1823 if (!output)
1824 return 0;
1825 if (!PyBytes_Check(output)) {
1826 Py_DECREF(output);
1827 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1828 return 0;
1829 }
1830 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001831 size = PyBytes_GET_SIZE(output);
1832 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001833 if (size != strlen(data)) {
1834 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1835 Py_DECREF(output);
1836 return 0;
1837 }
1838 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001839 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001840}
1841
1842
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001843int
1844PyUnicode_FSDecoder(PyObject* arg, void* addr)
1845{
1846 PyObject *output = NULL;
1847 Py_ssize_t size;
1848 void *data;
1849 if (arg == NULL) {
1850 Py_DECREF(*(PyObject**)addr);
1851 return 1;
1852 }
1853 if (PyUnicode_Check(arg)) {
1854 output = arg;
1855 Py_INCREF(output);
1856 }
1857 else {
1858 arg = PyBytes_FromObject(arg);
1859 if (!arg)
1860 return 0;
1861 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1862 PyBytes_GET_SIZE(arg));
1863 Py_DECREF(arg);
1864 if (!output)
1865 return 0;
1866 if (!PyUnicode_Check(output)) {
1867 Py_DECREF(output);
1868 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1869 return 0;
1870 }
1871 }
1872 size = PyUnicode_GET_SIZE(output);
1873 data = PyUnicode_AS_UNICODE(output);
1874 if (size != Py_UNICODE_strlen(data)) {
1875 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1876 Py_DECREF(output);
1877 return 0;
1878 }
1879 *(PyObject**)addr = output;
1880 return Py_CLEANUP_SUPPORTED;
1881}
1882
1883
Martin v. Löwis5b222132007-06-10 09:51:05 +00001884char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001885_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001886{
Christian Heimesf3863112007-11-22 07:46:41 +00001887 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001888 if (!PyUnicode_Check(unicode)) {
1889 PyErr_BadArgument();
1890 return NULL;
1891 }
Christian Heimesf3863112007-11-22 07:46:41 +00001892 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1893 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001894 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001895 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001896 *psize = PyBytes_GET_SIZE(bytes);
1897 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001898}
1899
1900char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001901_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001902{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001903 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001904}
1905
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1907{
1908 if (!PyUnicode_Check(unicode)) {
1909 PyErr_BadArgument();
1910 goto onError;
1911 }
1912 return PyUnicode_AS_UNICODE(unicode);
1913
Benjamin Peterson29060642009-01-31 22:14:21 +00001914 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 return NULL;
1916}
1917
Martin v. Löwis18e16552006-02-15 17:27:45 +00001918Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919{
1920 if (!PyUnicode_Check(unicode)) {
1921 PyErr_BadArgument();
1922 goto onError;
1923 }
1924 return PyUnicode_GET_SIZE(unicode);
1925
Benjamin Peterson29060642009-01-31 22:14:21 +00001926 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927 return -1;
1928}
1929
/* Return the name of the default encoding, which is fixed to "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1934
Victor Stinner554f3f02010-06-16 23:33:54 +00001935/* create or adjust a UnicodeDecodeError */
1936static void
1937make_decode_exception(PyObject **exceptionObject,
1938 const char *encoding,
1939 const char *input, Py_ssize_t length,
1940 Py_ssize_t startpos, Py_ssize_t endpos,
1941 const char *reason)
1942{
1943 if (*exceptionObject == NULL) {
1944 *exceptionObject = PyUnicodeDecodeError_Create(
1945 encoding, input, length, startpos, endpos, reason);
1946 }
1947 else {
1948 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1949 goto onError;
1950 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1951 goto onError;
1952 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1953 goto onError;
1954 }
1955 return;
1956
1957onError:
1958 Py_DECREF(*exceptionObject);
1959 *exceptionObject = NULL;
1960}
1961
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962/* error handling callback helper:
1963 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001964 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001965 and adjust various state variables.
1966 return 0 on success, -1 on error
1967*/
1968
1969static
1970int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001971 const char *encoding, const char *reason,
1972 const char **input, const char **inend, Py_ssize_t *startinpos,
1973 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1974 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001975{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001976 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001977
1978 PyObject *restuple = NULL;
1979 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001980 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001981 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001982 Py_ssize_t requiredsize;
1983 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001984 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001985 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001986 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001987 int res = -1;
1988
1989 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001990 *errorHandler = PyCodec_LookupError(errors);
1991 if (*errorHandler == NULL)
1992 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001993 }
1994
Victor Stinner554f3f02010-06-16 23:33:54 +00001995 make_decode_exception(exceptionObject,
1996 encoding,
1997 *input, *inend - *input,
1998 *startinpos, *endinpos,
1999 reason);
2000 if (*exceptionObject == NULL)
2001 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002002
2003 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2004 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002007 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002008 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002009 }
2010 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002011 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002012
2013 /* Copy back the bytes variables, which might have been modified by the
2014 callback */
2015 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2016 if (!inputobj)
2017 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002018 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002019 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002020 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002021 *input = PyBytes_AS_STRING(inputobj);
2022 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002023 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002024 /* we can DECREF safely, as the exception has another reference,
2025 so the object won't go away. */
2026 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002027
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002029 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002030 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2032 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002033 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002034
2035 /* need more space? (at least enough for what we
2036 have+the replacement+the rest of the string (starting
2037 at the new input position), so we won't have to check space
2038 when there are no errors in the rest of the string) */
2039 repptr = PyUnicode_AS_UNICODE(repunicode);
2040 repsize = PyUnicode_GET_SIZE(repunicode);
2041 requiredsize = *outpos + repsize + insize-newpos;
2042 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002043 if (requiredsize<2*outsize)
2044 requiredsize = 2*outsize;
2045 if (_PyUnicode_Resize(output, requiredsize) < 0)
2046 goto onError;
2047 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 }
2049 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002050 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002051 Py_UNICODE_COPY(*outptr, repptr, repsize);
2052 *outptr += repsize;
2053 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002054
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002055 /* we made it! */
2056 res = 0;
2057
Benjamin Peterson29060642009-01-31 22:14:21 +00002058 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002059 Py_XDECREF(restuple);
2060 return res;
2061}
2062
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002063/* --- UTF-7 Codec -------------------------------------------------------- */
2064
Antoine Pitrou244651a2009-05-04 18:56:13 +00002065/* See RFC2152 for details. We encode conservatively and decode liberally. */
2066
/* Simple macros for the base-64 alphabet used by UTF-7.  All of them
   evaluate their argument more than once (except TO_BASE64), so pass only
   side-effect-free expressions. */

/* Is c one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ((c) >= '0' && (c) <= '9') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= 'A' && (c) <= 'Z'))

/* Base-64 value (0..63) of c.  Only meaningful if IS_BASE64(c) holds; any
   other character maps to 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes to
 * itself except '+', which begins a base64 section. */

#define DECODE_DIRECT(c)                        \
    ((c) != '+' && (c) <= 127)
2097
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The (c) < 128 && (c) > 0 guard keeps the table index in range and
 * excludes NUL.  directO and directWS are parenthesized so that compound
 * flag expressions (e.g. `a || b`) combine correctly with the && that
 * follows them.  c is evaluated several times; pass a side-effect-free
 * expression. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002143
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002144PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002145 Py_ssize_t size,
2146 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002147{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002148 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2149}
2150
Antoine Pitrou244651a2009-05-04 18:56:13 +00002151/* The decoder. The only state we preserve is our read position,
2152 * i.e. how many characters we have consumed. So if we end in the
2153 * middle of a shift sequence we have to back off the read position
2154 * and the output to the beginning of the sequence, otherwise we lose
2155 * all the shift state (seen bits, number of bits seen, high
2156 * surrogate). */
2157
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002158PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002159 Py_ssize_t size,
2160 const char *errors,
2161 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002162{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002163 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002164 Py_ssize_t startinpos;
2165 Py_ssize_t endinpos;
2166 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002167 const char *e;
2168 PyUnicodeObject *unicode;
2169 Py_UNICODE *p;
2170 const char *errmsg = "";
2171 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002172 Py_UNICODE *shiftOutStart;
2173 unsigned int base64bits = 0;
2174 unsigned long base64buffer = 0;
2175 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002176 PyObject *errorHandler = NULL;
2177 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002178
2179 unicode = _PyUnicode_New(size);
2180 if (!unicode)
2181 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002182 if (size == 0) {
2183 if (consumed)
2184 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002185 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002186 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002187
2188 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002189 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002190 e = s + size;
2191
2192 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002193 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002194 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002195 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002196
Antoine Pitrou244651a2009-05-04 18:56:13 +00002197 if (inShift) { /* in a base-64 section */
2198 if (IS_BASE64(ch)) { /* consume a base-64 character */
2199 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2200 base64bits += 6;
2201 s++;
2202 if (base64bits >= 16) {
2203 /* we have enough bits for a UTF-16 value */
2204 Py_UNICODE outCh = (Py_UNICODE)
2205 (base64buffer >> (base64bits-16));
2206 base64bits -= 16;
2207 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2208 if (surrogate) {
2209 /* expecting a second surrogate */
2210 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2211#ifdef Py_UNICODE_WIDE
2212 *p++ = (((surrogate & 0x3FF)<<10)
2213 | (outCh & 0x3FF)) + 0x10000;
2214#else
2215 *p++ = surrogate;
2216 *p++ = outCh;
2217#endif
2218 surrogate = 0;
2219 }
2220 else {
2221 surrogate = 0;
2222 errmsg = "second surrogate missing";
2223 goto utf7Error;
2224 }
2225 }
2226 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2227 /* first surrogate */
2228 surrogate = outCh;
2229 }
2230 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2231 errmsg = "unexpected second surrogate";
2232 goto utf7Error;
2233 }
2234 else {
2235 *p++ = outCh;
2236 }
2237 }
2238 }
2239 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002240 inShift = 0;
2241 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002242 if (surrogate) {
2243 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002244 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002245 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002246 if (base64bits > 0) { /* left-over bits */
2247 if (base64bits >= 6) {
2248 /* We've seen at least one base-64 character */
2249 errmsg = "partial character in shift sequence";
2250 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002251 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002252 else {
2253 /* Some bits remain; they should be zero */
2254 if (base64buffer != 0) {
2255 errmsg = "non-zero padding bits in shift sequence";
2256 goto utf7Error;
2257 }
2258 }
2259 }
2260 if (ch != '-') {
2261 /* '-' is absorbed; other terminating
2262 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002263 *p++ = ch;
2264 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002265 }
2266 }
2267 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002269 s++; /* consume '+' */
2270 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002271 s++;
2272 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002273 }
2274 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002275 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002276 shiftOutStart = p;
2277 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002278 }
2279 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002280 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002281 *p++ = ch;
2282 s++;
2283 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002284 else {
2285 startinpos = s-starts;
2286 s++;
2287 errmsg = "unexpected special character";
2288 goto utf7Error;
2289 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002290 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002291utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002292 outpos = p-PyUnicode_AS_UNICODE(unicode);
2293 endinpos = s-starts;
2294 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002295 errors, &errorHandler,
2296 "utf7", errmsg,
2297 &starts, &e, &startinpos, &endinpos, &exc, &s,
2298 &unicode, &outpos, &p))
2299 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002300 }
2301
Antoine Pitrou244651a2009-05-04 18:56:13 +00002302 /* end of string */
2303
2304 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2305 /* if we're in an inconsistent state, that's an error */
2306 if (surrogate ||
2307 (base64bits >= 6) ||
2308 (base64bits > 0 && base64buffer != 0)) {
2309 outpos = p-PyUnicode_AS_UNICODE(unicode);
2310 endinpos = size;
2311 if (unicode_decode_call_errorhandler(
2312 errors, &errorHandler,
2313 "utf7", "unterminated shift sequence",
2314 &starts, &e, &startinpos, &endinpos, &exc, &s,
2315 &unicode, &outpos, &p))
2316 goto onError;
2317 if (s < e)
2318 goto restart;
2319 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002320 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002321
2322 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002323 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002324 if (inShift) {
2325 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002326 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 }
2328 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002329 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002330 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002331 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002332
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002333 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002334 goto onError;
2335
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002336 Py_XDECREF(errorHandler);
2337 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002338 return (PyObject *)unicode;
2339
Benjamin Peterson29060642009-01-31 22:14:21 +00002340 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002341 Py_XDECREF(errorHandler);
2342 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002343 Py_DECREF(unicode);
2344 return NULL;
2345}
2346
2347
2348PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002349 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002350 int base64SetO,
2351 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002352 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002353{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002354 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002355 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002356 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002357 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002358 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002359 unsigned int base64bits = 0;
2360 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002361 char * out;
2362 char * start;
2363
2364 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002365 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002366
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002367 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002368 return PyErr_NoMemory();
2369
Antoine Pitrou244651a2009-05-04 18:56:13 +00002370 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002371 if (v == NULL)
2372 return NULL;
2373
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002374 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002375 for (;i < size; ++i) {
2376 Py_UNICODE ch = s[i];
2377
Antoine Pitrou244651a2009-05-04 18:56:13 +00002378 if (inShift) {
2379 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2380 /* shifting out */
2381 if (base64bits) { /* output remaining bits */
2382 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2383 base64buffer = 0;
2384 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002385 }
2386 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002387 /* Characters not in the BASE64 set implicitly unshift the sequence
2388 so no '-' is required, except if the character is itself a '-' */
2389 if (IS_BASE64(ch) || ch == '-') {
2390 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002391 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002392 *out++ = (char) ch;
2393 }
2394 else {
2395 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002396 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002397 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002398 else { /* not in a shift sequence */
2399 if (ch == '+') {
2400 *out++ = '+';
2401 *out++ = '-';
2402 }
2403 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2404 *out++ = (char) ch;
2405 }
2406 else {
2407 *out++ = '+';
2408 inShift = 1;
2409 goto encode_char;
2410 }
2411 }
2412 continue;
2413encode_char:
2414#ifdef Py_UNICODE_WIDE
2415 if (ch >= 0x10000) {
2416 /* code first surrogate */
2417 base64bits += 16;
2418 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2419 while (base64bits >= 6) {
2420 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2421 base64bits -= 6;
2422 }
2423 /* prepare second surrogate */
2424 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2425 }
2426#endif
2427 base64bits += 16;
2428 base64buffer = (base64buffer << 16) | ch;
2429 while (base64bits >= 6) {
2430 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2431 base64bits -= 6;
2432 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002433 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002434 if (base64bits)
2435 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2436 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002437 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002438 if (_PyBytes_Resize(&v, out - start) < 0)
2439 return NULL;
2440 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002441}
2442
Antoine Pitrou244651a2009-05-04 18:56:13 +00002443#undef IS_BASE64
2444#undef FROM_BASE64
2445#undef TO_BASE64
2446#undef DECODE_DIRECT
2447#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002448
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449/* --- UTF-8 Codec -------------------------------------------------------- */
2450
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero means the byte is not a legal lead byte.  See RFC 3629 for
   details.  The table is only ever read, so it is const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2472
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002474 Py_ssize_t size,
2475 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476{
Walter Dörwald69652032004-09-07 20:24:22 +00002477 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2478}
2479
Antoine Pitrouab868312009-01-10 15:40:25 +00002480/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2481#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2482
2483/* Mask to quickly check whether a C 'long' contains a
2484 non-ASCII, UTF8-encoded char. */
2485#if (SIZEOF_LONG == 8)
2486# define ASCII_CHAR_MASK 0x8080808080808080L
2487#elif (SIZEOF_LONG == 4)
2488# define ASCII_CHAR_MASK 0x80808080L
2489#else
2490# error C 'long' size should be either 4 or 8!
2491#endif
2492
Walter Dörwald69652032004-09-07 20:24:22 +00002493PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 Py_ssize_t size,
2495 const char *errors,
2496 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002497{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002498 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002500 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002501 Py_ssize_t startinpos;
2502 Py_ssize_t endinpos;
2503 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002504 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 PyUnicodeObject *unicode;
2506 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002507 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002508 PyObject *errorHandler = NULL;
2509 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510
2511 /* Note: size will always be longer than the resulting Unicode
2512 character count */
2513 unicode = _PyUnicode_New(size);
2514 if (!unicode)
2515 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002516 if (size == 0) {
2517 if (consumed)
2518 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521
2522 /* Unpack UTF-8 encoded data */
2523 p = unicode->str;
2524 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002525 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526
2527 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002528 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529
2530 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002531 /* Fast path for runs of ASCII characters. Given that common UTF-8
2532 input will consist of an overwhelming majority of ASCII
2533 characters, we try to optimize for this case by checking
2534 as many characters as a C 'long' can contain.
2535 First, check if we can do an aligned read, as most CPUs have
2536 a penalty for unaligned reads.
2537 */
2538 if (!((size_t) s & LONG_PTR_MASK)) {
2539 /* Help register allocation */
2540 register const char *_s = s;
2541 register Py_UNICODE *_p = p;
2542 while (_s < aligned_end) {
2543 /* Read a whole long at a time (either 4 or 8 bytes),
2544 and do a fast unrolled copy if it only contains ASCII
2545 characters. */
2546 unsigned long data = *(unsigned long *) _s;
2547 if (data & ASCII_CHAR_MASK)
2548 break;
2549 _p[0] = (unsigned char) _s[0];
2550 _p[1] = (unsigned char) _s[1];
2551 _p[2] = (unsigned char) _s[2];
2552 _p[3] = (unsigned char) _s[3];
2553#if (SIZEOF_LONG == 8)
2554 _p[4] = (unsigned char) _s[4];
2555 _p[5] = (unsigned char) _s[5];
2556 _p[6] = (unsigned char) _s[6];
2557 _p[7] = (unsigned char) _s[7];
2558#endif
2559 _s += SIZEOF_LONG;
2560 _p += SIZEOF_LONG;
2561 }
2562 s = _s;
2563 p = _p;
2564 if (s == e)
2565 break;
2566 ch = (unsigned char)*s;
2567 }
2568 }
2569
2570 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002571 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 s++;
2573 continue;
2574 }
2575
2576 n = utf8_code_length[ch];
2577
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002578 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002579 if (consumed)
2580 break;
2581 else {
2582 errmsg = "unexpected end of data";
2583 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002584 endinpos = startinpos+1;
2585 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2586 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002587 goto utf8Error;
2588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590
2591 switch (n) {
2592
2593 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002594 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002595 startinpos = s-starts;
2596 endinpos = startinpos+1;
2597 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598
2599 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002600 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002601 startinpos = s-starts;
2602 endinpos = startinpos+1;
2603 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604
2605 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002606 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002607 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002608 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002609 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002610 goto utf8Error;
2611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002613 assert ((ch > 0x007F) && (ch <= 0x07FF));
2614 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 break;
2616
2617 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002618 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2619 will result in surrogates in range d800-dfff. Surrogates are
2620 not valid UTF-8 so they are rejected.
2621 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2622 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002623 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002624 (s[2] & 0xc0) != 0x80 ||
2625 ((unsigned char)s[0] == 0xE0 &&
2626 (unsigned char)s[1] < 0xA0) ||
2627 ((unsigned char)s[0] == 0xED &&
2628 (unsigned char)s[1] > 0x9F)) {
2629 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002630 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002631 endinpos = startinpos + 1;
2632
2633 /* if s[1] first two bits are 1 and 0, then the invalid
2634 continuation byte is s[2], so increment endinpos by 1,
2635 if not, s[1] is invalid and endinpos doesn't need to
2636 be incremented. */
2637 if ((s[1] & 0xC0) == 0x80)
2638 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002639 goto utf8Error;
2640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002642 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2643 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002644 break;
2645
2646 case 4:
2647 if ((s[1] & 0xc0) != 0x80 ||
2648 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002649 (s[3] & 0xc0) != 0x80 ||
2650 ((unsigned char)s[0] == 0xF0 &&
2651 (unsigned char)s[1] < 0x90) ||
2652 ((unsigned char)s[0] == 0xF4 &&
2653 (unsigned char)s[1] > 0x8F)) {
2654 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002655 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002656 endinpos = startinpos + 1;
2657 if ((s[1] & 0xC0) == 0x80) {
2658 endinpos++;
2659 if ((s[2] & 0xC0) == 0x80)
2660 endinpos++;
2661 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002662 goto utf8Error;
2663 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002664 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002665 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2666 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2667
Fredrik Lundh8f455852001-06-27 18:59:43 +00002668#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002669 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002670#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002671 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002672
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002673 /* translate from 10000..10FFFF to 0..FFFF */
2674 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002675
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002676 /* high surrogate = top 10 bits added to D800 */
2677 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002678
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002679 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002680 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002681#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 }
2684 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002686
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 utf8Error:
2688 outpos = p-PyUnicode_AS_UNICODE(unicode);
2689 if (unicode_decode_call_errorhandler(
2690 errors, &errorHandler,
2691 "utf8", errmsg,
2692 &starts, &e, &startinpos, &endinpos, &exc, &s,
2693 &unicode, &outpos, &p))
2694 goto onError;
2695 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 }
Walter Dörwald69652032004-09-07 20:24:22 +00002697 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002698 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699
2700 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002701 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 goto onError;
2703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002704 Py_XDECREF(errorHandler);
2705 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 return (PyObject *)unicode;
2707
Benjamin Peterson29060642009-01-31 22:14:21 +00002708 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002709 Py_XDECREF(errorHandler);
2710 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 Py_DECREF(unicode);
2712 return NULL;
2713}
2714
Antoine Pitrouab868312009-01-10 15:40:25 +00002715#undef ASCII_CHAR_MASK
2716
2717
Tim Peters602f7402002-04-27 18:03:26 +00002718/* Allocation strategy: if the string is short, convert into a stack buffer
2719 and allocate exactly as much space needed at the end. Else allocate the
2720 maximum possible needed (4 result bytes per Unicode character), and return
2721 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002722*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002723PyObject *
2724PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002725 Py_ssize_t size,
2726 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727{
Tim Peters602f7402002-04-27 18:03:26 +00002728#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002729
Guido van Rossum98297ee2007-11-06 21:34:58 +00002730 Py_ssize_t i; /* index into s of next input byte */
2731 PyObject *result; /* result string object */
2732 char *p; /* next free byte in output buffer */
2733 Py_ssize_t nallocated; /* number of result bytes allocated */
2734 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002735 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002736 PyObject *errorHandler = NULL;
2737 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002738
Tim Peters602f7402002-04-27 18:03:26 +00002739 assert(s != NULL);
2740 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741
Tim Peters602f7402002-04-27 18:03:26 +00002742 if (size <= MAX_SHORT_UNICHARS) {
2743 /* Write into the stack buffer; nallocated can't overflow.
2744 * At the end, we'll allocate exactly as much heap space as it
2745 * turns out we need.
2746 */
2747 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002748 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002749 p = stackbuf;
2750 }
2751 else {
2752 /* Overallocate on the heap, and give the excess back at the end. */
2753 nallocated = size * 4;
2754 if (nallocated / 4 != size) /* overflow! */
2755 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002756 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002757 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002758 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002759 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002760 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002761
Tim Peters602f7402002-04-27 18:03:26 +00002762 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002763 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002764
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002765 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002766 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002768
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002770 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002771 *p++ = (char)(0xc0 | (ch >> 6));
2772 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002773 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002774#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002775 /* Special case: check for high and low surrogate */
2776 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2777 Py_UCS4 ch2 = s[i];
2778 /* Combine the two surrogates to form a UCS4 value */
2779 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2780 i++;
2781
2782 /* Encode UCS4 Unicode ordinals */
2783 *p++ = (char)(0xf0 | (ch >> 18));
2784 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002785 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2786 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002787 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002788#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002789 Py_ssize_t newpos;
2790 PyObject *rep;
2791 Py_ssize_t repsize, k;
2792 rep = unicode_encode_call_errorhandler
2793 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2794 s, size, &exc, i-1, i, &newpos);
2795 if (!rep)
2796 goto error;
2797
2798 if (PyBytes_Check(rep))
2799 repsize = PyBytes_GET_SIZE(rep);
2800 else
2801 repsize = PyUnicode_GET_SIZE(rep);
2802
2803 if (repsize > 4) {
2804 Py_ssize_t offset;
2805
2806 if (result == NULL)
2807 offset = p - stackbuf;
2808 else
2809 offset = p - PyBytes_AS_STRING(result);
2810
2811 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2812 /* integer overflow */
2813 PyErr_NoMemory();
2814 goto error;
2815 }
2816 nallocated += repsize - 4;
2817 if (result != NULL) {
2818 if (_PyBytes_Resize(&result, nallocated) < 0)
2819 goto error;
2820 } else {
2821 result = PyBytes_FromStringAndSize(NULL, nallocated);
2822 if (result == NULL)
2823 goto error;
2824 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2825 }
2826 p = PyBytes_AS_STRING(result) + offset;
2827 }
2828
2829 if (PyBytes_Check(rep)) {
2830 char *prep = PyBytes_AS_STRING(rep);
2831 for(k = repsize; k > 0; k--)
2832 *p++ = *prep++;
2833 } else /* rep is unicode */ {
2834 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2835 Py_UNICODE c;
2836
2837 for(k=0; k<repsize; k++) {
2838 c = prep[k];
2839 if (0x80 <= c) {
2840 raise_encode_exception(&exc, "utf-8", s, size,
2841 i-1, i, "surrogates not allowed");
2842 goto error;
2843 }
2844 *p++ = (char)prep[k];
2845 }
2846 }
2847 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002848#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002849 }
Victor Stinner445a6232010-04-22 20:01:57 +00002850#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002851 } else if (ch < 0x10000) {
2852 *p++ = (char)(0xe0 | (ch >> 12));
2853 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2854 *p++ = (char)(0x80 | (ch & 0x3f));
2855 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002856 /* Encode UCS4 Unicode ordinals */
2857 *p++ = (char)(0xf0 | (ch >> 18));
2858 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2859 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2860 *p++ = (char)(0x80 | (ch & 0x3f));
2861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002863
Guido van Rossum98297ee2007-11-06 21:34:58 +00002864 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002865 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002866 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002867 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002868 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002869 }
2870 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002871 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002872 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002873 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002874 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002875 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002876 Py_XDECREF(errorHandler);
2877 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002878 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002879 error:
2880 Py_XDECREF(errorHandler);
2881 Py_XDECREF(exc);
2882 Py_XDECREF(result);
2883 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002884
Tim Peters602f7402002-04-27 18:03:26 +00002885#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886}
2887
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2889{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890 if (!PyUnicode_Check(unicode)) {
2891 PyErr_BadArgument();
2892 return NULL;
2893 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002894 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002895 PyUnicode_GET_SIZE(unicode),
2896 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897}
2898
Walter Dörwald41980ca2007-08-16 21:55:45 +00002899/* --- UTF-32 Codec ------------------------------------------------------- */
2900
2901PyObject *
2902PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002903 Py_ssize_t size,
2904 const char *errors,
2905 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002906{
2907 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2908}
2909
2910PyObject *
2911PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 Py_ssize_t size,
2913 const char *errors,
2914 int *byteorder,
2915 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002916{
2917 const char *starts = s;
2918 Py_ssize_t startinpos;
2919 Py_ssize_t endinpos;
2920 Py_ssize_t outpos;
2921 PyUnicodeObject *unicode;
2922 Py_UNICODE *p;
2923#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002924 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002925 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002926#else
2927 const int pairs = 0;
2928#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002929 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002930 int bo = 0; /* assume native ordering by default */
2931 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002932 /* Offsets from q for retrieving bytes in the right order. */
2933#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2934 int iorder[] = {0, 1, 2, 3};
2935#else
2936 int iorder[] = {3, 2, 1, 0};
2937#endif
2938 PyObject *errorHandler = NULL;
2939 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002940
Walter Dörwald41980ca2007-08-16 21:55:45 +00002941 q = (unsigned char *)s;
2942 e = q + size;
2943
2944 if (byteorder)
2945 bo = *byteorder;
2946
2947 /* Check for BOM marks (U+FEFF) in the input and adjust current
2948 byte order setting accordingly. In native mode, the leading BOM
2949 mark is skipped, in all other modes, it is copied to the output
2950 stream as-is (giving a ZWNBSP character). */
2951 if (bo == 0) {
2952 if (size >= 4) {
2953 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002954 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002955#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002956 if (bom == 0x0000FEFF) {
2957 q += 4;
2958 bo = -1;
2959 }
2960 else if (bom == 0xFFFE0000) {
2961 q += 4;
2962 bo = 1;
2963 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002964#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002965 if (bom == 0x0000FEFF) {
2966 q += 4;
2967 bo = 1;
2968 }
2969 else if (bom == 0xFFFE0000) {
2970 q += 4;
2971 bo = -1;
2972 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002973#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002975 }
2976
2977 if (bo == -1) {
2978 /* force LE */
2979 iorder[0] = 0;
2980 iorder[1] = 1;
2981 iorder[2] = 2;
2982 iorder[3] = 3;
2983 }
2984 else if (bo == 1) {
2985 /* force BE */
2986 iorder[0] = 3;
2987 iorder[1] = 2;
2988 iorder[2] = 1;
2989 iorder[3] = 0;
2990 }
2991
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002992 /* On narrow builds we split characters outside the BMP into two
2993 codepoints => count how much extra space we need. */
2994#ifndef Py_UNICODE_WIDE
2995 for (qq = q; qq < e; qq += 4)
2996 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2997 pairs++;
2998#endif
2999
3000 /* This might be one to much, because of a BOM */
3001 unicode = _PyUnicode_New((size+3)/4+pairs);
3002 if (!unicode)
3003 return NULL;
3004 if (size == 0)
3005 return (PyObject *)unicode;
3006
3007 /* Unpack UTF-32 encoded data */
3008 p = unicode->str;
3009
Walter Dörwald41980ca2007-08-16 21:55:45 +00003010 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 Py_UCS4 ch;
3012 /* remaining bytes at the end? (size should be divisible by 4) */
3013 if (e-q<4) {
3014 if (consumed)
3015 break;
3016 errmsg = "truncated data";
3017 startinpos = ((const char *)q)-starts;
3018 endinpos = ((const char *)e)-starts;
3019 goto utf32Error;
3020 /* The remaining input chars are ignored if the callback
3021 chooses to skip the input */
3022 }
3023 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3024 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003025
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 if (ch >= 0x110000)
3027 {
3028 errmsg = "codepoint not in range(0x110000)";
3029 startinpos = ((const char *)q)-starts;
3030 endinpos = startinpos+4;
3031 goto utf32Error;
3032 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003033#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003034 if (ch >= 0x10000)
3035 {
3036 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3037 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3038 }
3039 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003040#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 *p++ = ch;
3042 q += 4;
3043 continue;
3044 utf32Error:
3045 outpos = p-PyUnicode_AS_UNICODE(unicode);
3046 if (unicode_decode_call_errorhandler(
3047 errors, &errorHandler,
3048 "utf32", errmsg,
3049 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3050 &unicode, &outpos, &p))
3051 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003052 }
3053
3054 if (byteorder)
3055 *byteorder = bo;
3056
3057 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003059
3060 /* Adjust length */
3061 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3062 goto onError;
3063
3064 Py_XDECREF(errorHandler);
3065 Py_XDECREF(exc);
3066 return (PyObject *)unicode;
3067
Benjamin Peterson29060642009-01-31 22:14:21 +00003068 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003069 Py_DECREF(unicode);
3070 Py_XDECREF(errorHandler);
3071 Py_XDECREF(exc);
3072 return NULL;
3073}
3074
3075PyObject *
3076PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 Py_ssize_t size,
3078 const char *errors,
3079 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003080{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003081 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003082 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003083 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003084#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003085 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003086#else
3087 const int pairs = 0;
3088#endif
3089 /* Offsets from p for storing byte pairs in the right order. */
3090#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3091 int iorder[] = {0, 1, 2, 3};
3092#else
3093 int iorder[] = {3, 2, 1, 0};
3094#endif
3095
Benjamin Peterson29060642009-01-31 22:14:21 +00003096#define STORECHAR(CH) \
3097 do { \
3098 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3099 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3100 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3101 p[iorder[0]] = (CH) & 0xff; \
3102 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003103 } while(0)
3104
3105 /* In narrow builds we can output surrogate pairs as one codepoint,
3106 so we need less space. */
3107#ifndef Py_UNICODE_WIDE
3108 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003109 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3110 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3111 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003112#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003113 nsize = (size - pairs + (byteorder == 0));
3114 bytesize = nsize * 4;
3115 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003116 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003117 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003118 if (v == NULL)
3119 return NULL;
3120
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003121 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003122 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003123 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003124 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003125 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003126
3127 if (byteorder == -1) {
3128 /* force LE */
3129 iorder[0] = 0;
3130 iorder[1] = 1;
3131 iorder[2] = 2;
3132 iorder[3] = 3;
3133 }
3134 else if (byteorder == 1) {
3135 /* force BE */
3136 iorder[0] = 3;
3137 iorder[1] = 2;
3138 iorder[2] = 1;
3139 iorder[3] = 0;
3140 }
3141
3142 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003143 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003144#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3146 Py_UCS4 ch2 = *s;
3147 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3148 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3149 s++;
3150 size--;
3151 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003152 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003153#endif
3154 STORECHAR(ch);
3155 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003156
3157 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003158 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003159#undef STORECHAR
3160}
3161
3162PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3163{
3164 if (!PyUnicode_Check(unicode)) {
3165 PyErr_BadArgument();
3166 return NULL;
3167 }
3168 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 PyUnicode_GET_SIZE(unicode),
3170 NULL,
3171 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003172}
3173
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174/* --- UTF-16 Codec ------------------------------------------------------- */
3175
Tim Peters772747b2001-08-09 22:21:55 +00003176PyObject *
3177PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 Py_ssize_t size,
3179 const char *errors,
3180 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181{
Walter Dörwald69652032004-09-07 20:24:22 +00003182 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3183}
3184
Antoine Pitrouab868312009-01-10 15:40:25 +00003185/* Two masks for fast checking of whether a C 'long' may contain
3186 UTF16-encoded surrogate characters. This is an efficient heuristic,
3187 assuming that non-surrogate characters with a code point >= 0x8000 are
3188 rare in most input.
3189 FAST_CHAR_MASK is used when the input is in native byte ordering,
3190 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003191*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003192#if (SIZEOF_LONG == 8)
3193# define FAST_CHAR_MASK 0x8000800080008000L
3194# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3195#elif (SIZEOF_LONG == 4)
3196# define FAST_CHAR_MASK 0x80008000L
3197# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3198#else
3199# error C 'long' size should be either 4 or 8!
3200#endif
3201
Walter Dörwald69652032004-09-07 20:24:22 +00003202PyObject *
3203PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003204 Py_ssize_t size,
3205 const char *errors,
3206 int *byteorder,
3207 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003208{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003209 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003210 Py_ssize_t startinpos;
3211 Py_ssize_t endinpos;
3212 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 PyUnicodeObject *unicode;
3214 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003215 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003216 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003217 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003218 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003219 /* Offsets from q for retrieving byte pairs in the right order. */
3220#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3221 int ihi = 1, ilo = 0;
3222#else
3223 int ihi = 0, ilo = 1;
3224#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003225 PyObject *errorHandler = NULL;
3226 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227
3228 /* Note: size will always be longer than the resulting Unicode
3229 character count */
3230 unicode = _PyUnicode_New(size);
3231 if (!unicode)
3232 return NULL;
3233 if (size == 0)
3234 return (PyObject *)unicode;
3235
3236 /* Unpack UTF-16 encoded data */
3237 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003238 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003239 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240
3241 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003242 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003244 /* Check for BOM marks (U+FEFF) in the input and adjust current
3245 byte order setting accordingly. In native mode, the leading BOM
3246 mark is skipped, in all other modes, it is copied to the output
3247 stream as-is (giving a ZWNBSP character). */
3248 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003249 if (size >= 2) {
3250 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003251#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003252 if (bom == 0xFEFF) {
3253 q += 2;
3254 bo = -1;
3255 }
3256 else if (bom == 0xFFFE) {
3257 q += 2;
3258 bo = 1;
3259 }
Tim Petersced69f82003-09-16 20:30:58 +00003260#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 if (bom == 0xFEFF) {
3262 q += 2;
3263 bo = 1;
3264 }
3265 else if (bom == 0xFFFE) {
3266 q += 2;
3267 bo = -1;
3268 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003269#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272
Tim Peters772747b2001-08-09 22:21:55 +00003273 if (bo == -1) {
3274 /* force LE */
3275 ihi = 1;
3276 ilo = 0;
3277 }
3278 else if (bo == 1) {
3279 /* force BE */
3280 ihi = 0;
3281 ilo = 1;
3282 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003283#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3284 native_ordering = ilo < ihi;
3285#else
3286 native_ordering = ilo > ihi;
3287#endif
Tim Peters772747b2001-08-09 22:21:55 +00003288
Antoine Pitrouab868312009-01-10 15:40:25 +00003289 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003290 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003292 /* First check for possible aligned read of a C 'long'. Unaligned
3293 reads are more expensive, better to defer to another iteration. */
3294 if (!((size_t) q & LONG_PTR_MASK)) {
3295 /* Fast path for runs of non-surrogate chars. */
3296 register const unsigned char *_q = q;
3297 Py_UNICODE *_p = p;
3298 if (native_ordering) {
3299 /* Native ordering is simple: as long as the input cannot
3300 possibly contain a surrogate char, do an unrolled copy
3301 of several 16-bit code points to the target object.
3302 The non-surrogate check is done on several input bytes
3303 at a time (as many as a C 'long' can contain). */
3304 while (_q < aligned_end) {
3305 unsigned long data = * (unsigned long *) _q;
3306 if (data & FAST_CHAR_MASK)
3307 break;
3308 _p[0] = ((unsigned short *) _q)[0];
3309 _p[1] = ((unsigned short *) _q)[1];
3310#if (SIZEOF_LONG == 8)
3311 _p[2] = ((unsigned short *) _q)[2];
3312 _p[3] = ((unsigned short *) _q)[3];
3313#endif
3314 _q += SIZEOF_LONG;
3315 _p += SIZEOF_LONG / 2;
3316 }
3317 }
3318 else {
3319 /* Byteswapped ordering is similar, but we must decompose
3320 the copy bytewise, and take care of zero'ing out the
3321 upper bytes if the target object is in 32-bit units
3322 (that is, in UCS-4 builds). */
3323 while (_q < aligned_end) {
3324 unsigned long data = * (unsigned long *) _q;
3325 if (data & SWAPPED_FAST_CHAR_MASK)
3326 break;
3327 /* Zero upper bytes in UCS-4 builds */
3328#if (Py_UNICODE_SIZE > 2)
3329 _p[0] = 0;
3330 _p[1] = 0;
3331#if (SIZEOF_LONG == 8)
3332 _p[2] = 0;
3333 _p[3] = 0;
3334#endif
3335#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003336 /* Issue #4916; UCS-4 builds on big endian machines must
3337 fill the two last bytes of each 4-byte unit. */
3338#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3339# define OFF 2
3340#else
3341# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003342#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003343 ((unsigned char *) _p)[OFF + 1] = _q[0];
3344 ((unsigned char *) _p)[OFF + 0] = _q[1];
3345 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3346 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3347#if (SIZEOF_LONG == 8)
3348 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3349 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3350 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3351 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3352#endif
3353#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003354 _q += SIZEOF_LONG;
3355 _p += SIZEOF_LONG / 2;
3356 }
3357 }
3358 p = _p;
3359 q = _q;
3360 if (q >= e)
3361 break;
3362 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003364
Benjamin Peterson14339b62009-01-31 16:36:08 +00003365 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003366
3367 if (ch < 0xD800 || ch > 0xDFFF) {
3368 *p++ = ch;
3369 continue;
3370 }
3371
3372 /* UTF-16 code pair: */
3373 if (q > e) {
3374 errmsg = "unexpected end of data";
3375 startinpos = (((const char *)q) - 2) - starts;
3376 endinpos = ((const char *)e) + 1 - starts;
3377 goto utf16Error;
3378 }
3379 if (0xD800 <= ch && ch <= 0xDBFF) {
3380 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3381 q += 2;
3382 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003383#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 *p++ = ch;
3385 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003386#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003387 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003388#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003389 continue;
3390 }
3391 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003392 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 startinpos = (((const char *)q)-4)-starts;
3394 endinpos = startinpos+2;
3395 goto utf16Error;
3396 }
3397
Benjamin Peterson14339b62009-01-31 16:36:08 +00003398 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003399 errmsg = "illegal encoding";
3400 startinpos = (((const char *)q)-2)-starts;
3401 endinpos = startinpos+2;
3402 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003403
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 utf16Error:
3405 outpos = p - PyUnicode_AS_UNICODE(unicode);
3406 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003407 errors,
3408 &errorHandler,
3409 "utf16", errmsg,
3410 &starts,
3411 (const char **)&e,
3412 &startinpos,
3413 &endinpos,
3414 &exc,
3415 (const char **)&q,
3416 &unicode,
3417 &outpos,
3418 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003421 /* remaining byte at the end? (size should be even) */
3422 if (e == q) {
3423 if (!consumed) {
3424 errmsg = "truncated data";
3425 startinpos = ((const char *)q) - starts;
3426 endinpos = ((const char *)e) + 1 - starts;
3427 outpos = p - PyUnicode_AS_UNICODE(unicode);
3428 if (unicode_decode_call_errorhandler(
3429 errors,
3430 &errorHandler,
3431 "utf16", errmsg,
3432 &starts,
3433 (const char **)&e,
3434 &startinpos,
3435 &endinpos,
3436 &exc,
3437 (const char **)&q,
3438 &unicode,
3439 &outpos,
3440 &p))
3441 goto onError;
3442 /* The remaining input chars are ignored if the callback
3443 chooses to skip the input */
3444 }
3445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446
3447 if (byteorder)
3448 *byteorder = bo;
3449
Walter Dörwald69652032004-09-07 20:24:22 +00003450 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003451 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003452
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003454 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 goto onError;
3456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 Py_XDECREF(errorHandler);
3458 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 return (PyObject *)unicode;
3460
Benjamin Peterson29060642009-01-31 22:14:21 +00003461 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003463 Py_XDECREF(errorHandler);
3464 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 return NULL;
3466}
3467
Antoine Pitrouab868312009-01-10 15:40:25 +00003468#undef FAST_CHAR_MASK
3469#undef SWAPPED_FAST_CHAR_MASK
3470
Tim Peters772747b2001-08-09 22:21:55 +00003471PyObject *
3472PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 Py_ssize_t size,
3474 const char *errors,
3475 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003477 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003478 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003479 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003480#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003481 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003482#else
3483 const int pairs = 0;
3484#endif
Tim Peters772747b2001-08-09 22:21:55 +00003485 /* Offsets from p for storing byte pairs in the right order. */
3486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3487 int ihi = 1, ilo = 0;
3488#else
3489 int ihi = 0, ilo = 1;
3490#endif
3491
Benjamin Peterson29060642009-01-31 22:14:21 +00003492#define STORECHAR(CH) \
3493 do { \
3494 p[ihi] = ((CH) >> 8) & 0xff; \
3495 p[ilo] = (CH) & 0xff; \
3496 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003497 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003499#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003500 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003501 if (s[i] >= 0x10000)
3502 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003503#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003504 /* 2 * (size + pairs + (byteorder == 0)) */
3505 if (size > PY_SSIZE_T_MAX ||
3506 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003507 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003508 nsize = size + pairs + (byteorder == 0);
3509 bytesize = nsize * 2;
3510 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003511 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003512 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513 if (v == NULL)
3514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003516 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003518 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003519 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003520 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003521
3522 if (byteorder == -1) {
3523 /* force LE */
3524 ihi = 1;
3525 ilo = 0;
3526 }
3527 else if (byteorder == 1) {
3528 /* force BE */
3529 ihi = 0;
3530 ilo = 1;
3531 }
3532
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003533 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003534 Py_UNICODE ch = *s++;
3535 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003536#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 if (ch >= 0x10000) {
3538 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3539 ch = 0xD800 | ((ch-0x10000) >> 10);
3540 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003541#endif
Tim Peters772747b2001-08-09 22:21:55 +00003542 STORECHAR(ch);
3543 if (ch2)
3544 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003545 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003546
3547 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003548 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003549#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550}
3551
3552PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3553{
3554 if (!PyUnicode_Check(unicode)) {
3555 PyErr_BadArgument();
3556 return NULL;
3557 }
3558 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003559 PyUnicode_GET_SIZE(unicode),
3560 NULL,
3561 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562}
3563
3564/* --- Unicode Escape Codec ----------------------------------------------- */
3565
Fredrik Lundh06d12682001-01-24 07:59:11 +00003566static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003567
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 Py_ssize_t size,
3570 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003573 Py_ssize_t startinpos;
3574 Py_ssize_t endinpos;
3575 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003580 char* message;
3581 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 PyObject *errorHandler = NULL;
3583 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003584
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585 /* Escaped strings will always be longer than the resulting
3586 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 length after conversion to the true value.
3588 (but if the error callback returns a long replacement string
3589 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 v = _PyUnicode_New(size);
3591 if (v == NULL)
3592 goto onError;
3593 if (size == 0)
3594 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003598
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 while (s < end) {
3600 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003601 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603
3604 /* Non-escape characters are interpreted as Unicode ordinals */
3605 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003606 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 continue;
3608 }
3609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 /* \ - Escapes */
3612 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003613 c = *s++;
3614 if (s > end)
3615 c = '\0'; /* Invalid after \ */
3616 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617
Benjamin Peterson29060642009-01-31 22:14:21 +00003618 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 case '\n': break;
3620 case '\\': *p++ = '\\'; break;
3621 case '\'': *p++ = '\''; break;
3622 case '\"': *p++ = '\"'; break;
3623 case 'b': *p++ = '\b'; break;
3624 case 'f': *p++ = '\014'; break; /* FF */
3625 case 't': *p++ = '\t'; break;
3626 case 'n': *p++ = '\n'; break;
3627 case 'r': *p++ = '\r'; break;
3628 case 'v': *p++ = '\013'; break; /* VT */
3629 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3630
Benjamin Peterson29060642009-01-31 22:14:21 +00003631 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632 case '0': case '1': case '2': case '3':
3633 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003634 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003635 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003636 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003637 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003638 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003640 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 break;
3642
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 /* hex escapes */
3644 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003646 digits = 2;
3647 message = "truncated \\xXX escape";
3648 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003652 digits = 4;
3653 message = "truncated \\uXXXX escape";
3654 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003657 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003658 digits = 8;
3659 message = "truncated \\UXXXXXXXX escape";
3660 hexescape:
3661 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662 outpos = p-PyUnicode_AS_UNICODE(v);
3663 if (s+digits>end) {
3664 endinpos = size;
3665 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 errors, &errorHandler,
3667 "unicodeescape", "end of string in escape sequence",
3668 &starts, &end, &startinpos, &endinpos, &exc, &s,
3669 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 goto onError;
3671 goto nextByte;
3672 }
3673 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003674 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003675 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 endinpos = (s+i+1)-starts;
3677 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 errors, &errorHandler,
3679 "unicodeescape", message,
3680 &starts, &end, &startinpos, &endinpos, &exc, &s,
3681 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003682 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003684 }
3685 chr = (chr<<4) & ~0xF;
3686 if (c >= '0' && c <= '9')
3687 chr += c - '0';
3688 else if (c >= 'a' && c <= 'f')
3689 chr += 10 + c - 'a';
3690 else
3691 chr += 10 + c - 'A';
3692 }
3693 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003694 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 /* _decoding_error will have already written into the
3696 target buffer. */
3697 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003698 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003699 /* when we get here, chr is a 32-bit unicode character */
3700 if (chr <= 0xffff)
3701 /* UCS-2 character */
3702 *p++ = (Py_UNICODE) chr;
3703 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003704 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003705 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003706#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003707 *p++ = chr;
3708#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003709 chr -= 0x10000L;
3710 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003711 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003712#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003713 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714 endinpos = s-starts;
3715 outpos = p-PyUnicode_AS_UNICODE(v);
3716 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 errors, &errorHandler,
3718 "unicodeescape", "illegal Unicode character",
3719 &starts, &end, &startinpos, &endinpos, &exc, &s,
3720 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003721 goto onError;
3722 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003723 break;
3724
Benjamin Peterson29060642009-01-31 22:14:21 +00003725 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003726 case 'N':
3727 message = "malformed \\N character escape";
3728 if (ucnhash_CAPI == NULL) {
3729 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003730 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003731 if (ucnhash_CAPI == NULL)
3732 goto ucnhashError;
3733 }
3734 if (*s == '{') {
3735 const char *start = s+1;
3736 /* look for the closing brace */
3737 while (*s != '}' && s < end)
3738 s++;
3739 if (s > start && s < end && *s == '}') {
3740 /* found a name. look it up in the unicode database */
3741 message = "unknown Unicode character name";
3742 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003743 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003744 goto store;
3745 }
3746 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 endinpos = s-starts;
3748 outpos = p-PyUnicode_AS_UNICODE(v);
3749 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003750 errors, &errorHandler,
3751 "unicodeescape", message,
3752 &starts, &end, &startinpos, &endinpos, &exc, &s,
3753 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003754 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003755 break;
3756
3757 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003758 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 message = "\\ at end of string";
3760 s--;
3761 endinpos = s-starts;
3762 outpos = p-PyUnicode_AS_UNICODE(v);
3763 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 errors, &errorHandler,
3765 "unicodeescape", message,
3766 &starts, &end, &startinpos, &endinpos, &exc, &s,
3767 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003768 goto onError;
3769 }
3770 else {
3771 *p++ = '\\';
3772 *p++ = (unsigned char)s[-1];
3773 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003774 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003776 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003779 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003780 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003781 Py_XDECREF(errorHandler);
3782 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003784
Benjamin Peterson29060642009-01-31 22:14:21 +00003785 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003786 PyErr_SetString(
3787 PyExc_UnicodeError,
3788 "\\N escapes not supported (can't load unicodedata module)"
3789 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003790 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 Py_XDECREF(errorHandler);
3792 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003793 return NULL;
3794
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 Py_XDECREF(errorHandler);
3798 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799 return NULL;
3800}
3801
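/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment originally continued: "If quotes is true, the string
   is enclosed in u"" or u'' quotes as appropriate."  The encoder defined
   further down, PyUnicode_EncodeUnicodeEscape(), takes no `quotes`
   argument and writes no surrounding quotes, so that sentence does not
   describe it.

*/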
3808
Thomas Wouters477c8d52006-05-27 19:21:47 +00003809Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003810 Py_ssize_t size,
3811 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003812{
3813 /* like wcschr, but doesn't stop at NULL characters */
3814
3815 while (size-- > 0) {
3816 if (*s == ch)
3817 return s;
3818 s++;
3819 }
3820
3821 return NULL;
3822}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003823
/* Lowercase hexadecimal digits, indexed by nibble value (0-15).  The
   escape encoders in this file use it to format the digits of \x, \u and
   \U escapes. */
static const char *hexdigits = "0123456789abcdef";
3825
3826PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003827 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003829 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003832#ifdef Py_UNICODE_WIDE
3833 const Py_ssize_t expandsize = 10;
3834#else
3835 const Py_ssize_t expandsize = 6;
3836#endif
3837
Thomas Wouters89f507f2006-12-13 04:49:30 +00003838 /* XXX(nnorwitz): rather than over-allocating, it would be
3839 better to choose a different scheme. Perhaps scan the
3840 first N-chars of the string and allocate based on that size.
3841 */
3842 /* Initial allocation is based on the longest-possible unichr
3843 escape.
3844
3845 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3846 unichr, so in this case it's the longest unichr escape. In
3847 narrow (UTF-16) builds this is five chars per source unichr
3848 since there are two unichrs in the surrogate pair, so in narrow
3849 (UTF-16) builds it's not the longest unichr escape.
3850
3851 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3852 so in the narrow (UTF-16) build case it's the longest unichr
3853 escape.
3854 */
3855
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003856 if (size == 0)
3857 return PyBytes_FromStringAndSize(NULL, 0);
3858
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003859 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003860 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003861
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003862 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 2
3864 + expandsize*size
3865 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 if (repr == NULL)
3867 return NULL;
3868
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003869 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 while (size-- > 0) {
3872 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003873
Walter Dörwald79e913e2007-05-12 11:08:06 +00003874 /* Escape backslashes */
3875 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 *p++ = '\\';
3877 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003878 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003879 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003880
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003881#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003882 /* Map 21-bit characters to '\U00xxxxxx' */
3883 else if (ch >= 0x10000) {
3884 *p++ = '\\';
3885 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003886 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3887 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3888 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3889 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3890 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3891 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3892 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3893 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003895 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003896#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003897 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3898 else if (ch >= 0xD800 && ch < 0xDC00) {
3899 Py_UNICODE ch2;
3900 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003901
Benjamin Peterson29060642009-01-31 22:14:21 +00003902 ch2 = *s++;
3903 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003904 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003905 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3906 *p++ = '\\';
3907 *p++ = 'U';
3908 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3909 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3910 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3911 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3912 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3913 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3914 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3915 *p++ = hexdigits[ucs & 0x0000000F];
3916 continue;
3917 }
3918 /* Fall through: isolated surrogates are copied as-is */
3919 s--;
3920 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003921 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003922#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003923
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003925 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 *p++ = '\\';
3927 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003928 *p++ = hexdigits[(ch >> 12) & 0x000F];
3929 *p++ = hexdigits[(ch >> 8) & 0x000F];
3930 *p++ = hexdigits[(ch >> 4) & 0x000F];
3931 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003932 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003933
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003934 /* Map special whitespace to '\t', \n', '\r' */
3935 else if (ch == '\t') {
3936 *p++ = '\\';
3937 *p++ = 't';
3938 }
3939 else if (ch == '\n') {
3940 *p++ = '\\';
3941 *p++ = 'n';
3942 }
3943 else if (ch == '\r') {
3944 *p++ = '\\';
3945 *p++ = 'r';
3946 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003947
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003948 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003949 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003951 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003952 *p++ = hexdigits[(ch >> 4) & 0x000F];
3953 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003954 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003955
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956 /* Copy everything else as-is */
3957 else
3958 *p++ = (char) ch;
3959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003961 assert(p - PyBytes_AS_STRING(repr) > 0);
3962 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3963 return NULL;
3964 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965}
3966
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003967PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003969 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970 if (!PyUnicode_Check(unicode)) {
3971 PyErr_BadArgument();
3972 return NULL;
3973 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003974 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3975 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003976 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977}
3978
3979/* --- Raw Unicode Escape Codec ------------------------------------------- */
3980
3981PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 Py_ssize_t size,
3983 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003986 Py_ssize_t startinpos;
3987 Py_ssize_t endinpos;
3988 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 const char *end;
3992 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993 PyObject *errorHandler = NULL;
3994 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003995
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 /* Escaped strings will always be longer than the resulting
3997 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998 length after conversion to the true value. (But decoding error
3999 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 v = _PyUnicode_New(size);
4001 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 end = s + size;
4007 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 unsigned char c;
4009 Py_UCS4 x;
4010 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004011 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012
Benjamin Peterson29060642009-01-31 22:14:21 +00004013 /* Non-escape characters are interpreted as Unicode ordinals */
4014 if (*s != '\\') {
4015 *p++ = (unsigned char)*s++;
4016 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004017 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004018 startinpos = s-starts;
4019
4020 /* \u-escapes are only interpreted iff the number of leading
4021 backslashes if odd */
4022 bs = s;
4023 for (;s < end;) {
4024 if (*s != '\\')
4025 break;
4026 *p++ = (unsigned char)*s++;
4027 }
4028 if (((s - bs) & 1) == 0 ||
4029 s >= end ||
4030 (*s != 'u' && *s != 'U')) {
4031 continue;
4032 }
4033 p--;
4034 count = *s=='u' ? 4 : 8;
4035 s++;
4036
4037 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4038 outpos = p-PyUnicode_AS_UNICODE(v);
4039 for (x = 0, i = 0; i < count; ++i, ++s) {
4040 c = (unsigned char)*s;
4041 if (!ISXDIGIT(c)) {
4042 endinpos = s-starts;
4043 if (unicode_decode_call_errorhandler(
4044 errors, &errorHandler,
4045 "rawunicodeescape", "truncated \\uXXXX",
4046 &starts, &end, &startinpos, &endinpos, &exc, &s,
4047 &v, &outpos, &p))
4048 goto onError;
4049 goto nextByte;
4050 }
4051 x = (x<<4) & ~0xF;
4052 if (c >= '0' && c <= '9')
4053 x += c - '0';
4054 else if (c >= 'a' && c <= 'f')
4055 x += 10 + c - 'a';
4056 else
4057 x += 10 + c - 'A';
4058 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004059 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 /* UCS-2 character */
4061 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004062 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 /* UCS-4 character. Either store directly, or as
4064 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004065#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004067#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004068 x -= 0x10000L;
4069 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4070 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004071#endif
4072 } else {
4073 endinpos = s-starts;
4074 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004075 if (unicode_decode_call_errorhandler(
4076 errors, &errorHandler,
4077 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 &starts, &end, &startinpos, &endinpos, &exc, &s,
4079 &v, &outpos, &p))
4080 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004081 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 nextByte:
4083 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004085 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004086 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 Py_XDECREF(errorHandler);
4088 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004090
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093 Py_XDECREF(errorHandler);
4094 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 return NULL;
4096}
4097
4098PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004099 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004101 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 char *p;
4103 char *q;
4104
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004105#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004106 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004107#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004108 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004109#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004110
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004111 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004112 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004113
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004114 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 if (repr == NULL)
4116 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004117 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004118 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004120 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 while (size-- > 0) {
4122 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004123#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 /* Map 32-bit characters to '\Uxxxxxxxx' */
4125 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004126 *p++ = '\\';
4127 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004128 *p++ = hexdigits[(ch >> 28) & 0xf];
4129 *p++ = hexdigits[(ch >> 24) & 0xf];
4130 *p++ = hexdigits[(ch >> 20) & 0xf];
4131 *p++ = hexdigits[(ch >> 16) & 0xf];
4132 *p++ = hexdigits[(ch >> 12) & 0xf];
4133 *p++ = hexdigits[(ch >> 8) & 0xf];
4134 *p++ = hexdigits[(ch >> 4) & 0xf];
4135 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004136 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004137 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004138#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4140 if (ch >= 0xD800 && ch < 0xDC00) {
4141 Py_UNICODE ch2;
4142 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004143
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 ch2 = *s++;
4145 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004146 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4148 *p++ = '\\';
4149 *p++ = 'U';
4150 *p++ = hexdigits[(ucs >> 28) & 0xf];
4151 *p++ = hexdigits[(ucs >> 24) & 0xf];
4152 *p++ = hexdigits[(ucs >> 20) & 0xf];
4153 *p++ = hexdigits[(ucs >> 16) & 0xf];
4154 *p++ = hexdigits[(ucs >> 12) & 0xf];
4155 *p++ = hexdigits[(ucs >> 8) & 0xf];
4156 *p++ = hexdigits[(ucs >> 4) & 0xf];
4157 *p++ = hexdigits[ucs & 0xf];
4158 continue;
4159 }
4160 /* Fall through: isolated surrogates are copied as-is */
4161 s--;
4162 size++;
4163 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004164#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004165 /* Map 16-bit characters to '\uxxxx' */
4166 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 *p++ = '\\';
4168 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004169 *p++ = hexdigits[(ch >> 12) & 0xf];
4170 *p++ = hexdigits[(ch >> 8) & 0xf];
4171 *p++ = hexdigits[(ch >> 4) & 0xf];
4172 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 /* Copy everything else as-is */
4175 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 *p++ = (char) ch;
4177 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004178 size = p - q;
4179
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004180 assert(size > 0);
4181 if (_PyBytes_Resize(&repr, size) < 0)
4182 return NULL;
4183 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184}
4185
4186PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4187{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004188 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004190 PyErr_BadArgument();
4191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004193 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4194 PyUnicode_GET_SIZE(unicode));
4195
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004196 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197}
4198
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004199/* --- Unicode Internal Codec ------------------------------------------- */
4200
4201PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 Py_ssize_t size,
4203 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004204{
4205 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004206 Py_ssize_t startinpos;
4207 Py_ssize_t endinpos;
4208 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004209 PyUnicodeObject *v;
4210 Py_UNICODE *p;
4211 const char *end;
4212 const char *reason;
4213 PyObject *errorHandler = NULL;
4214 PyObject *exc = NULL;
4215
Neal Norwitzd43069c2006-01-08 01:12:10 +00004216#ifdef Py_UNICODE_WIDE
4217 Py_UNICODE unimax = PyUnicode_GetMax();
4218#endif
4219
Thomas Wouters89f507f2006-12-13 04:49:30 +00004220 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004221 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4222 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004223 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004224 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004226 p = PyUnicode_AS_UNICODE(v);
4227 end = s + size;
4228
4229 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004230 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004231 /* We have to sanity check the raw data, otherwise doom looms for
4232 some malformed UCS-4 data. */
4233 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004234#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004235 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004236#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004237 end-s < Py_UNICODE_SIZE
4238 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004240 startinpos = s - starts;
4241 if (end-s < Py_UNICODE_SIZE) {
4242 endinpos = end-starts;
4243 reason = "truncated input";
4244 }
4245 else {
4246 endinpos = s - starts + Py_UNICODE_SIZE;
4247 reason = "illegal code point (> 0x10FFFF)";
4248 }
4249 outpos = p - PyUnicode_AS_UNICODE(v);
4250 if (unicode_decode_call_errorhandler(
4251 errors, &errorHandler,
4252 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004253 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004254 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004255 goto onError;
4256 }
4257 }
4258 else {
4259 p++;
4260 s += Py_UNICODE_SIZE;
4261 }
4262 }
4263
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004264 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004265 goto onError;
4266 Py_XDECREF(errorHandler);
4267 Py_XDECREF(exc);
4268 return (PyObject *)v;
4269
Benjamin Peterson29060642009-01-31 22:14:21 +00004270 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004271 Py_XDECREF(v);
4272 Py_XDECREF(errorHandler);
4273 Py_XDECREF(exc);
4274 return NULL;
4275}
4276
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277/* --- Latin-1 Codec ------------------------------------------------------ */
4278
4279PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 Py_ssize_t size,
4281 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282{
4283 PyUnicodeObject *v;
4284 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004285 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004286
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004288 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 Py_UNICODE r = *(unsigned char*)s;
4290 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004291 }
4292
Guido van Rossumd57fd912000-03-10 22:53:23 +00004293 v = _PyUnicode_New(size);
4294 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004295 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004299 e = s + size;
4300 /* Unrolling the copy makes it much faster by reducing the looping
4301 overhead. This is similar to what many memcpy() implementations do. */
4302 unrolled_end = e - 4;
4303 while (s < unrolled_end) {
4304 p[0] = (unsigned char) s[0];
4305 p[1] = (unsigned char) s[1];
4306 p[2] = (unsigned char) s[2];
4307 p[3] = (unsigned char) s[3];
4308 s += 4;
4309 p += 4;
4310 }
4311 while (s < e)
4312 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004314
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316 Py_XDECREF(v);
4317 return NULL;
4318}
4319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320/* create or adjust a UnicodeEncodeError */
4321static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 const char *encoding,
4323 const Py_UNICODE *unicode, Py_ssize_t size,
4324 Py_ssize_t startpos, Py_ssize_t endpos,
4325 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004328 *exceptionObject = PyUnicodeEncodeError_Create(
4329 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 }
4331 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4333 goto onError;
4334 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4335 goto onError;
4336 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4337 goto onError;
4338 return;
4339 onError:
4340 Py_DECREF(*exceptionObject);
4341 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 }
4343}
4344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345/* raises a UnicodeEncodeError */
4346static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 const char *encoding,
4348 const Py_UNICODE *unicode, Py_ssize_t size,
4349 Py_ssize_t startpos, Py_ssize_t endpos,
4350 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351{
4352 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004355 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356}
4357
4358/* error handling callback helper:
4359 build arguments, call the callback and check the arguments,
4360 put the result into newpos and return the replacement string, which
4361 has to be freed by the caller */
4362static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 PyObject **errorHandler,
4364 const char *encoding, const char *reason,
4365 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4366 Py_ssize_t startpos, Py_ssize_t endpos,
4367 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004369 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370
4371 PyObject *restuple;
4372 PyObject *resunicode;
4373
4374 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004375 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 }
4379
4380 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384
4385 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004390 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 Py_DECREF(restuple);
4392 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004394 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004395 &resunicode, newpos)) {
4396 Py_DECREF(restuple);
4397 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004399 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4400 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4401 Py_DECREF(restuple);
4402 return NULL;
4403 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004406 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4408 Py_DECREF(restuple);
4409 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 Py_INCREF(resunicode);
4412 Py_DECREF(restuple);
4413 return resunicode;
4414}
4415
4416static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 Py_ssize_t size,
4418 const char *errors,
4419 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420{
4421 /* output object */
4422 PyObject *res;
4423 /* pointers to the beginning and end+1 of input */
4424 const Py_UNICODE *startp = p;
4425 const Py_UNICODE *endp = p + size;
4426 /* pointer to the beginning of the unencodable characters */
4427 /* const Py_UNICODE *badp = NULL; */
4428 /* pointer into the output */
4429 char *str;
4430 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004431 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004432 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4433 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 PyObject *errorHandler = NULL;
4435 PyObject *exc = NULL;
4436 /* the following variable is used for caching string comparisons
4437 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4438 int known_errorHandler = -1;
4439
4440 /* allocate enough for a simple encoding without
4441 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004442 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004443 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004444 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004446 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004447 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448 ressize = size;
4449
4450 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 /* can we encode this? */
4454 if (c<limit) {
4455 /* no overflow check, because we know that the space is enough */
4456 *str++ = (char)c;
4457 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004458 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 else {
4460 Py_ssize_t unicodepos = p-startp;
4461 Py_ssize_t requiredsize;
4462 PyObject *repunicode;
4463 Py_ssize_t repsize;
4464 Py_ssize_t newpos;
4465 Py_ssize_t respos;
4466 Py_UNICODE *uni2;
4467 /* startpos for collecting unencodable chars */
4468 const Py_UNICODE *collstart = p;
4469 const Py_UNICODE *collend = p;
4470 /* find all unecodable characters */
4471 while ((collend < endp) && ((*collend)>=limit))
4472 ++collend;
4473 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4474 if (known_errorHandler==-1) {
4475 if ((errors==NULL) || (!strcmp(errors, "strict")))
4476 known_errorHandler = 1;
4477 else if (!strcmp(errors, "replace"))
4478 known_errorHandler = 2;
4479 else if (!strcmp(errors, "ignore"))
4480 known_errorHandler = 3;
4481 else if (!strcmp(errors, "xmlcharrefreplace"))
4482 known_errorHandler = 4;
4483 else
4484 known_errorHandler = 0;
4485 }
4486 switch (known_errorHandler) {
4487 case 1: /* strict */
4488 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4489 goto onError;
4490 case 2: /* replace */
4491 while (collstart++<collend)
4492 *str++ = '?'; /* fall through */
4493 case 3: /* ignore */
4494 p = collend;
4495 break;
4496 case 4: /* xmlcharrefreplace */
4497 respos = str - PyBytes_AS_STRING(res);
4498 /* determine replacement size (temporarily (mis)uses p) */
4499 for (p = collstart, repsize = 0; p < collend; ++p) {
4500 if (*p<10)
4501 repsize += 2+1+1;
4502 else if (*p<100)
4503 repsize += 2+2+1;
4504 else if (*p<1000)
4505 repsize += 2+3+1;
4506 else if (*p<10000)
4507 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004508#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 else
4510 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004511#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 else if (*p<100000)
4513 repsize += 2+5+1;
4514 else if (*p<1000000)
4515 repsize += 2+6+1;
4516 else
4517 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004518#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 }
4520 requiredsize = respos+repsize+(endp-collend);
4521 if (requiredsize > ressize) {
4522 if (requiredsize<2*ressize)
4523 requiredsize = 2*ressize;
4524 if (_PyBytes_Resize(&res, requiredsize))
4525 goto onError;
4526 str = PyBytes_AS_STRING(res) + respos;
4527 ressize = requiredsize;
4528 }
4529 /* generate replacement (temporarily (mis)uses p) */
4530 for (p = collstart; p < collend; ++p) {
4531 str += sprintf(str, "&#%d;", (int)*p);
4532 }
4533 p = collend;
4534 break;
4535 default:
4536 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4537 encoding, reason, startp, size, &exc,
4538 collstart-startp, collend-startp, &newpos);
4539 if (repunicode == NULL)
4540 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004541 if (PyBytes_Check(repunicode)) {
4542 /* Directly copy bytes result to output. */
4543 repsize = PyBytes_Size(repunicode);
4544 if (repsize > 1) {
4545 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004546 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004547 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4548 Py_DECREF(repunicode);
4549 goto onError;
4550 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004551 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004552 ressize += repsize-1;
4553 }
4554 memcpy(str, PyBytes_AsString(repunicode), repsize);
4555 str += repsize;
4556 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004557 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004558 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004559 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 /* need more space? (at least enough for what we
4561 have+the replacement+the rest of the string, so
4562 we won't have to check space for encodable characters) */
4563 respos = str - PyBytes_AS_STRING(res);
4564 repsize = PyUnicode_GET_SIZE(repunicode);
4565 requiredsize = respos+repsize+(endp-collend);
4566 if (requiredsize > ressize) {
4567 if (requiredsize<2*ressize)
4568 requiredsize = 2*ressize;
4569 if (_PyBytes_Resize(&res, requiredsize)) {
4570 Py_DECREF(repunicode);
4571 goto onError;
4572 }
4573 str = PyBytes_AS_STRING(res) + respos;
4574 ressize = requiredsize;
4575 }
4576 /* check if there is anything unencodable in the replacement
4577 and copy it to the output */
4578 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4579 c = *uni2;
4580 if (c >= limit) {
4581 raise_encode_exception(&exc, encoding, startp, size,
4582 unicodepos, unicodepos+1, reason);
4583 Py_DECREF(repunicode);
4584 goto onError;
4585 }
4586 *str = (char)c;
4587 }
4588 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004589 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004590 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004591 }
4592 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004593 /* Resize if we allocated to much */
4594 size = str - PyBytes_AS_STRING(res);
4595 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004596 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004597 if (_PyBytes_Resize(&res, size) < 0)
4598 goto onError;
4599 }
4600
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 Py_XDECREF(errorHandler);
4602 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004603 return res;
4604
4605 onError:
4606 Py_XDECREF(res);
4607 Py_XDECREF(errorHandler);
4608 Py_XDECREF(exc);
4609 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610}
4611
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 Py_ssize_t size,
4614 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004616 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617}
4618
4619PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4620{
4621 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 PyErr_BadArgument();
4623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624 }
4625 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004626 PyUnicode_GET_SIZE(unicode),
4627 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628}
4629
4630/* --- 7-bit ASCII Codec -------------------------------------------------- */
4631
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 Py_ssize_t size,
4634 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 PyUnicodeObject *v;
4638 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004639 Py_ssize_t startinpos;
4640 Py_ssize_t endinpos;
4641 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 const char *e;
4643 PyObject *errorHandler = NULL;
4644 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004645
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004647 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 Py_UNICODE r = *(unsigned char*)s;
4649 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004650 }
Tim Petersced69f82003-09-16 20:30:58 +00004651
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 v = _PyUnicode_New(size);
4653 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004658 e = s + size;
4659 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 register unsigned char c = (unsigned char)*s;
4661 if (c < 128) {
4662 *p++ = c;
4663 ++s;
4664 }
4665 else {
4666 startinpos = s-starts;
4667 endinpos = startinpos + 1;
4668 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4669 if (unicode_decode_call_errorhandler(
4670 errors, &errorHandler,
4671 "ascii", "ordinal not in range(128)",
4672 &starts, &e, &startinpos, &endinpos, &exc, &s,
4673 &v, &outpos, &p))
4674 goto onError;
4675 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004677 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4679 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 Py_XDECREF(errorHandler);
4681 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004683
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 Py_XDECREF(errorHandler);
4687 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 return NULL;
4689}
4690
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 Py_ssize_t size,
4693 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696}
4697
4698PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4699{
4700 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 PyErr_BadArgument();
4702 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 }
4704 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004705 PyUnicode_GET_SIZE(unicode),
4706 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707}
4708
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004709#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004710
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004711/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004712
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004713#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004714#define NEED_RETRY
4715#endif
4716
4717/* XXX This code is limited to "true" double-byte encodings, as
4718 a) it assumes an incomplete character consists of a single byte, and
4719 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004721
static int is_dbcs_lead_byte(const char *s, int offset)
{
    /* Byte under test and the position CharPrev() reports before it. */
    const char *candidate = s + offset;
    const char *before;

    /* Not flagged as a lead byte at all: nothing more to check. */
    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);

    /* CharPrev() did not move: there is no earlier character. */
    if (before == candidate)
        return 1;
    /* The previous position does not start a double-byte character. */
    if (!IsDBCSLeadByte(*before))
        return 1;
    /* The previous character spans exactly two bytes, so it ends right
       before the candidate. */
    return (candidate - before == 2);
}
4732
/*
 * Decode an MBCS string into a unicode object. If 'final' is set, a
 * trailing lead byte is converted too. Returns the consumed size on
 * success, -1 otherwise.
 */
4737static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 const char *s, /* MBCS string */
4739 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004740 int final,
4741 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004742{
4743 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004744 Py_ssize_t n;
4745 DWORD usize;
4746 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004747
4748 assert(size >= 0);
4749
Victor Stinner554f3f02010-06-16 23:33:54 +00004750 /* check and handle 'errors' arg */
4751 if (errors==NULL || strcmp(errors, "strict")==0)
4752 flags = MB_ERR_INVALID_CHARS;
4753 else if (strcmp(errors, "ignore")==0)
4754 flags = 0;
4755 else {
4756 PyErr_Format(PyExc_ValueError,
4757 "mbcs encoding does not support errors='%s'",
4758 errors);
4759 return -1;
4760 }
4761
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004762 /* Skip trailing lead-byte unless 'final' is set */
4763 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004765
4766 /* First get the size of the result */
4767 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004768 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4769 if (usize==0)
4770 goto mbcs_decode_error;
4771 } else
4772 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004773
4774 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 /* Create unicode object */
4776 *v = _PyUnicode_New(usize);
4777 if (*v == NULL)
4778 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004779 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004780 }
4781 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 /* Extend unicode object */
4783 n = PyUnicode_GET_SIZE(*v);
4784 if (_PyUnicode_Resize(v, n + usize) < 0)
4785 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004786 }
4787
4788 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004789 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004791 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4792 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004794 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004795 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004796
4797mbcs_decode_error:
4798 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4799 we raise a UnicodeDecodeError - else it is a 'generic'
4800 windows error
4801 */
4802 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4803 /* Ideally, we should get reason from FormatMessage - this
4804 is the Windows 2000 English version of the message
4805 */
4806 PyObject *exc = NULL;
4807 const char *reason = "No mapping for the Unicode character exists "
4808 "in the target multi-byte code page.";
4809 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4810 if (exc != NULL) {
4811 PyCodec_StrictErrors(exc);
4812 Py_DECREF(exc);
4813 }
4814 } else {
4815 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4816 }
4817 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004818}
4819
4820PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 Py_ssize_t size,
4822 const char *errors,
4823 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004824{
4825 PyUnicodeObject *v = NULL;
4826 int done;
4827
4828 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004829 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004830
4831#ifdef NEED_RETRY
4832 retry:
4833 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004834 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004835 else
4836#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004837 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004838
4839 if (done < 0) {
4840 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004841 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004842 }
4843
4844 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004846
4847#ifdef NEED_RETRY
4848 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 s += done;
4850 size -= done;
4851 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004852 }
4853#endif
4854
4855 return (PyObject *)v;
4856}
4857
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004858PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004859 Py_ssize_t size,
4860 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004861{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004862 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4863}
4864
/*
 * Convert unicode into a bytes object (MBCS).
 * Returns 0 on success, -1 otherwise.
 */
4869static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004871 int size, /* size of unicode */
4872 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004873{
Victor Stinner554f3f02010-06-16 23:33:54 +00004874 BOOL usedDefaultChar = FALSE;
4875 BOOL *pusedDefaultChar;
4876 int mbcssize;
4877 Py_ssize_t n;
4878 PyObject *exc = NULL;
4879 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004880
4881 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004882
Victor Stinner554f3f02010-06-16 23:33:54 +00004883 /* check and handle 'errors' arg */
4884 if (errors==NULL || strcmp(errors, "strict")==0) {
4885 flags = WC_NO_BEST_FIT_CHARS;
4886 pusedDefaultChar = &usedDefaultChar;
4887 } else if (strcmp(errors, "replace")==0) {
4888 flags = 0;
4889 pusedDefaultChar = NULL;
4890 } else {
4891 PyErr_Format(PyExc_ValueError,
4892 "mbcs encoding does not support errors='%s'",
4893 errors);
4894 return -1;
4895 }
4896
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004897 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004898 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004899 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4900 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 if (mbcssize == 0) {
4902 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4903 return -1;
4904 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004905 /* If we used a default char, then we failed! */
4906 if (pusedDefaultChar && *pusedDefaultChar)
4907 goto mbcs_encode_error;
4908 } else {
4909 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004910 }
4911
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004912 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 /* Create string object */
4914 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4915 if (*repr == NULL)
4916 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004917 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004918 }
4919 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 /* Extend string object */
4921 n = PyBytes_Size(*repr);
4922 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4923 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004924 }
4925
4926 /* Do the conversion */
4927 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004929 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4930 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4932 return -1;
4933 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004934 if (pusedDefaultChar && *pusedDefaultChar)
4935 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004936 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004937 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004938
4939mbcs_encode_error:
4940 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4941 Py_XDECREF(exc);
4942 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004943}
4944
4945PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004948{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004949 PyObject *repr = NULL;
4950 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004951
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004952#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004954 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004955 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004956 else
4957#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004958 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004959
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004960 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 Py_XDECREF(repr);
4962 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004963 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004964
4965#ifdef NEED_RETRY
4966 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 p += INT_MAX;
4968 size -= INT_MAX;
4969 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004970 }
4971#endif
4972
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004973 return repr;
4974}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004975
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004976PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4977{
4978 if (!PyUnicode_Check(unicode)) {
4979 PyErr_BadArgument();
4980 return NULL;
4981 }
4982 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 PyUnicode_GET_SIZE(unicode),
4984 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004985}
4986
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004987#undef NEED_RETRY
4988
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004989#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004990
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991/* --- Character Mapping Codec -------------------------------------------- */
4992
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 Py_ssize_t size,
4995 PyObject *mapping,
4996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004998 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004999 Py_ssize_t startinpos;
5000 Py_ssize_t endinpos;
5001 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003 PyUnicodeObject *v;
5004 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005005 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006 PyObject *errorHandler = NULL;
5007 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005008 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005009 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005010
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011 /* Default to Latin-1 */
5012 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014
5015 v = _PyUnicode_New(size);
5016 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005021 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005022 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 mapstring = PyUnicode_AS_UNICODE(mapping);
5024 maplen = PyUnicode_GET_SIZE(mapping);
5025 while (s < e) {
5026 unsigned char ch = *s;
5027 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 if (ch < maplen)
5030 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 if (x == 0xfffe) {
5033 /* undefined mapping */
5034 outpos = p-PyUnicode_AS_UNICODE(v);
5035 startinpos = s-starts;
5036 endinpos = startinpos+1;
5037 if (unicode_decode_call_errorhandler(
5038 errors, &errorHandler,
5039 "charmap", "character maps to <undefined>",
5040 &starts, &e, &startinpos, &endinpos, &exc, &s,
5041 &v, &outpos, &p)) {
5042 goto onError;
5043 }
5044 continue;
5045 }
5046 *p++ = x;
5047 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005048 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005049 }
5050 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 while (s < e) {
5052 unsigned char ch = *s;
5053 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005054
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5056 w = PyLong_FromLong((long)ch);
5057 if (w == NULL)
5058 goto onError;
5059 x = PyObject_GetItem(mapping, w);
5060 Py_DECREF(w);
5061 if (x == NULL) {
5062 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5063 /* No mapping found means: mapping is undefined. */
5064 PyErr_Clear();
5065 x = Py_None;
5066 Py_INCREF(x);
5067 } else
5068 goto onError;
5069 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005070
Benjamin Peterson29060642009-01-31 22:14:21 +00005071 /* Apply mapping */
5072 if (PyLong_Check(x)) {
5073 long value = PyLong_AS_LONG(x);
5074 if (value < 0 || value > 65535) {
5075 PyErr_SetString(PyExc_TypeError,
5076 "character mapping must be in range(65536)");
5077 Py_DECREF(x);
5078 goto onError;
5079 }
5080 *p++ = (Py_UNICODE)value;
5081 }
5082 else if (x == Py_None) {
5083 /* undefined mapping */
5084 outpos = p-PyUnicode_AS_UNICODE(v);
5085 startinpos = s-starts;
5086 endinpos = startinpos+1;
5087 if (unicode_decode_call_errorhandler(
5088 errors, &errorHandler,
5089 "charmap", "character maps to <undefined>",
5090 &starts, &e, &startinpos, &endinpos, &exc, &s,
5091 &v, &outpos, &p)) {
5092 Py_DECREF(x);
5093 goto onError;
5094 }
5095 Py_DECREF(x);
5096 continue;
5097 }
5098 else if (PyUnicode_Check(x)) {
5099 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005100
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 if (targetsize == 1)
5102 /* 1-1 mapping */
5103 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005104
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 else if (targetsize > 1) {
5106 /* 1-n mapping */
5107 if (targetsize > extrachars) {
5108 /* resize first */
5109 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5110 Py_ssize_t needed = (targetsize - extrachars) + \
5111 (targetsize << 2);
5112 extrachars += needed;
5113 /* XXX overflow detection missing */
5114 if (_PyUnicode_Resize(&v,
5115 PyUnicode_GET_SIZE(v) + needed) < 0) {
5116 Py_DECREF(x);
5117 goto onError;
5118 }
5119 p = PyUnicode_AS_UNICODE(v) + oldpos;
5120 }
5121 Py_UNICODE_COPY(p,
5122 PyUnicode_AS_UNICODE(x),
5123 targetsize);
5124 p += targetsize;
5125 extrachars -= targetsize;
5126 }
5127 /* 1-0 mapping: skip the character */
5128 }
5129 else {
5130 /* wrong return value */
5131 PyErr_SetString(PyExc_TypeError,
5132 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005133 Py_DECREF(x);
5134 goto onError;
5135 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 Py_DECREF(x);
5137 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 }
5140 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5142 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005143 Py_XDECREF(errorHandler);
5144 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005146
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005148 Py_XDECREF(errorHandler);
5149 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 Py_XDECREF(v);
5151 return NULL;
5152}
5153
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005154/* Charmap encoding: the lookup table */
5155
5156struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 PyObject_HEAD
5158 unsigned char level1[32];
5159 int count2, count3;
5160 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005161};
5162
5163static PyObject*
5164encoding_map_size(PyObject *obj, PyObject* args)
5165{
5166 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005167 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005169}
5170
5171static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005172 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 PyDoc_STR("Return the size (in bytes) of this object") },
5174 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005175};
5176
5177static void
5178encoding_map_dealloc(PyObject* o)
5179{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005180 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005181}
5182
5183static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005184 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 "EncodingMap", /*tp_name*/
5186 sizeof(struct encoding_map), /*tp_basicsize*/
5187 0, /*tp_itemsize*/
5188 /* methods */
5189 encoding_map_dealloc, /*tp_dealloc*/
5190 0, /*tp_print*/
5191 0, /*tp_getattr*/
5192 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005193 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 0, /*tp_repr*/
5195 0, /*tp_as_number*/
5196 0, /*tp_as_sequence*/
5197 0, /*tp_as_mapping*/
5198 0, /*tp_hash*/
5199 0, /*tp_call*/
5200 0, /*tp_str*/
5201 0, /*tp_getattro*/
5202 0, /*tp_setattro*/
5203 0, /*tp_as_buffer*/
5204 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5205 0, /*tp_doc*/
5206 0, /*tp_traverse*/
5207 0, /*tp_clear*/
5208 0, /*tp_richcompare*/
5209 0, /*tp_weaklistoffset*/
5210 0, /*tp_iter*/
5211 0, /*tp_iternext*/
5212 encoding_map_methods, /*tp_methods*/
5213 0, /*tp_members*/
5214 0, /*tp_getset*/
5215 0, /*tp_base*/
5216 0, /*tp_dict*/
5217 0, /*tp_descr_get*/
5218 0, /*tp_descr_set*/
5219 0, /*tp_dictoffset*/
5220 0, /*tp_init*/
5221 0, /*tp_alloc*/
5222 0, /*tp_new*/
5223 0, /*tp_free*/
5224 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005225};
5226
5227PyObject*
5228PyUnicode_BuildEncodingMap(PyObject* string)
5229{
5230 Py_UNICODE *decode;
5231 PyObject *result;
5232 struct encoding_map *mresult;
5233 int i;
5234 int need_dict = 0;
5235 unsigned char level1[32];
5236 unsigned char level2[512];
5237 unsigned char *mlevel1, *mlevel2, *mlevel3;
5238 int count2 = 0, count3 = 0;
5239
5240 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5241 PyErr_BadArgument();
5242 return NULL;
5243 }
5244 decode = PyUnicode_AS_UNICODE(string);
5245 memset(level1, 0xFF, sizeof level1);
5246 memset(level2, 0xFF, sizeof level2);
5247
5248 /* If there isn't a one-to-one mapping of NULL to \0,
5249 or if there are non-BMP characters, we need to use
5250 a mapping dictionary. */
5251 if (decode[0] != 0)
5252 need_dict = 1;
5253 for (i = 1; i < 256; i++) {
5254 int l1, l2;
5255 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005256#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005257 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005258#endif
5259 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005260 need_dict = 1;
5261 break;
5262 }
5263 if (decode[i] == 0xFFFE)
5264 /* unmapped character */
5265 continue;
5266 l1 = decode[i] >> 11;
5267 l2 = decode[i] >> 7;
5268 if (level1[l1] == 0xFF)
5269 level1[l1] = count2++;
5270 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005271 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005272 }
5273
5274 if (count2 >= 0xFF || count3 >= 0xFF)
5275 need_dict = 1;
5276
5277 if (need_dict) {
5278 PyObject *result = PyDict_New();
5279 PyObject *key, *value;
5280 if (!result)
5281 return NULL;
5282 for (i = 0; i < 256; i++) {
5283 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005284 key = PyLong_FromLong(decode[i]);
5285 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005286 if (!key || !value)
5287 goto failed1;
5288 if (PyDict_SetItem(result, key, value) == -1)
5289 goto failed1;
5290 Py_DECREF(key);
5291 Py_DECREF(value);
5292 }
5293 return result;
5294 failed1:
5295 Py_XDECREF(key);
5296 Py_XDECREF(value);
5297 Py_DECREF(result);
5298 return NULL;
5299 }
5300
5301 /* Create a three-level trie */
5302 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5303 16*count2 + 128*count3 - 1);
5304 if (!result)
5305 return PyErr_NoMemory();
5306 PyObject_Init(result, &EncodingMapType);
5307 mresult = (struct encoding_map*)result;
5308 mresult->count2 = count2;
5309 mresult->count3 = count3;
5310 mlevel1 = mresult->level1;
5311 mlevel2 = mresult->level23;
5312 mlevel3 = mresult->level23 + 16*count2;
5313 memcpy(mlevel1, level1, 32);
5314 memset(mlevel2, 0xFF, 16*count2);
5315 memset(mlevel3, 0, 128*count3);
5316 count3 = 0;
5317 for (i = 1; i < 256; i++) {
5318 int o1, o2, o3, i2, i3;
5319 if (decode[i] == 0xFFFE)
5320 /* unmapped character */
5321 continue;
5322 o1 = decode[i]>>11;
5323 o2 = (decode[i]>>7) & 0xF;
5324 i2 = 16*mlevel1[o1] + o2;
5325 if (mlevel2[i2] == 0xFF)
5326 mlevel2[i2] = count3++;
5327 o3 = decode[i] & 0x7F;
5328 i3 = 128*mlevel2[i2] + o3;
5329 mlevel3[i3] = i;
5330 }
5331 return result;
5332}
5333
5334static int
5335encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5336{
5337 struct encoding_map *map = (struct encoding_map*)mapping;
5338 int l1 = c>>11;
5339 int l2 = (c>>7) & 0xF;
5340 int l3 = c & 0x7F;
5341 int i;
5342
5343#ifdef Py_UNICODE_WIDE
5344 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005346 }
5347#endif
5348 if (c == 0)
5349 return 0;
5350 /* level 1*/
5351 i = map->level1[l1];
5352 if (i == 0xFF) {
5353 return -1;
5354 }
5355 /* level 2*/
5356 i = map->level23[16*i+l2];
5357 if (i == 0xFF) {
5358 return -1;
5359 }
5360 /* level 3 */
5361 i = map->level23[16*map->count2 + 128*i + l3];
5362 if (i == 0) {
5363 return -1;
5364 }
5365 return i;
5366}
5367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368/* Lookup the character ch in the mapping. If the character
5369 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005370 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372{
Christian Heimes217cfd12007-12-02 14:31:20 +00005373 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005374 PyObject *x;
5375
5376 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 x = PyObject_GetItem(mapping, w);
5379 Py_DECREF(w);
5380 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5382 /* No mapping found means: mapping is undefined. */
5383 PyErr_Clear();
5384 x = Py_None;
5385 Py_INCREF(x);
5386 return x;
5387 } else
5388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005390 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005392 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 long value = PyLong_AS_LONG(x);
5394 if (value < 0 || value > 255) {
5395 PyErr_SetString(PyExc_TypeError,
5396 "character mapping must be in range(256)");
5397 Py_DECREF(x);
5398 return NULL;
5399 }
5400 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005402 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 /* wrong return value */
5406 PyErr_Format(PyExc_TypeError,
5407 "character mapping must return integer, bytes or None, not %.400s",
5408 x->ob_type->tp_name);
5409 Py_DECREF(x);
5410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 }
5412}
5413
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005414static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005415charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005416{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005417 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5418 /* exponentially overallocate to minimize reallocations */
5419 if (requiredsize < 2*outsize)
5420 requiredsize = 2*outsize;
5421 if (_PyBytes_Resize(outobj, requiredsize))
5422 return -1;
5423 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005424}
5425
/* Outcome of charmapencode_output. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005429/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005430 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431 space is available. Return a new reference to the object that
5432 was put in the output buffer, or Py_None, if the mapping was undefined
5433 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005434 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005435static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005436charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005439 PyObject *rep;
5440 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005441 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442
Christian Heimes90aa7642007-12-19 02:45:37 +00005443 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005444 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005446 if (res == -1)
5447 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 if (outsize<requiredsize)
5449 if (charmapencode_resize(outobj, outpos, requiredsize))
5450 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005451 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 outstart[(*outpos)++] = (char)res;
5453 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005454 }
5455
5456 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005459 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 Py_DECREF(rep);
5461 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005462 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 if (PyLong_Check(rep)) {
5464 Py_ssize_t requiredsize = *outpos+1;
5465 if (outsize<requiredsize)
5466 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5467 Py_DECREF(rep);
5468 return enc_EXCEPTION;
5469 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005470 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005472 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 else {
5474 const char *repchars = PyBytes_AS_STRING(rep);
5475 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5476 Py_ssize_t requiredsize = *outpos+repsize;
5477 if (outsize<requiredsize)
5478 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5479 Py_DECREF(rep);
5480 return enc_EXCEPTION;
5481 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005482 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 memcpy(outstart + *outpos, repchars, repsize);
5484 *outpos += repsize;
5485 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005487 Py_DECREF(rep);
5488 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489}
5490
5491/* handle an error in PyUnicode_EncodeCharmap
5492 Return 0 on success, -1 on error */
5493static
5494int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005495 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005496 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005497 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005498 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499{
5500 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005501 Py_ssize_t repsize;
5502 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 Py_UNICODE *uni2;
5504 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005505 Py_ssize_t collstartpos = *inpos;
5506 Py_ssize_t collendpos = *inpos+1;
5507 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 char *encoding = "charmap";
5509 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005510 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 /* find all unencodable characters */
5513 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005514 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005515 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 int res = encoding_map_lookup(p[collendpos], mapping);
5517 if (res != -1)
5518 break;
5519 ++collendpos;
5520 continue;
5521 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005522
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 rep = charmapencode_lookup(p[collendpos], mapping);
5524 if (rep==NULL)
5525 return -1;
5526 else if (rep!=Py_None) {
5527 Py_DECREF(rep);
5528 break;
5529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005530 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005532 }
5533 /* cache callback name lookup
5534 * (if not done yet, i.e. it's the first error) */
5535 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 if ((errors==NULL) || (!strcmp(errors, "strict")))
5537 *known_errorHandler = 1;
5538 else if (!strcmp(errors, "replace"))
5539 *known_errorHandler = 2;
5540 else if (!strcmp(errors, "ignore"))
5541 *known_errorHandler = 3;
5542 else if (!strcmp(errors, "xmlcharrefreplace"))
5543 *known_errorHandler = 4;
5544 else
5545 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005546 }
5547 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005548 case 1: /* strict */
5549 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5550 return -1;
5551 case 2: /* replace */
5552 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 x = charmapencode_output('?', mapping, res, respos);
5554 if (x==enc_EXCEPTION) {
5555 return -1;
5556 }
5557 else if (x==enc_FAILED) {
5558 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5559 return -1;
5560 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005561 }
5562 /* fall through */
5563 case 3: /* ignore */
5564 *inpos = collendpos;
5565 break;
5566 case 4: /* xmlcharrefreplace */
5567 /* generate replacement (temporarily (mis)uses p) */
5568 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 char buffer[2+29+1+1];
5570 char *cp;
5571 sprintf(buffer, "&#%d;", (int)p[collpos]);
5572 for (cp = buffer; *cp; ++cp) {
5573 x = charmapencode_output(*cp, mapping, res, respos);
5574 if (x==enc_EXCEPTION)
5575 return -1;
5576 else if (x==enc_FAILED) {
5577 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5578 return -1;
5579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005580 }
5581 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005582 *inpos = collendpos;
5583 break;
5584 default:
5585 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 encoding, reason, p, size, exceptionObject,
5587 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005588 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005590 if (PyBytes_Check(repunicode)) {
5591 /* Directly copy bytes result to output. */
5592 Py_ssize_t outsize = PyBytes_Size(*res);
5593 Py_ssize_t requiredsize;
5594 repsize = PyBytes_Size(repunicode);
5595 requiredsize = *respos + repsize;
5596 if (requiredsize > outsize)
5597 /* Make room for all additional bytes. */
5598 if (charmapencode_resize(res, respos, requiredsize)) {
5599 Py_DECREF(repunicode);
5600 return -1;
5601 }
5602 memcpy(PyBytes_AsString(*res) + *respos,
5603 PyBytes_AsString(repunicode), repsize);
5604 *respos += repsize;
5605 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005606 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005607 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005609 /* generate replacement */
5610 repsize = PyUnicode_GET_SIZE(repunicode);
5611 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005612 x = charmapencode_output(*uni2, mapping, res, respos);
5613 if (x==enc_EXCEPTION) {
5614 return -1;
5615 }
5616 else if (x==enc_FAILED) {
5617 Py_DECREF(repunicode);
5618 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5619 return -1;
5620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005621 }
5622 *inpos = newpos;
5623 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 }
5625 return 0;
5626}
5627
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 Py_ssize_t size,
5630 PyObject *mapping,
5631 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 /* output object */
5634 PyObject *res = NULL;
5635 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005636 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005638 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 PyObject *errorHandler = NULL;
5640 PyObject *exc = NULL;
5641 /* the following variable is used for caching string comparisons
5642 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5643 * 3=ignore, 4=xmlcharrefreplace */
5644 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645
5646 /* Default to Latin-1 */
5647 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 /* allocate enough for a simple encoding without
5651 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005652 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 if (res == NULL)
5654 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005655 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 /* try to encode it */
5660 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5661 if (x==enc_EXCEPTION) /* error */
5662 goto onError;
5663 if (x==enc_FAILED) { /* unencodable character */
5664 if (charmap_encoding_error(p, size, &inpos, mapping,
5665 &exc,
5666 &known_errorHandler, &errorHandler, errors,
5667 &res, &respos)) {
5668 goto onError;
5669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005670 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 else
5672 /* done with this character => adjust input position */
5673 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005677 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005678 if (_PyBytes_Resize(&res, respos) < 0)
5679 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005680
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005681 Py_XDECREF(exc);
5682 Py_XDECREF(errorHandler);
5683 return res;
5684
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 Py_XDECREF(res);
5687 Py_XDECREF(exc);
5688 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 return NULL;
5690}
5691
5692PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694{
5695 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 PyErr_BadArgument();
5697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 }
5699 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 PyUnicode_GET_SIZE(unicode),
5701 mapping,
5702 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703}
5704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705/* create or adjust a UnicodeTranslateError */
5706static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 const Py_UNICODE *unicode, Py_ssize_t size,
5708 Py_ssize_t startpos, Py_ssize_t endpos,
5709 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005712 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 }
5715 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5717 goto onError;
5718 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5719 goto onError;
5720 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5721 goto onError;
5722 return;
5723 onError:
5724 Py_DECREF(*exceptionObject);
5725 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 }
5727}
5728
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729/* raises a UnicodeTranslateError */
5730static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 const Py_UNICODE *unicode, Py_ssize_t size,
5732 Py_ssize_t startpos, Py_ssize_t endpos,
5733 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734{
5735 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739}
5740
5741/* error handling callback helper:
5742 build arguments, call the callback and check the arguments,
5743 put the result into newpos and return the replacement string, which
5744 has to be freed by the caller */
5745static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 PyObject **errorHandler,
5747 const char *reason,
5748 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5749 Py_ssize_t startpos, Py_ssize_t endpos,
5750 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005751{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005752 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005754 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 PyObject *restuple;
5756 PyObject *resunicode;
5757
5758 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005760 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 }
5763
5764 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768
5769 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005774 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 Py_DECREF(restuple);
5776 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 }
5778 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 &resunicode, &i_newpos)) {
5780 Py_DECREF(restuple);
5781 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005783 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005785 else
5786 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005787 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5789 Py_DECREF(restuple);
5790 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005791 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 Py_INCREF(resunicode);
5793 Py_DECREF(restuple);
5794 return resunicode;
5795}
5796
5797/* Lookup the character ch in the mapping and put the result in result,
5798 which must be decrefed by the caller.
5799 Return 0 on success, -1 on error */
5800static
5801int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5802{
Christian Heimes217cfd12007-12-02 14:31:20 +00005803 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 PyObject *x;
5805
5806 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005808 x = PyObject_GetItem(mapping, w);
5809 Py_DECREF(w);
5810 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5812 /* No mapping found means: use 1:1 mapping. */
5813 PyErr_Clear();
5814 *result = NULL;
5815 return 0;
5816 } else
5817 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 }
5819 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 *result = x;
5821 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005822 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005823 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 long value = PyLong_AS_LONG(x);
5825 long max = PyUnicode_GetMax();
5826 if (value < 0 || value > max) {
5827 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005828 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 Py_DECREF(x);
5830 return -1;
5831 }
5832 *result = x;
5833 return 0;
5834 }
5835 else if (PyUnicode_Check(x)) {
5836 *result = x;
5837 return 0;
5838 }
5839 else {
5840 /* wrong return value */
5841 PyErr_SetString(PyExc_TypeError,
5842 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005843 Py_DECREF(x);
5844 return -1;
5845 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846}
5847/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 if not reallocate and adjust various state variables.
5849 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850static
Walter Dörwald4894c302003-10-24 14:25:28 +00005851int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005854 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005855 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 /* remember old output position */
5857 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5858 /* exponentially overallocate to minimize reallocations */
5859 if (requiredsize < 2 * oldsize)
5860 requiredsize = 2 * oldsize;
5861 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5862 return -1;
5863 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 }
5865 return 0;
5866}
5867/* lookup the character, put the result in the output string and adjust
5868 various state variables. Return a new reference to the object that
5869 was put in the output buffer in *result, or Py_None, if the mapping was
5870 undefined (in which case no character was written).
5871 The called must decref result.
5872 Return 0 on success, -1 on error. */
5873static
Walter Dörwald4894c302003-10-24 14:25:28 +00005874int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5876 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877{
Walter Dörwald4894c302003-10-24 14:25:28 +00005878 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 /* not found => default to 1:1 mapping */
5882 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 }
5884 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005886 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 /* no overflow check, because we know that the space is enough */
5888 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 }
5890 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5892 if (repsize==1) {
5893 /* no overflow check, because we know that the space is enough */
5894 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5895 }
5896 else if (repsize!=0) {
5897 /* more than one character */
5898 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5899 (insize - (curinp-startinp)) +
5900 repsize - 1;
5901 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5902 return -1;
5903 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5904 *outp += repsize;
5905 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 }
5907 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 return 0;
5910}
5911
5912PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 Py_ssize_t size,
5914 PyObject *mapping,
5915 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 /* output object */
5918 PyObject *res = NULL;
5919 /* pointers to the beginning and end+1 of input */
5920 const Py_UNICODE *startp = p;
5921 const Py_UNICODE *endp = p + size;
5922 /* pointer into the output */
5923 Py_UNICODE *str;
5924 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005925 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 char *reason = "character maps to <undefined>";
5927 PyObject *errorHandler = NULL;
5928 PyObject *exc = NULL;
5929 /* the following variable is used for caching string comparisons
5930 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5931 * 3=ignore, 4=xmlcharrefreplace */
5932 int known_errorHandler = -1;
5933
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 PyErr_BadArgument();
5936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938
5939 /* allocate enough for a simple 1:1 translation without
5940 replacements, if we need more, we'll resize */
5941 res = PyUnicode_FromUnicode(NULL, size);
5942 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005948 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 /* try to encode it */
5950 PyObject *x = NULL;
5951 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5952 Py_XDECREF(x);
5953 goto onError;
5954 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005955 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 if (x!=Py_None) /* it worked => adjust input pointer */
5957 ++p;
5958 else { /* untranslatable character */
5959 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5960 Py_ssize_t repsize;
5961 Py_ssize_t newpos;
5962 Py_UNICODE *uni2;
5963 /* startpos for collecting untranslatable chars */
5964 const Py_UNICODE *collstart = p;
5965 const Py_UNICODE *collend = p+1;
5966 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 /* find all untranslatable characters */
5969 while (collend < endp) {
5970 if (charmaptranslate_lookup(*collend, mapping, &x))
5971 goto onError;
5972 Py_XDECREF(x);
5973 if (x!=Py_None)
5974 break;
5975 ++collend;
5976 }
5977 /* cache callback name lookup
5978 * (if not done yet, i.e. it's the first error) */
5979 if (known_errorHandler==-1) {
5980 if ((errors==NULL) || (!strcmp(errors, "strict")))
5981 known_errorHandler = 1;
5982 else if (!strcmp(errors, "replace"))
5983 known_errorHandler = 2;
5984 else if (!strcmp(errors, "ignore"))
5985 known_errorHandler = 3;
5986 else if (!strcmp(errors, "xmlcharrefreplace"))
5987 known_errorHandler = 4;
5988 else
5989 known_errorHandler = 0;
5990 }
5991 switch (known_errorHandler) {
5992 case 1: /* strict */
5993 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005994 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 case 2: /* replace */
5996 /* No need to check for space, this is a 1:1 replacement */
5997 for (coll = collstart; coll<collend; ++coll)
5998 *str++ = '?';
5999 /* fall through */
6000 case 3: /* ignore */
6001 p = collend;
6002 break;
6003 case 4: /* xmlcharrefreplace */
6004 /* generate replacement (temporarily (mis)uses p) */
6005 for (p = collstart; p < collend; ++p) {
6006 char buffer[2+29+1+1];
6007 char *cp;
6008 sprintf(buffer, "&#%d;", (int)*p);
6009 if (charmaptranslate_makespace(&res, &str,
6010 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6011 goto onError;
6012 for (cp = buffer; *cp; ++cp)
6013 *str++ = *cp;
6014 }
6015 p = collend;
6016 break;
6017 default:
6018 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6019 reason, startp, size, &exc,
6020 collstart-startp, collend-startp, &newpos);
6021 if (repunicode == NULL)
6022 goto onError;
6023 /* generate replacement */
6024 repsize = PyUnicode_GET_SIZE(repunicode);
6025 if (charmaptranslate_makespace(&res, &str,
6026 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6027 Py_DECREF(repunicode);
6028 goto onError;
6029 }
6030 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6031 *str++ = *uni2;
6032 p = startp + newpos;
6033 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006035 }
6036 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 /* Resize if we allocated to much */
6038 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006039 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 if (PyUnicode_Resize(&res, respos) < 0)
6041 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 }
6043 Py_XDECREF(exc);
6044 Py_XDECREF(errorHandler);
6045 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 Py_XDECREF(res);
6049 Py_XDECREF(exc);
6050 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 return NULL;
6052}
6053
6054PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 PyObject *mapping,
6056 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057{
6058 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006059
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 str = PyUnicode_FromObject(str);
6061 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 PyUnicode_GET_SIZE(str),
6065 mapping,
6066 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 Py_DECREF(str);
6068 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006069
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 Py_XDECREF(str);
6072 return NULL;
6073}
Tim Petersced69f82003-09-16 20:30:58 +00006074
Guido van Rossum9e896b32000-04-05 20:11:21 +00006075/* --- Decimal Encoder ---------------------------------------------------- */
6076
6077int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 Py_ssize_t length,
6079 char *output,
6080 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006081{
6082 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 PyObject *errorHandler = NULL;
6084 PyObject *exc = NULL;
6085 const char *encoding = "decimal";
6086 const char *reason = "invalid decimal Unicode string";
6087 /* the following variable is used for caching string comparisons
6088 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6089 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006090
6091 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 PyErr_BadArgument();
6093 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006094 }
6095
6096 p = s;
6097 end = s + length;
6098 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 register Py_UNICODE ch = *p;
6100 int decimal;
6101 PyObject *repunicode;
6102 Py_ssize_t repsize;
6103 Py_ssize_t newpos;
6104 Py_UNICODE *uni2;
6105 Py_UNICODE *collstart;
6106 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006107
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006109 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 ++p;
6111 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006112 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 decimal = Py_UNICODE_TODECIMAL(ch);
6114 if (decimal >= 0) {
6115 *output++ = '0' + decimal;
6116 ++p;
6117 continue;
6118 }
6119 if (0 < ch && ch < 256) {
6120 *output++ = (char)ch;
6121 ++p;
6122 continue;
6123 }
6124 /* All other characters are considered unencodable */
6125 collstart = p;
6126 collend = p+1;
6127 while (collend < end) {
6128 if ((0 < *collend && *collend < 256) ||
6129 !Py_UNICODE_ISSPACE(*collend) ||
6130 Py_UNICODE_TODECIMAL(*collend))
6131 break;
6132 }
6133 /* cache callback name lookup
6134 * (if not done yet, i.e. it's the first error) */
6135 if (known_errorHandler==-1) {
6136 if ((errors==NULL) || (!strcmp(errors, "strict")))
6137 known_errorHandler = 1;
6138 else if (!strcmp(errors, "replace"))
6139 known_errorHandler = 2;
6140 else if (!strcmp(errors, "ignore"))
6141 known_errorHandler = 3;
6142 else if (!strcmp(errors, "xmlcharrefreplace"))
6143 known_errorHandler = 4;
6144 else
6145 known_errorHandler = 0;
6146 }
6147 switch (known_errorHandler) {
6148 case 1: /* strict */
6149 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6150 goto onError;
6151 case 2: /* replace */
6152 for (p = collstart; p < collend; ++p)
6153 *output++ = '?';
6154 /* fall through */
6155 case 3: /* ignore */
6156 p = collend;
6157 break;
6158 case 4: /* xmlcharrefreplace */
6159 /* generate replacement (temporarily (mis)uses p) */
6160 for (p = collstart; p < collend; ++p)
6161 output += sprintf(output, "&#%d;", (int)*p);
6162 p = collend;
6163 break;
6164 default:
6165 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6166 encoding, reason, s, length, &exc,
6167 collstart-s, collend-s, &newpos);
6168 if (repunicode == NULL)
6169 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006170 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006171 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006172 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6173 Py_DECREF(repunicode);
6174 goto onError;
6175 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 /* generate replacement */
6177 repsize = PyUnicode_GET_SIZE(repunicode);
6178 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6179 Py_UNICODE ch = *uni2;
6180 if (Py_UNICODE_ISSPACE(ch))
6181 *output++ = ' ';
6182 else {
6183 decimal = Py_UNICODE_TODECIMAL(ch);
6184 if (decimal >= 0)
6185 *output++ = '0' + decimal;
6186 else if (0 < ch && ch < 256)
6187 *output++ = (char)ch;
6188 else {
6189 Py_DECREF(repunicode);
6190 raise_encode_exception(&exc, encoding,
6191 s, length, collstart-s, collend-s, reason);
6192 goto onError;
6193 }
6194 }
6195 }
6196 p = s + newpos;
6197 Py_DECREF(repunicode);
6198 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006199 }
6200 /* 0-terminate the output string */
6201 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202 Py_XDECREF(exc);
6203 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006204 return 0;
6205
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 Py_XDECREF(exc);
6208 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006209 return -1;
6210}
6211
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212/* --- Helpers ------------------------------------------------------------ */
6213
Eric Smith8c663262007-08-25 02:26:07 +00006214#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006215#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006216
Thomas Wouters477c8d52006-05-27 19:21:47 +00006217#include "stringlib/count.h"
6218#include "stringlib/find.h"
6219#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006220#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006221
Eric Smith5807c412008-05-11 21:00:57 +00006222#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006223#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006224#include "stringlib/localeutil.h"
6225
/* helper macro to fixup start/end slice values: clamp `end` to `len`,
   add `len` to negative indices, and clamp whatever is still negative
   to 0.  `start` and `end` must be modifiable lvalues; every argument is
   evaluated more than once.  Wrapped in do { } while (0) so that it
   behaves as a single statement, e.g. as the body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006240
Martin v. Löwis18e16552006-02-15 17:27:45 +00006241Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006242 PyObject *substr,
6243 Py_ssize_t start,
6244 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006246 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006247 PyUnicodeObject* str_obj;
6248 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006249
Thomas Wouters477c8d52006-05-27 19:21:47 +00006250 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6251 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006253 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6254 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 Py_DECREF(str_obj);
6256 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 }
Tim Petersced69f82003-09-16 20:30:58 +00006258
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006259 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006260 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006261 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6262 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006263 );
6264
6265 Py_DECREF(sub_obj);
6266 Py_DECREF(str_obj);
6267
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 return result;
6269}
6270
Martin v. Löwis18e16552006-02-15 17:27:45 +00006271Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006272 PyObject *sub,
6273 Py_ssize_t start,
6274 Py_ssize_t end,
6275 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006277 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006278
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006280 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006282 sub = PyUnicode_FromObject(sub);
6283 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 Py_DECREF(str);
6285 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 }
Tim Petersced69f82003-09-16 20:30:58 +00006287
Thomas Wouters477c8d52006-05-27 19:21:47 +00006288 if (direction > 0)
6289 result = stringlib_find_slice(
6290 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6291 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6292 start, end
6293 );
6294 else
6295 result = stringlib_rfind_slice(
6296 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6297 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6298 start, end
6299 );
6300
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006302 Py_DECREF(sub);
6303
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 return result;
6305}
6306
Tim Petersced69f82003-09-16 20:30:58 +00006307static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 PyUnicodeObject *substring,
6310 Py_ssize_t start,
6311 Py_ssize_t end,
6312 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 if (substring->length == 0)
6315 return 1;
6316
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006317 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 end -= substring->length;
6319 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321
6322 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 if (Py_UNICODE_MATCH(self, end, substring))
6324 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 } else {
6326 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 }
6329
6330 return 0;
6331}
6332
Martin v. Löwis18e16552006-02-15 17:27:45 +00006333Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 PyObject *substr,
6335 Py_ssize_t start,
6336 Py_ssize_t end,
6337 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006339 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006340
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 str = PyUnicode_FromObject(str);
6342 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 substr = PyUnicode_FromObject(substr);
6345 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 Py_DECREF(str);
6347 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 }
Tim Petersced69f82003-09-16 20:30:58 +00006349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 (PyUnicodeObject *)substr,
6352 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 Py_DECREF(str);
6354 Py_DECREF(substr);
6355 return result;
6356}
6357
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358/* Apply fixfct filter to the Unicode object self and return a
6359 reference to the modified object */
6360
Tim Petersced69f82003-09-16 20:30:58 +00006361static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364{
6365
6366 PyUnicodeObject *u;
6367
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006368 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006371
6372 Py_UNICODE_COPY(u->str, self->str, self->length);
6373
Tim Peters7a29bd52001-09-12 03:03:31 +00006374 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 /* fixfct should return TRUE if it modified the buffer. If
6376 FALSE, return a reference to the original buffer instead
6377 (to save space, not time) */
6378 Py_INCREF(self);
6379 Py_DECREF(u);
6380 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 }
6382 return (PyObject*) u;
6383}
6384
Tim Petersced69f82003-09-16 20:30:58 +00006385static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386int fixupper(PyUnicodeObject *self)
6387{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006388 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 Py_UNICODE *s = self->str;
6390 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006391
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006394
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 ch = Py_UNICODE_TOUPPER(*s);
6396 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 *s = ch;
6399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 s++;
6401 }
6402
6403 return status;
6404}
6405
Tim Petersced69f82003-09-16 20:30:58 +00006406static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407int fixlower(PyUnicodeObject *self)
6408{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006409 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 Py_UNICODE *s = self->str;
6411 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006412
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006415
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 ch = Py_UNICODE_TOLOWER(*s);
6417 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 *s = ch;
6420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 s++;
6422 }
6423
6424 return status;
6425}
6426
Tim Petersced69f82003-09-16 20:30:58 +00006427static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428int fixswapcase(PyUnicodeObject *self)
6429{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006430 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 Py_UNICODE *s = self->str;
6432 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006433
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 while (len-- > 0) {
6435 if (Py_UNICODE_ISUPPER(*s)) {
6436 *s = Py_UNICODE_TOLOWER(*s);
6437 status = 1;
6438 } else if (Py_UNICODE_ISLOWER(*s)) {
6439 *s = Py_UNICODE_TOUPPER(*s);
6440 status = 1;
6441 }
6442 s++;
6443 }
6444
6445 return status;
6446}
6447
Tim Petersced69f82003-09-16 20:30:58 +00006448static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449int fixcapitalize(PyUnicodeObject *self)
6450{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006451 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006452 Py_UNICODE *s = self->str;
6453 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006454
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006455 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006457 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 *s = Py_UNICODE_TOUPPER(*s);
6459 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006461 s++;
6462 while (--len > 0) {
6463 if (Py_UNICODE_ISUPPER(*s)) {
6464 *s = Py_UNICODE_TOLOWER(*s);
6465 status = 1;
6466 }
6467 s++;
6468 }
6469 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470}
6471
6472static
6473int fixtitle(PyUnicodeObject *self)
6474{
6475 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6476 register Py_UNICODE *e;
6477 int previous_is_cased;
6478
6479 /* Shortcut for single character strings */
6480 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6482 if (*p != ch) {
6483 *p = ch;
6484 return 1;
6485 }
6486 else
6487 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 }
Tim Petersced69f82003-09-16 20:30:58 +00006489
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 e = p + PyUnicode_GET_SIZE(self);
6491 previous_is_cased = 0;
6492 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006494
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 if (previous_is_cased)
6496 *p = Py_UNICODE_TOLOWER(ch);
6497 else
6498 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006499
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 if (Py_UNICODE_ISLOWER(ch) ||
6501 Py_UNICODE_ISUPPER(ch) ||
6502 Py_UNICODE_ISTITLE(ch))
6503 previous_is_cased = 1;
6504 else
6505 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 }
6507 return 1;
6508}
6509
Tim Peters8ce9f162004-08-27 01:49:32 +00006510PyObject *
6511PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512{
Skip Montanaro6543b452004-09-16 03:28:13 +00006513 const Py_UNICODE blank = ' ';
6514 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006515 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006516 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006517 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6518 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006519 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6520 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006521 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006522 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523
Tim Peters05eba1f2004-08-27 21:32:02 +00006524 fseq = PySequence_Fast(seq, "");
6525 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006526 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006527 }
6528
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006529 /* NOTE: the following code can't call back into Python code,
6530 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006531 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006532
Tim Peters05eba1f2004-08-27 21:32:02 +00006533 seqlen = PySequence_Fast_GET_SIZE(fseq);
6534 /* If empty sequence, return u"". */
6535 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006536 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6537 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006538 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006539 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006540 /* If singleton sequence with an exact Unicode, return that. */
6541 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 item = items[0];
6543 if (PyUnicode_CheckExact(item)) {
6544 Py_INCREF(item);
6545 res = (PyUnicodeObject *)item;
6546 goto Done;
6547 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006548 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006549 else {
6550 /* Set up sep and seplen */
6551 if (separator == NULL) {
6552 sep = &blank;
6553 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006554 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006555 else {
6556 if (!PyUnicode_Check(separator)) {
6557 PyErr_Format(PyExc_TypeError,
6558 "separator: expected str instance,"
6559 " %.80s found",
6560 Py_TYPE(separator)->tp_name);
6561 goto onError;
6562 }
6563 sep = PyUnicode_AS_UNICODE(separator);
6564 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006565 }
6566 }
6567
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006568 /* There are at least two things to join, or else we have a subclass
6569 * of str in the sequence.
6570 * Do a pre-pass to figure out the total amount of space we'll
6571 * need (sz), and see whether all argument are strings.
6572 */
6573 sz = 0;
6574 for (i = 0; i < seqlen; i++) {
6575 const Py_ssize_t old_sz = sz;
6576 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 if (!PyUnicode_Check(item)) {
6578 PyErr_Format(PyExc_TypeError,
6579 "sequence item %zd: expected str instance,"
6580 " %.80s found",
6581 i, Py_TYPE(item)->tp_name);
6582 goto onError;
6583 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006584 sz += PyUnicode_GET_SIZE(item);
6585 if (i != 0)
6586 sz += seplen;
6587 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6588 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006590 goto onError;
6591 }
6592 }
Tim Petersced69f82003-09-16 20:30:58 +00006593
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006594 res = _PyUnicode_New(sz);
6595 if (res == NULL)
6596 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006597
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006598 /* Catenate everything. */
6599 res_p = PyUnicode_AS_UNICODE(res);
6600 for (i = 0; i < seqlen; ++i) {
6601 Py_ssize_t itemlen;
6602 item = items[i];
6603 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 /* Copy item, and maybe the separator. */
6605 if (i) {
6606 Py_UNICODE_COPY(res_p, sep, seplen);
6607 res_p += seplen;
6608 }
6609 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6610 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006611 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006612
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006614 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 return (PyObject *)res;
6616
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006618 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006619 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 return NULL;
6621}
6622
Tim Petersced69f82003-09-16 20:30:58 +00006623static
6624PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 Py_ssize_t left,
6626 Py_ssize_t right,
6627 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
6629 PyUnicodeObject *u;
6630
6631 if (left < 0)
6632 left = 0;
6633 if (right < 0)
6634 right = 0;
6635
Tim Peters7a29bd52001-09-12 03:03:31 +00006636 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 Py_INCREF(self);
6638 return self;
6639 }
6640
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006641 if (left > PY_SSIZE_T_MAX - self->length ||
6642 right > PY_SSIZE_T_MAX - (left + self->length)) {
6643 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6644 return NULL;
6645 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 u = _PyUnicode_New(left + self->length + right);
6647 if (u) {
6648 if (left)
6649 Py_UNICODE_FILL(u->str, fill, left);
6650 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6651 if (right)
6652 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6653 }
6654
6655 return u;
6656}
6657
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006658PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661
6662 string = PyUnicode_FromObject(string);
6663 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006666 list = stringlib_splitlines(
6667 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6668 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669
6670 Py_DECREF(string);
6671 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672}
6673
Tim Petersced69f82003-09-16 20:30:58 +00006674static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 PyUnicodeObject *substring,
6677 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006680 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006683 return stringlib_split_whitespace(
6684 (PyObject*) self, self->str, self->length, maxcount
6685 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006687 return stringlib_split(
6688 (PyObject*) self, self->str, self->length,
6689 substring->str, substring->length,
6690 maxcount
6691 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692}
6693
Tim Petersced69f82003-09-16 20:30:58 +00006694static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006695PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 PyUnicodeObject *substring,
6697 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006698{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006699 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006700 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006701
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006702 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006703 return stringlib_rsplit_whitespace(
6704 (PyObject*) self, self->str, self->length, maxcount
6705 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006706
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006707 return stringlib_rsplit(
6708 (PyObject*) self, self->str, self->length,
6709 substring->str, substring->length,
6710 maxcount
6711 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006712}
6713
6714static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 PyUnicodeObject *str1,
6717 PyUnicodeObject *str2,
6718 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
6720 PyUnicodeObject *u;
6721
6722 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006724 else if (maxcount == 0 || self->length == 0)
6725 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726
Thomas Wouters477c8d52006-05-27 19:21:47 +00006727 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006728 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006729 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006730 if (str1->length == 0)
6731 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006732 if (str1->length == 1) {
6733 /* replace characters */
6734 Py_UNICODE u1, u2;
6735 if (!findchar(self->str, self->length, str1->str[0]))
6736 goto nothing;
6737 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6738 if (!u)
6739 return NULL;
6740 Py_UNICODE_COPY(u->str, self->str, self->length);
6741 u1 = str1->str[0];
6742 u2 = str2->str[0];
6743 for (i = 0; i < u->length; i++)
6744 if (u->str[i] == u1) {
6745 if (--maxcount < 0)
6746 break;
6747 u->str[i] = u2;
6748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006750 i = stringlib_find(
6751 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006753 if (i < 0)
6754 goto nothing;
6755 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6756 if (!u)
6757 return NULL;
6758 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006759
6760 /* change everything in-place, starting with this one */
6761 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6762 i += str1->length;
6763
6764 while ( --maxcount > 0) {
6765 i = stringlib_find(self->str+i, self->length-i,
6766 str1->str, str1->length,
6767 i);
6768 if (i == -1)
6769 break;
6770 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6771 i += str1->length;
6772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006775
6776 Py_ssize_t n, i, j, e;
6777 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 Py_UNICODE *p;
6779
6780 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006781 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6782 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006783 if (n == 0)
6784 goto nothing;
6785 /* new_size = self->length + n * (str2->length - str1->length)); */
6786 delta = (str2->length - str1->length);
6787 if (delta == 0) {
6788 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006790 product = n * (str2->length - str1->length);
6791 if ((product / (str2->length - str1->length)) != n) {
6792 PyErr_SetString(PyExc_OverflowError,
6793 "replace string is too long");
6794 return NULL;
6795 }
6796 new_size = self->length + product;
6797 if (new_size < 0) {
6798 PyErr_SetString(PyExc_OverflowError,
6799 "replace string is too long");
6800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801 }
6802 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006803 u = _PyUnicode_New(new_size);
6804 if (!u)
6805 return NULL;
6806 i = 0;
6807 p = u->str;
6808 e = self->length - str1->length;
6809 if (str1->length > 0) {
6810 while (n-- > 0) {
6811 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006812 j = stringlib_find(self->str+i, self->length-i,
6813 str1->str, str1->length,
6814 i);
6815 if (j == -1)
6816 break;
6817 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006818 /* copy unchanged part [i:j] */
6819 Py_UNICODE_COPY(p, self->str+i, j-i);
6820 p += j - i;
6821 }
6822 /* copy substitution string */
6823 if (str2->length > 0) {
6824 Py_UNICODE_COPY(p, str2->str, str2->length);
6825 p += str2->length;
6826 }
6827 i = j + str1->length;
6828 }
6829 if (i < self->length)
6830 /* copy tail [i:] */
6831 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6832 } else {
6833 /* interleave */
6834 while (n > 0) {
6835 Py_UNICODE_COPY(p, str2->str, str2->length);
6836 p += str2->length;
6837 if (--n <= 0)
6838 break;
6839 *p++ = self->str[i++];
6840 }
6841 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006845
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006847 /* nothing to replace; return original string (when possible) */
6848 if (PyUnicode_CheckExact(self)) {
6849 Py_INCREF(self);
6850 return (PyObject *) self;
6851 }
6852 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853}
6854
6855/* --- Unicode Object Methods --------------------------------------------- */
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859\n\
6860Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006861characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006864unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 return fixup(self, fixtitle);
6867}
6868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006869PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871\n\
6872Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006873have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874
6875static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006876unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 return fixup(self, fixcapitalize);
6879}
6880
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in the list in place, then join the words again
   with PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot with the new object */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6918
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006919/* Argument converter. Coerces to a single unicode character */
6920
6921static int
6922convert_uc(PyObject *obj, void *addr)
6923{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006924 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6925 PyObject *uniobj;
6926 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006927
Benjamin Peterson14339b62009-01-31 16:36:08 +00006928 uniobj = PyUnicode_FromObject(obj);
6929 if (uniobj == NULL) {
6930 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006932 return 0;
6933 }
6934 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6935 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006937 Py_DECREF(uniobj);
6938 return 0;
6939 }
6940 unistr = PyUnicode_AS_UNICODE(uniobj);
6941 *fillcharloc = unistr[0];
6942 Py_DECREF(uniobj);
6943 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006944}
6945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006946PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006949Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006950done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
6952static PyObject *
6953unicode_center(PyUnicodeObject *self, PyObject *args)
6954{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006955 Py_ssize_t marg, left;
6956 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006957 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958
Thomas Woutersde017742006-02-16 19:34:37 +00006959 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 return NULL;
6961
Tim Peters7a29bd52001-09-12 03:03:31 +00006962 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 Py_INCREF(self);
6964 return (PyObject*) self;
6965 }
6966
6967 marg = width - self->length;
6968 left = marg / 2 + (marg & width & 1);
6969
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006970 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971}
6972
Marc-André Lemburge5034372000-08-08 08:04:29 +00006973#if 0
6974
6975/* This code should go into some future Unicode collation support
6976 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006977 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006978
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006979/* speedy UTF-16 code point order comparison */
6980/* gleaned from: */
6981/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6982
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006983static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006984{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006985 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006986 0, 0, 0, 0, 0, 0, 0, 0,
6987 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006988 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006989};
6990
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991static int
6992unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6993{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006994 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006995
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 Py_UNICODE *s1 = str1->str;
6997 Py_UNICODE *s2 = str2->str;
6998
6999 len1 = str1->length;
7000 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007001
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007003 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007004
7005 c1 = *s1++;
7006 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007007
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 if (c1 > (1<<11) * 26)
7009 c1 += utf16Fixup[c1>>11];
7010 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007011 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007012 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007013
7014 if (c1 != c2)
7015 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007016
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007017 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 }
7019
7020 return (len1 < len2) ? -1 : (len1 != len2);
7021}
7022
Marc-André Lemburge5034372000-08-08 08:04:29 +00007023#else
7024
7025static int
7026unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7027{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007028 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007029
7030 Py_UNICODE *s1 = str1->str;
7031 Py_UNICODE *s2 = str2->str;
7032
7033 len1 = str1->length;
7034 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007035
Marc-André Lemburge5034372000-08-08 08:04:29 +00007036 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007037 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007038
Fredrik Lundh45714e92001-06-26 16:39:36 +00007039 c1 = *s1++;
7040 c2 = *s2++;
7041
7042 if (c1 != c2)
7043 return (c1 < c2) ? -1 : 1;
7044
Marc-André Lemburge5034372000-08-08 08:04:29 +00007045 len1--; len2--;
7046 }
7047
7048 return (len1 < len2) ? -1 : (len1 != len2);
7049}
7050
7051#endif
7052
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007054 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007056 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7057 return unicode_compare((PyUnicodeObject *)left,
7058 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007059 PyErr_Format(PyExc_TypeError,
7060 "Can't compare %.100s and %.100s",
7061 left->ob_type->tp_name,
7062 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 return -1;
7064}
7065
Martin v. Löwis5b222132007-06-10 09:51:05 +00007066int
7067PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7068{
7069 int i;
7070 Py_UNICODE *id;
7071 assert(PyUnicode_Check(uni));
7072 id = PyUnicode_AS_UNICODE(uni);
7073 /* Compare Unicode string and source character set string */
7074 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 if (id[i] != str[i])
7076 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007077 /* This check keeps Python strings that end in '\0' from comparing equal
7078 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007079 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007081 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007083 return 0;
7084}
7085
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007086
Benjamin Peterson29060642009-01-31 22:14:21 +00007087#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007088 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007089
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007090PyObject *PyUnicode_RichCompare(PyObject *left,
7091 PyObject *right,
7092 int op)
7093{
7094 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007095
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007096 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7097 PyObject *v;
7098 if (((PyUnicodeObject *) left)->length !=
7099 ((PyUnicodeObject *) right)->length) {
7100 if (op == Py_EQ) {
7101 Py_INCREF(Py_False);
7102 return Py_False;
7103 }
7104 if (op == Py_NE) {
7105 Py_INCREF(Py_True);
7106 return Py_True;
7107 }
7108 }
7109 if (left == right)
7110 result = 0;
7111 else
7112 result = unicode_compare((PyUnicodeObject *)left,
7113 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007114
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007115 /* Convert the return value to a Boolean */
7116 switch (op) {
7117 case Py_EQ:
7118 v = TEST_COND(result == 0);
7119 break;
7120 case Py_NE:
7121 v = TEST_COND(result != 0);
7122 break;
7123 case Py_LE:
7124 v = TEST_COND(result <= 0);
7125 break;
7126 case Py_GE:
7127 v = TEST_COND(result >= 0);
7128 break;
7129 case Py_LT:
7130 v = TEST_COND(result == -1);
7131 break;
7132 case Py_GT:
7133 v = TEST_COND(result == 1);
7134 break;
7135 default:
7136 PyErr_BadArgument();
7137 return NULL;
7138 }
7139 Py_INCREF(v);
7140 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007142
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007143 Py_INCREF(Py_NotImplemented);
7144 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007145}
7146
Guido van Rossum403d68b2000-03-13 15:55:09 +00007147int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007149{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007150 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007151 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007152
7153 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007154 sub = PyUnicode_FromObject(element);
7155 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 PyErr_Format(PyExc_TypeError,
7157 "'in <string>' requires string as left operand, not %s",
7158 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007159 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007160 }
7161
Thomas Wouters477c8d52006-05-27 19:21:47 +00007162 str = PyUnicode_FromObject(container);
7163 if (!str) {
7164 Py_DECREF(sub);
7165 return -1;
7166 }
7167
7168 result = stringlib_contains_obj(str, sub);
7169
7170 Py_DECREF(str);
7171 Py_DECREF(sub);
7172
Guido van Rossum403d68b2000-03-13 15:55:09 +00007173 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007174}
7175
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176/* Concat to string or Unicode object giving a new Unicode object. */
7177
7178PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180{
7181 PyUnicodeObject *u = NULL, *v = NULL, *w;
7182
7183 /* Coerce the two arguments */
7184 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7185 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7188 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007189 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190
7191 /* Shortcuts */
7192 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 Py_DECREF(v);
7194 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 }
7196 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 Py_DECREF(u);
7198 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 }
7200
7201 /* Concat the two Unicode strings */
7202 w = _PyUnicode_New(u->length + v->length);
7203 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 Py_UNICODE_COPY(w->str, u->str, u->length);
7206 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7207
7208 Py_DECREF(u);
7209 Py_DECREF(v);
7210 return (PyObject *)w;
7211
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 Py_XDECREF(u);
7214 Py_XDECREF(v);
7215 return NULL;
7216}
7217
Walter Dörwald1ab83302007-05-18 17:15:44 +00007218void
7219PyUnicode_Append(PyObject **pleft, PyObject *right)
7220{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007221 PyObject *new;
7222 if (*pleft == NULL)
7223 return;
7224 if (right == NULL || !PyUnicode_Check(*pleft)) {
7225 Py_DECREF(*pleft);
7226 *pleft = NULL;
7227 return;
7228 }
7229 new = PyUnicode_Concat(*pleft, right);
7230 Py_DECREF(*pleft);
7231 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007232}
7233
7234void
7235PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7236{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007237 PyUnicode_Append(pleft, right);
7238 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007239}
7240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007241PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007244Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007245string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007246interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247
7248static PyObject *
7249unicode_count(PyUnicodeObject *self, PyObject *args)
7250{
7251 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007252 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007253 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254 PyObject *result;
7255
Guido van Rossumb8872e62000-05-09 14:14:27 +00007256 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258 return NULL;
7259
7260 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007261 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007264
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007265 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007266 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007267 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007268 substring->str, substring->length,
7269 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007270 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
7272 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007273
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 return result;
7275}
7276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007277PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007280Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007281to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007282handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007283a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7284'xmlcharrefreplace' as well as any other name registered with\n\
7285codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286
7287static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007288unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007290 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291 char *encoding = NULL;
7292 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007293 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007294
Benjamin Peterson308d6372009-09-18 21:42:35 +00007295 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7296 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007298 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007299 if (v == NULL)
7300 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007301 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007302 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007303 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007304 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007305 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007306 Py_DECREF(v);
7307 return NULL;
7308 }
7309 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007310
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007312 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007313}
7314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007315PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317\n\
7318Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007319If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320
7321static PyObject*
7322unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7323{
7324 Py_UNICODE *e;
7325 Py_UNICODE *p;
7326 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007327 Py_UNICODE *qe;
7328 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 PyUnicodeObject *u;
7330 int tabsize = 8;
7331
7332 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334
Thomas Wouters7e474022000-07-16 12:04:32 +00007335 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007336 i = 0; /* chars up to and including most recent \n or \r */
7337 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7338 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 for (p = self->str; p < e; p++)
7340 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 if (tabsize > 0) {
7342 incr = tabsize - (j % tabsize); /* cannot overflow */
7343 if (j > PY_SSIZE_T_MAX - incr)
7344 goto overflow1;
7345 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007346 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 if (j > PY_SSIZE_T_MAX - 1)
7350 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 j++;
7352 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 if (i > PY_SSIZE_T_MAX - j)
7354 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007356 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 }
7358 }
7359
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007360 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007362
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 /* Second pass: create output string and fill it */
7364 u = _PyUnicode_New(i + j);
7365 if (!u)
7366 return NULL;
7367
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007368 j = 0; /* same as in first pass */
7369 q = u->str; /* next output char */
7370 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371
7372 for (p = self->str; p < e; p++)
7373 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 if (tabsize > 0) {
7375 i = tabsize - (j % tabsize);
7376 j += i;
7377 while (i--) {
7378 if (q >= qe)
7379 goto overflow2;
7380 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007381 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007383 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 else {
7385 if (q >= qe)
7386 goto overflow2;
7387 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007388 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 if (*p == '\n' || *p == '\r')
7390 j = 0;
7391 }
7392
7393 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007394
7395 overflow2:
7396 Py_DECREF(u);
7397 overflow1:
7398 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400}
7401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007402PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404\n\
7405Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007406such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407arguments start and end are interpreted as in slice notation.\n\
7408\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007409Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
7411static PyObject *
7412unicode_find(PyUnicodeObject *self, PyObject *args)
7413{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007414 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007415 Py_ssize_t start;
7416 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007417 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
Christian Heimes9cd17752007-11-18 19:35:23 +00007419 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
Thomas Wouters477c8d52006-05-27 19:21:47 +00007422 result = stringlib_find_slice(
7423 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7424 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7425 start, end
7426 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427
7428 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007429
Christian Heimes217cfd12007-12-02 14:31:20 +00007430 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431}
7432
7433static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007434unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435{
7436 if (index < 0 || index >= self->length) {
7437 PyErr_SetString(PyExc_IndexError, "string index out of range");
7438 return NULL;
7439 }
7440
7441 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7442}
7443
Guido van Rossumc2504932007-09-18 19:42:40 +00007444/* Believe it or not, this produces the same value for ASCII strings
7445 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007447unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448{
Guido van Rossumc2504932007-09-18 19:42:40 +00007449 Py_ssize_t len;
7450 Py_UNICODE *p;
7451 long x;
7452
7453 if (self->hash != -1)
7454 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007455 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007456 p = self->str;
7457 x = *p << 7;
7458 while (--len >= 0)
7459 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007460 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007461 if (x == -1)
7462 x = -2;
7463 self->hash = x;
7464 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465}
7466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007467PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
7472static PyObject *
7473unicode_index(PyUnicodeObject *self, PyObject *args)
7474{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007475 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007476 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007477 Py_ssize_t start;
7478 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
Christian Heimes9cd17752007-11-18 19:35:23 +00007480 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
Thomas Wouters477c8d52006-05-27 19:21:47 +00007483 result = stringlib_find_slice(
7484 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7485 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7486 start, end
7487 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488
7489 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007490
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 if (result < 0) {
7492 PyErr_SetString(PyExc_ValueError, "substring not found");
7493 return NULL;
7494 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007495
Christian Heimes217cfd12007-12-02 14:31:20 +00007496 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497}
7498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007499PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007502Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504
7505static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007506unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507{
7508 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7509 register const Py_UNICODE *e;
7510 int cased;
7511
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 /* Shortcut for single character strings */
7513 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007516 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007517 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007519
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 e = p + PyUnicode_GET_SIZE(self);
7521 cased = 0;
7522 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007524
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7526 return PyBool_FromLong(0);
7527 else if (!cased && Py_UNICODE_ISLOWER(ch))
7528 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007530 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531}
7532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007533PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007536Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007537at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538
7539static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007540unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541{
7542 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7543 register const Py_UNICODE *e;
7544 int cased;
7545
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 /* Shortcut for single character strings */
7547 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007550 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007551 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007553
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 e = p + PyUnicode_GET_SIZE(self);
7555 cased = 0;
7556 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007558
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7560 return PyBool_FromLong(0);
7561 else if (!cased && Py_UNICODE_ISUPPER(ch))
7562 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007564 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565}
7566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007567PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007570Return True if S is a titlecased string and there is at least one\n\
7571character in S, i.e. upper- and titlecase characters may only\n\
7572follow uncased characters and lowercase characters only cased ones.\n\
7573Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574
7575static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007576unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577{
7578 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7579 register const Py_UNICODE *e;
7580 int cased, previous_is_cased;
7581
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 /* Shortcut for single character strings */
7583 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7585 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007587 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007588 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007590
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 e = p + PyUnicode_GET_SIZE(self);
7592 cased = 0;
7593 previous_is_cased = 0;
7594 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007596
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7598 if (previous_is_cased)
7599 return PyBool_FromLong(0);
7600 previous_is_cased = 1;
7601 cased = 1;
7602 }
7603 else if (Py_UNICODE_ISLOWER(ch)) {
7604 if (!previous_is_cased)
7605 return PyBool_FromLong(0);
7606 previous_is_cased = 1;
7607 cased = 1;
7608 }
7609 else
7610 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007612 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613}
7614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007618Return True if all characters in S are whitespace\n\
7619and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620
7621static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007622unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623{
7624 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7625 register const Py_UNICODE *e;
7626
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627 /* Shortcut for single character strings */
7628 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 Py_UNICODE_ISSPACE(*p))
7630 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007632 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007633 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007635
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 e = p + PyUnicode_GET_SIZE(self);
7637 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 if (!Py_UNICODE_ISSPACE(*p))
7639 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007641 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642}
7643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007644PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007646\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007647Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007648and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007649
7650static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007651unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007652{
7653 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7654 register const Py_UNICODE *e;
7655
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007656 /* Shortcut for single character strings */
7657 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 Py_UNICODE_ISALPHA(*p))
7659 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007660
7661 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007662 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007664
7665 e = p + PyUnicode_GET_SIZE(self);
7666 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 if (!Py_UNICODE_ISALPHA(*p))
7668 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007669 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007670 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007671}
7672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007673PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007675\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007676Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007677and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007678
7679static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007680unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007681{
7682 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7683 register const Py_UNICODE *e;
7684
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007685 /* Shortcut for single character strings */
7686 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 Py_UNICODE_ISALNUM(*p))
7688 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007689
7690 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007691 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007693
7694 e = p + PyUnicode_GET_SIZE(self);
7695 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 if (!Py_UNICODE_ISALNUM(*p))
7697 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007698 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007699 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007700}
7701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007702PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007705Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007706False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707
7708static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007709unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710{
7711 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7712 register const Py_UNICODE *e;
7713
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714 /* Shortcut for single character strings */
7715 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007716 Py_UNICODE_ISDECIMAL(*p))
7717 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007719 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007720 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007722
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 e = p + PyUnicode_GET_SIZE(self);
7724 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 if (!Py_UNICODE_ISDECIMAL(*p))
7726 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007728 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729}
7730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007731PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007734Return True if all characters in S are digits\n\
7735and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
7737static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007738unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739{
7740 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7741 register const Py_UNICODE *e;
7742
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 /* Shortcut for single character strings */
7744 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 Py_UNICODE_ISDIGIT(*p))
7746 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007748 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007749 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007751
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 e = p + PyUnicode_GET_SIZE(self);
7753 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 if (!Py_UNICODE_ISDIGIT(*p))
7755 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007757 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758}
7759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007763Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007764False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765
7766static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007767unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768{
7769 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7770 register const Py_UNICODE *e;
7771
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 /* Shortcut for single character strings */
7773 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 Py_UNICODE_ISNUMERIC(*p))
7775 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007777 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007778 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007780
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781 e = p + PyUnicode_GET_SIZE(self);
7782 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 if (!Py_UNICODE_ISNUMERIC(*p))
7784 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007786 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787}
7788
Martin v. Löwis47383402007-08-15 07:32:56 +00007789int
7790PyUnicode_IsIdentifier(PyObject *self)
7791{
7792 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7793 register const Py_UNICODE *e;
7794
7795 /* Special case for empty strings */
7796 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007798
7799 /* PEP 3131 says that the first character must be in
7800 XID_Start and subsequent characters in XID_Continue,
7801 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007802 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007803 letters, digits, underscore). However, given the current
7804 definition of XID_Start and XID_Continue, it is sufficient
7805 to check just for these, except that _ must be allowed
7806 as starting an identifier. */
7807 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7808 return 0;
7809
7810 e = p + PyUnicode_GET_SIZE(self);
7811 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 if (!_PyUnicode_IsXidContinue(*p))
7813 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007814 }
7815 return 1;
7816}
7817
7818PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007820\n\
7821Return True if S is a valid identifier according\n\
7822to the language definition.");
7823
7824static PyObject*
7825unicode_isidentifier(PyObject *self)
7826{
7827 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7828}
7829
Georg Brandl559e5d72008-06-11 18:37:52 +00007830PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007832\n\
7833Return True if all characters in S are considered\n\
7834printable in repr() or S is empty, False otherwise.");
7835
7836static PyObject*
7837unicode_isprintable(PyObject *self)
7838{
7839 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7840 register const Py_UNICODE *e;
7841
7842 /* Shortcut for single character strings */
7843 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7844 Py_RETURN_TRUE;
7845 }
7846
7847 e = p + PyUnicode_GET_SIZE(self);
7848 for (; p < e; p++) {
7849 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7850 Py_RETURN_FALSE;
7851 }
7852 }
7853 Py_RETURN_TRUE;
7854}
7855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007856PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007857 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858\n\
7859Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007860iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861
7862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007863unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007865 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866}
7867
Martin v. Löwis18e16552006-02-15 17:27:45 +00007868static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869unicode_length(PyUnicodeObject *self)
7870{
7871 return self->length;
7872}
7873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007874PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007877Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007878done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879
7880static PyObject *
7881unicode_ljust(PyUnicodeObject *self, PyObject *args)
7882{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007883 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007884 Py_UNICODE fillchar = ' ';
7885
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007886 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 return NULL;
7888
Tim Peters7a29bd52001-09-12 03:03:31 +00007889 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 Py_INCREF(self);
7891 return (PyObject*) self;
7892 }
7893
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007894 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895}
7896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007897PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007900Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901
7902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007903unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 return fixup(self, fixlower);
7906}
7907
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007908#define LEFTSTRIP 0
7909#define RIGHTSTRIP 1
7910#define BOTHSTRIP 2
7911
7912/* Arrays indexed by above */
7913static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
7914
7915#define STRIPNAME(i) (stripformat[i]+3)
7916
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007917/* externally visible for str.strip(unicode) */
7918PyObject *
7919_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7920{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007921 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7922 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7923 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7924 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7925 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007926
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007928
Benjamin Peterson14339b62009-01-31 16:36:08 +00007929 i = 0;
7930 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7932 i++;
7933 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007934 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007935
Benjamin Peterson14339b62009-01-31 16:36:08 +00007936 j = len;
7937 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 do {
7939 j--;
7940 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7941 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007942 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007943
Benjamin Peterson14339b62009-01-31 16:36:08 +00007944 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 Py_INCREF(self);
7946 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007947 }
7948 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007950}
7951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952
7953static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007954do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007956 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7957 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007958
Benjamin Peterson14339b62009-01-31 16:36:08 +00007959 i = 0;
7960 if (striptype != RIGHTSTRIP) {
7961 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7962 i++;
7963 }
7964 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007965
Benjamin Peterson14339b62009-01-31 16:36:08 +00007966 j = len;
7967 if (striptype != LEFTSTRIP) {
7968 do {
7969 j--;
7970 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7971 j++;
7972 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007973
Benjamin Peterson14339b62009-01-31 16:36:08 +00007974 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7975 Py_INCREF(self);
7976 return (PyObject*)self;
7977 }
7978 else
7979 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980}
7981
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007982
7983static PyObject *
7984do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7985{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007987
Benjamin Peterson14339b62009-01-31 16:36:08 +00007988 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7989 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007990
Benjamin Peterson14339b62009-01-31 16:36:08 +00007991 if (sep != NULL && sep != Py_None) {
7992 if (PyUnicode_Check(sep))
7993 return _PyUnicode_XStrip(self, striptype, sep);
7994 else {
7995 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 "%s arg must be None or str",
7997 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007998 return NULL;
7999 }
8000 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008001
Benjamin Peterson14339b62009-01-31 16:36:08 +00008002 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008003}
8004
8005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008006PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008008\n\
8009Return a copy of the string S with leading and trailing\n\
8010whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008011If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008012
8013static PyObject *
8014unicode_strip(PyUnicodeObject *self, PyObject *args)
8015{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 if (PyTuple_GET_SIZE(args) == 0)
8017 return do_strip(self, BOTHSTRIP); /* Common case */
8018 else
8019 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008020}
8021
8022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008023PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008025\n\
8026Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008027If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008028
8029static PyObject *
8030unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8031{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008032 if (PyTuple_GET_SIZE(args) == 0)
8033 return do_strip(self, LEFTSTRIP); /* Common case */
8034 else
8035 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008036}
8037
8038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008039PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008041\n\
8042Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008043If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008044
8045static PyObject *
8046unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8047{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008048 if (PyTuple_GET_SIZE(args) == 0)
8049 return do_strip(self, RIGHTSTRIP); /* Common case */
8050 else
8051 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008052}
8053
8054
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008056unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057{
8058 PyUnicodeObject *u;
8059 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008060 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008061 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062
Georg Brandl222de0f2009-04-12 12:01:50 +00008063 if (len < 1) {
8064 Py_INCREF(unicode_empty);
8065 return (PyObject *)unicode_empty;
8066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067
Tim Peters7a29bd52001-09-12 03:03:31 +00008068 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 /* no repeat, return original string */
8070 Py_INCREF(str);
8071 return (PyObject*) str;
8072 }
Tim Peters8f422462000-09-09 06:13:41 +00008073
8074 /* ensure # of chars needed doesn't overflow int and # of bytes
8075 * needed doesn't overflow size_t
8076 */
8077 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008078 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008079 PyErr_SetString(PyExc_OverflowError,
8080 "repeated string is too long");
8081 return NULL;
8082 }
8083 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8084 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8085 PyErr_SetString(PyExc_OverflowError,
8086 "repeated string is too long");
8087 return NULL;
8088 }
8089 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 if (!u)
8091 return NULL;
8092
8093 p = u->str;
8094
Georg Brandl222de0f2009-04-12 12:01:50 +00008095 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008096 Py_UNICODE_FILL(p, str->str[0], len);
8097 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008098 Py_ssize_t done = str->length; /* number of characters copied this far */
8099 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008101 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008102 Py_UNICODE_COPY(p+done, p, n);
8103 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 }
8106
8107 return (PyObject*) u;
8108}
8109
8110PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 PyObject *subobj,
8112 PyObject *replobj,
8113 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114{
8115 PyObject *self;
8116 PyObject *str1;
8117 PyObject *str2;
8118 PyObject *result;
8119
8120 self = PyUnicode_FromObject(obj);
8121 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 str1 = PyUnicode_FromObject(subobj);
8124 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 Py_DECREF(self);
8126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127 }
8128 str2 = PyUnicode_FromObject(replobj);
8129 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 Py_DECREF(self);
8131 Py_DECREF(str1);
8132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 }
Tim Petersced69f82003-09-16 20:30:58 +00008134 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 (PyUnicodeObject *)str1,
8136 (PyUnicodeObject *)str2,
8137 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 Py_DECREF(self);
8139 Py_DECREF(str1);
8140 Py_DECREF(str2);
8141 return result;
8142}
8143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008144PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008145 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146\n\
8147Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008148old replaced by new. If the optional argument count is\n\
8149given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150
8151static PyObject*
8152unicode_replace(PyUnicodeObject *self, PyObject *args)
8153{
8154 PyUnicodeObject *str1;
8155 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 PyObject *result;
8158
Martin v. Löwis18e16552006-02-15 17:27:45 +00008159 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 return NULL;
8161 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8162 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008165 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 Py_DECREF(str1);
8167 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
8170 result = replace(self, str1, str2, maxcount);
8171
8172 Py_DECREF(str1);
8173 Py_DECREF(str2);
8174 return result;
8175}
8176
8177static
8178PyObject *unicode_repr(PyObject *unicode)
8179{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008180 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008181 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008182 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8183 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8184
8185 /* XXX(nnorwitz): rather than over-allocating, it would be
8186 better to choose a different scheme. Perhaps scan the
8187 first N-chars of the string and allocate based on that size.
8188 */
8189 /* Initial allocation is based on the longest-possible unichr
8190 escape.
8191
8192 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8193 unichr, so in this case it's the longest unichr escape. In
8194 narrow (UTF-16) builds this is five chars per source unichr
8195 since there are two unichrs in the surrogate pair, so in narrow
8196 (UTF-16) builds it's not the longest unichr escape.
8197
8198 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8199 so in the narrow (UTF-16) build case it's the longest unichr
8200 escape.
8201 */
8202
Walter Dörwald1ab83302007-05-18 17:15:44 +00008203 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008205#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008207#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008209#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008211 if (repr == NULL)
8212 return NULL;
8213
Walter Dörwald1ab83302007-05-18 17:15:44 +00008214 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008215
8216 /* Add quote */
8217 *p++ = (findchar(s, size, '\'') &&
8218 !findchar(s, size, '"')) ? '"' : '\'';
8219 while (size-- > 0) {
8220 Py_UNICODE ch = *s++;
8221
8222 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008223 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008224 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008225 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008226 continue;
8227 }
8228
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008230 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008231 *p++ = '\\';
8232 *p++ = 't';
8233 }
8234 else if (ch == '\n') {
8235 *p++ = '\\';
8236 *p++ = 'n';
8237 }
8238 else if (ch == '\r') {
8239 *p++ = '\\';
8240 *p++ = 'r';
8241 }
8242
8243 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008244 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008245 *p++ = '\\';
8246 *p++ = 'x';
8247 *p++ = hexdigits[(ch >> 4) & 0x000F];
8248 *p++ = hexdigits[ch & 0x000F];
8249 }
8250
Georg Brandl559e5d72008-06-11 18:37:52 +00008251 /* Copy ASCII characters as-is */
8252 else if (ch < 0x7F) {
8253 *p++ = ch;
8254 }
8255
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008257 else {
8258 Py_UCS4 ucs = ch;
8259
8260#ifndef Py_UNICODE_WIDE
8261 Py_UNICODE ch2 = 0;
8262 /* Get code point from surrogate pair */
8263 if (size > 0) {
8264 ch2 = *s;
8265 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008267 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008269 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008270 size--;
8271 }
8272 }
8273#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008274 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008275 (categories Z* and C* except ASCII space)
8276 */
8277 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8278 /* Map 8-bit characters to '\xhh' */
8279 if (ucs <= 0xff) {
8280 *p++ = '\\';
8281 *p++ = 'x';
8282 *p++ = hexdigits[(ch >> 4) & 0x000F];
8283 *p++ = hexdigits[ch & 0x000F];
8284 }
8285 /* Map 21-bit characters to '\U00xxxxxx' */
8286 else if (ucs >= 0x10000) {
8287 *p++ = '\\';
8288 *p++ = 'U';
8289 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8290 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8291 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8292 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8293 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8294 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8295 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8296 *p++ = hexdigits[ucs & 0x0000000F];
8297 }
8298 /* Map 16-bit characters to '\uxxxx' */
8299 else {
8300 *p++ = '\\';
8301 *p++ = 'u';
8302 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8303 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8304 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8305 *p++ = hexdigits[ucs & 0x000F];
8306 }
8307 }
8308 /* Copy characters as-is */
8309 else {
8310 *p++ = ch;
8311#ifndef Py_UNICODE_WIDE
8312 if (ucs >= 0x10000)
8313 *p++ = ch2;
8314#endif
8315 }
8316 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008317 }
8318 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008319 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008320
8321 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008322 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008323 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324}
8325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008326PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328\n\
8329Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008330such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331arguments start and end are interpreted as in slice notation.\n\
8332\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008333Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334
8335static PyObject *
8336unicode_rfind(PyUnicodeObject *self, PyObject *args)
8337{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008338 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008339 Py_ssize_t start;
8340 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008341 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342
Christian Heimes9cd17752007-11-18 19:35:23 +00008343 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345
Thomas Wouters477c8d52006-05-27 19:21:47 +00008346 result = stringlib_rfind_slice(
8347 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8348 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8349 start, end
8350 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351
8352 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008353
Christian Heimes217cfd12007-12-02 14:31:20 +00008354 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355}
8356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008357PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008360Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361
8362static PyObject *
8363unicode_rindex(PyUnicodeObject *self, PyObject *args)
8364{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008365 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008366 Py_ssize_t start;
8367 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008368 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008369
Christian Heimes9cd17752007-11-18 19:35:23 +00008370 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372
Thomas Wouters477c8d52006-05-27 19:21:47 +00008373 result = stringlib_rfind_slice(
8374 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8375 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8376 start, end
8377 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378
8379 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008380
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 if (result < 0) {
8382 PyErr_SetString(PyExc_ValueError, "substring not found");
8383 return NULL;
8384 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008385 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386}
8387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008388PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008391Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008392done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393
8394static PyObject *
8395unicode_rjust(PyUnicodeObject *self, PyObject *args)
8396{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008397 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008398 Py_UNICODE fillchar = ' ';
8399
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008400 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401 return NULL;
8402
Tim Peters7a29bd52001-09-12 03:03:31 +00008403 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 Py_INCREF(self);
8405 return (PyObject*) self;
8406 }
8407
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008408 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409}
8410
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 PyObject *sep,
8413 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414{
8415 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008416
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 s = PyUnicode_FromObject(s);
8418 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008419 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 if (sep != NULL) {
8421 sep = PyUnicode_FromObject(sep);
8422 if (sep == NULL) {
8423 Py_DECREF(s);
8424 return NULL;
8425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 }
8427
8428 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8429
8430 Py_DECREF(s);
8431 Py_XDECREF(sep);
8432 return result;
8433}
8434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008435PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437\n\
8438Return a list of the words in S, using sep as the\n\
8439delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008440splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008441whitespace string is a separator and empty strings are\n\
8442removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443
8444static PyObject*
8445unicode_split(PyUnicodeObject *self, PyObject *args)
8446{
8447 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008448 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 return NULL;
8452
8453 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459}
8460
Thomas Wouters477c8d52006-05-27 19:21:47 +00008461PyObject *
8462PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8463{
8464 PyObject* str_obj;
8465 PyObject* sep_obj;
8466 PyObject* out;
8467
8468 str_obj = PyUnicode_FromObject(str_in);
8469 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008471 sep_obj = PyUnicode_FromObject(sep_in);
8472 if (!sep_obj) {
8473 Py_DECREF(str_obj);
8474 return NULL;
8475 }
8476
8477 out = stringlib_partition(
8478 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8479 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8480 );
8481
8482 Py_DECREF(sep_obj);
8483 Py_DECREF(str_obj);
8484
8485 return out;
8486}
8487
8488
8489PyObject *
8490PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8491{
8492 PyObject* str_obj;
8493 PyObject* sep_obj;
8494 PyObject* out;
8495
8496 str_obj = PyUnicode_FromObject(str_in);
8497 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008499 sep_obj = PyUnicode_FromObject(sep_in);
8500 if (!sep_obj) {
8501 Py_DECREF(str_obj);
8502 return NULL;
8503 }
8504
8505 out = stringlib_rpartition(
8506 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8507 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8508 );
8509
8510 Py_DECREF(sep_obj);
8511 Py_DECREF(str_obj);
8512
8513 return out;
8514}
8515
8516PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008518\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008519Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008520the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008521found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008522
8523static PyObject*
8524unicode_partition(PyUnicodeObject *self, PyObject *separator)
8525{
8526 return PyUnicode_Partition((PyObject *)self, separator);
8527}
8528
8529PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008530 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008531\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008532Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008533the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008534separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008535
8536static PyObject*
8537unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8538{
8539 return PyUnicode_RPartition((PyObject *)self, separator);
8540}
8541
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008542PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 PyObject *sep,
8544 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008545{
8546 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008547
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008548 s = PyUnicode_FromObject(s);
8549 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008550 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 if (sep != NULL) {
8552 sep = PyUnicode_FromObject(sep);
8553 if (sep == NULL) {
8554 Py_DECREF(s);
8555 return NULL;
8556 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008557 }
8558
8559 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8560
8561 Py_DECREF(s);
8562 Py_XDECREF(sep);
8563 return result;
8564}
8565
8566PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008568\n\
8569Return a list of the words in S, using sep as the\n\
8570delimiter string, starting at the end of the string and\n\
8571working to the front. If maxsplit is given, at most maxsplit\n\
8572splits are done. If sep is not specified, any whitespace string\n\
8573is a separator.");
8574
8575static PyObject*
8576unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8577{
8578 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008579 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008580
Martin v. Löwis18e16552006-02-15 17:27:45 +00008581 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008582 return NULL;
8583
8584 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008586 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008588 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008590}
8591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008592PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594\n\
8595Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008596Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008597is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598
8599static PyObject*
8600unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8601{
Guido van Rossum86662912000-04-11 15:38:46 +00008602 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603
Guido van Rossum86662912000-04-11 15:38:46 +00008604 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 return NULL;
8606
Guido van Rossum86662912000-04-11 15:38:46 +00008607 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608}
8609
8610static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008611PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612{
Walter Dörwald346737f2007-05-31 10:44:43 +00008613 if (PyUnicode_CheckExact(self)) {
8614 Py_INCREF(self);
8615 return self;
8616 } else
8617 /* Subtype -- return genuine unicode string with the same value. */
8618 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8619 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620}
8621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008622PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624\n\
8625Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008626and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627
8628static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008629unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 return fixup(self, fixswapcase);
8632}
8633
Georg Brandlceee0772007-11-27 23:48:05 +00008634PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008636\n\
8637Return a translation table usable for str.translate().\n\
8638If there is only one argument, it must be a dictionary mapping Unicode\n\
8639ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008640Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008641If there are two arguments, they must be strings of equal length, and\n\
8642in the resulting dictionary, each character in x will be mapped to the\n\
8643character at the same position in y. If there is a third argument, it\n\
8644must be a string, whose characters will be mapped to None in the result.");
8645
8646static PyObject*
8647unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8648{
8649 PyObject *x, *y = NULL, *z = NULL;
8650 PyObject *new = NULL, *key, *value;
8651 Py_ssize_t i = 0;
8652 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008653
Georg Brandlceee0772007-11-27 23:48:05 +00008654 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8655 return NULL;
8656 new = PyDict_New();
8657 if (!new)
8658 return NULL;
8659 if (y != NULL) {
8660 /* x must be a string too, of equal length */
8661 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8662 if (!PyUnicode_Check(x)) {
8663 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8664 "be a string if there is a second argument");
8665 goto err;
8666 }
8667 if (PyUnicode_GET_SIZE(x) != ylen) {
8668 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8669 "arguments must have equal length");
8670 goto err;
8671 }
8672 /* create entries for translating chars in x to those in y */
8673 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008674 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8675 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008676 if (!key || !value)
8677 goto err;
8678 res = PyDict_SetItem(new, key, value);
8679 Py_DECREF(key);
8680 Py_DECREF(value);
8681 if (res < 0)
8682 goto err;
8683 }
8684 /* create entries for deleting chars in z */
8685 if (z != NULL) {
8686 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008687 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008688 if (!key)
8689 goto err;
8690 res = PyDict_SetItem(new, key, Py_None);
8691 Py_DECREF(key);
8692 if (res < 0)
8693 goto err;
8694 }
8695 }
8696 } else {
8697 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008698 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008699 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8700 "to maketrans it must be a dict");
8701 goto err;
8702 }
8703 /* copy entries into the new dict, converting string keys to int keys */
8704 while (PyDict_Next(x, &i, &key, &value)) {
8705 if (PyUnicode_Check(key)) {
8706 /* convert string keys to integer keys */
8707 PyObject *newkey;
8708 if (PyUnicode_GET_SIZE(key) != 1) {
8709 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8710 "table must be of length 1");
8711 goto err;
8712 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008713 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008714 if (!newkey)
8715 goto err;
8716 res = PyDict_SetItem(new, newkey, value);
8717 Py_DECREF(newkey);
8718 if (res < 0)
8719 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008720 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008721 /* just keep integer keys */
8722 if (PyDict_SetItem(new, key, value) < 0)
8723 goto err;
8724 } else {
8725 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8726 "be strings or integers");
8727 goto err;
8728 }
8729 }
8730 }
8731 return new;
8732 err:
8733 Py_DECREF(new);
8734 return NULL;
8735}
8736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008737PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739\n\
8740Return a copy of the string S, where all characters have been mapped\n\
8741through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008742Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008743Unmapped characters are left untouched. Characters mapped to None\n\
8744are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745
8746static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008747unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748{
Georg Brandlceee0772007-11-27 23:48:05 +00008749 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750}
8751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008752PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008755Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756
8757static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008758unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 return fixup(self, fixupper);
8761}
8762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008763PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008766Pad a numeric string S with zeros on the left, to fill a field\n\
8767of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768
8769static PyObject *
8770unicode_zfill(PyUnicodeObject *self, PyObject *args)
8771{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008772 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773 PyUnicodeObject *u;
8774
Martin v. Löwis18e16552006-02-15 17:27:45 +00008775 Py_ssize_t width;
8776 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 return NULL;
8778
8779 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008780 if (PyUnicode_CheckExact(self)) {
8781 Py_INCREF(self);
8782 return (PyObject*) self;
8783 }
8784 else
8785 return PyUnicode_FromUnicode(
8786 PyUnicode_AS_UNICODE(self),
8787 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 }
8790
8791 fill = width - self->length;
8792
8793 u = pad(self, fill, 0, '0');
8794
Walter Dörwald068325e2002-04-15 13:36:47 +00008795 if (u == NULL)
8796 return NULL;
8797
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 if (u->str[fill] == '+' || u->str[fill] == '-') {
8799 /* move sign to beginning of string */
8800 u->str[0] = u->str[fill];
8801 u->str[fill] = '0';
8802 }
8803
8804 return (PyObject*) u;
8805}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806
#if 0
/* Compiled out.  Reports the file-level numfree counter as a Python
   int; self is not used. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008815PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008818Return True if S starts with the specified prefix, False otherwise.\n\
8819With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008820With optional end, stop comparing S at that position.\n\
8821prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822
8823static PyObject *
8824unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008827 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008829 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008830 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008831 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008833 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8835 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008836 if (PyTuple_Check(subobj)) {
8837 Py_ssize_t i;
8838 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8839 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008841 if (substring == NULL)
8842 return NULL;
8843 result = tailmatch(self, substring, start, end, -1);
8844 Py_DECREF(substring);
8845 if (result) {
8846 Py_RETURN_TRUE;
8847 }
8848 }
8849 /* nothing matched */
8850 Py_RETURN_FALSE;
8851 }
8852 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008855 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008857 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858}
8859
8860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008861PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008862 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008864Return True if S ends with the specified suffix, False otherwise.\n\
8865With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008866With optional end, stop comparing S at that position.\n\
8867suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868
8869static PyObject *
8870unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008873 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008875 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008876 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008877 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008879 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8881 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008882 if (PyTuple_Check(subobj)) {
8883 Py_ssize_t i;
8884 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8885 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008887 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008889 result = tailmatch(self, substring, start, end, +1);
8890 Py_DECREF(substring);
8891 if (result) {
8892 Py_RETURN_TRUE;
8893 }
8894 }
8895 Py_RETURN_FALSE;
8896 }
8897 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008901 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008903 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904}
8905
Eric Smith8c663262007-08-25 02:26:07 +00008906#include "stringlib/string_format.h"
8907
8908PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008910\n\
8911");
8912
Eric Smith4a7d76d2008-05-30 18:10:19 +00008913static PyObject *
8914unicode__format__(PyObject* self, PyObject* args)
8915{
8916 PyObject *format_spec;
8917
8918 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8919 return NULL;
8920
8921 return _PyUnicode_FormatAdvanced(self,
8922 PyUnicode_AS_UNICODE(format_spec),
8923 PyUnicode_GET_SIZE(format_spec));
8924}
8925
Eric Smith8c663262007-08-25 02:26:07 +00008926PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008928\n\
8929");
8930
8931static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008932unicode__sizeof__(PyUnicodeObject *v)
8933{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008934 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8935 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008936}
8937
8938PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008940
8941static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008942unicode_getnewargs(PyUnicodeObject *v)
8943{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008944 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008945}
8946
8947
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948static PyMethodDef unicode_methods[] = {
8949
8950 /* Order is according to common usage: often used methods should
8951 appear first, since lookup is done sequentially. */
8952
Benjamin Peterson308d6372009-09-18 21:42:35 +00008953 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008954 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8955 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008956 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008957 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8958 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8959 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8960 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8961 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8962 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8963 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008964 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008965 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8966 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8967 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008968 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008969 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8970 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8971 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008972 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008973 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008974 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008975 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008976 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8977 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8978 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8979 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8980 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8981 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8982 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8983 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8984 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8985 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8986 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8987 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8988 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8989 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008990 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008991 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008992 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008993 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008994 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00008995 {"maketrans", (PyCFunction) unicode_maketrans,
8996 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008997 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008998#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008999 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000#endif
9001
9002#if 0
9003 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009004 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005#endif
9006
Benjamin Peterson14339b62009-01-31 16:36:08 +00009007 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 {NULL, NULL}
9009};
9010
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009011static PyObject *
9012unicode_mod(PyObject *v, PyObject *w)
9013{
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 if (!PyUnicode_Check(v)) {
9015 Py_INCREF(Py_NotImplemented);
9016 return Py_NotImplemented;
9017 }
9018 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009019}
9020
9021static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009022 0, /*nb_add*/
9023 0, /*nb_subtract*/
9024 0, /*nb_multiply*/
9025 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009026};
9027
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009029 (lenfunc) unicode_length, /* sq_length */
9030 PyUnicode_Concat, /* sq_concat */
9031 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9032 (ssizeargfunc) unicode_getitem, /* sq_item */
9033 0, /* sq_slice */
9034 0, /* sq_ass_item */
9035 0, /* sq_ass_slice */
9036 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037};
9038
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009039static PyObject*
9040unicode_subscript(PyUnicodeObject* self, PyObject* item)
9041{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009042 if (PyIndex_Check(item)) {
9043 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009044 if (i == -1 && PyErr_Occurred())
9045 return NULL;
9046 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009047 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009048 return unicode_getitem(self, i);
9049 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009050 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009051 Py_UNICODE* source_buf;
9052 Py_UNICODE* result_buf;
9053 PyObject* result;
9054
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009055 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009057 return NULL;
9058 }
9059
9060 if (slicelength <= 0) {
9061 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009062 } else if (start == 0 && step == 1 && slicelength == self->length &&
9063 PyUnicode_CheckExact(self)) {
9064 Py_INCREF(self);
9065 return (PyObject *)self;
9066 } else if (step == 1) {
9067 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009068 } else {
9069 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009070 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9071 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009072
Benjamin Peterson29060642009-01-31 22:14:21 +00009073 if (result_buf == NULL)
9074 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009075
9076 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9077 result_buf[i] = source_buf[cur];
9078 }
Tim Petersced69f82003-09-16 20:30:58 +00009079
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009080 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009081 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009082 return result;
9083 }
9084 } else {
9085 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9086 return NULL;
9087 }
9088}
9089
9090static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009091 (lenfunc)unicode_length, /* mp_length */
9092 (binaryfunc)unicode_subscript, /* mp_subscript */
9093 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009094};
9095
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097/* Helpers for PyUnicode_Format() */
9098
9099static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009100getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009102 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009104 (*p_argidx)++;
9105 if (arglen < 0)
9106 return args;
9107 else
9108 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109 }
9110 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112 return NULL;
9113}
9114
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009115/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009117static PyObject *
9118formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009120 char *p;
9121 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009123
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124 x = PyFloat_AsDouble(v);
9125 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009126 return NULL;
9127
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009130
Eric Smith0923d1d2009-04-16 20:16:10 +00009131 p = PyOS_double_to_string(x, type, prec,
9132 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009133 if (p == NULL)
9134 return NULL;
9135 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009136 PyMem_Free(p);
9137 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138}
9139
Tim Peters38fd5b62000-09-21 05:43:11 +00009140static PyObject*
9141formatlong(PyObject *val, int flags, int prec, int type)
9142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009143 char *buf;
9144 int len;
9145 PyObject *str; /* temporary string object. */
9146 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009147
Benjamin Peterson14339b62009-01-31 16:36:08 +00009148 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9149 if (!str)
9150 return NULL;
9151 result = PyUnicode_FromStringAndSize(buf, len);
9152 Py_DECREF(str);
9153 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009154}
9155
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156static int
9157formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009158 size_t buflen,
9159 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009161 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009162 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 if (PyUnicode_GET_SIZE(v) == 1) {
9164 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9165 buf[1] = '\0';
9166 return 1;
9167 }
9168#ifndef Py_UNICODE_WIDE
9169 if (PyUnicode_GET_SIZE(v) == 2) {
9170 /* Decode a valid surrogate pair */
9171 int c0 = PyUnicode_AS_UNICODE(v)[0];
9172 int c1 = PyUnicode_AS_UNICODE(v)[1];
9173 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9174 0xDC00 <= c1 && c1 <= 0xDFFF) {
9175 buf[0] = c0;
9176 buf[1] = c1;
9177 buf[2] = '\0';
9178 return 2;
9179 }
9180 }
9181#endif
9182 goto onError;
9183 }
9184 else {
9185 /* Integer input truncated to a character */
9186 long x;
9187 x = PyLong_AsLong(v);
9188 if (x == -1 && PyErr_Occurred())
9189 goto onError;
9190
9191 if (x < 0 || x > 0x10ffff) {
9192 PyErr_SetString(PyExc_OverflowError,
9193 "%c arg not in range(0x110000)");
9194 return -1;
9195 }
9196
9197#ifndef Py_UNICODE_WIDE
9198 if (x > 0xffff) {
9199 x -= 0x10000;
9200 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9201 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9202 return 2;
9203 }
9204#endif
9205 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009206 buf[1] = '\0';
9207 return 1;
9208 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009209
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009211 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009212 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009213 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214}
9215
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009216/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009217 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009218*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009219#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009220
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009222 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223{
9224 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009225 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 int args_owned = 0;
9227 PyUnicodeObject *result = NULL;
9228 PyObject *dict = NULL;
9229 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009230
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009232 PyErr_BadInternalCall();
9233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 }
9235 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009236 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 fmt = PyUnicode_AS_UNICODE(uformat);
9239 fmtcnt = PyUnicode_GET_SIZE(uformat);
9240
9241 reslen = rescnt = fmtcnt + 100;
9242 result = _PyUnicode_New(reslen);
9243 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245 res = PyUnicode_AS_UNICODE(result);
9246
9247 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009248 arglen = PyTuple_Size(args);
9249 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250 }
9251 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009252 arglen = -1;
9253 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009255 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009256 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009257 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258
9259 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009260 if (*fmt != '%') {
9261 if (--rescnt < 0) {
9262 rescnt = fmtcnt + 100;
9263 reslen += rescnt;
9264 if (_PyUnicode_Resize(&result, reslen) < 0)
9265 goto onError;
9266 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9267 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009268 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009269 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009270 }
9271 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009272 /* Got a format specifier */
9273 int flags = 0;
9274 Py_ssize_t width = -1;
9275 int prec = -1;
9276 Py_UNICODE c = '\0';
9277 Py_UNICODE fill;
9278 int isnumok;
9279 PyObject *v = NULL;
9280 PyObject *temp = NULL;
9281 Py_UNICODE *pbuf;
9282 Py_UNICODE sign;
9283 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009284 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285
Benjamin Peterson29060642009-01-31 22:14:21 +00009286 fmt++;
9287 if (*fmt == '(') {
9288 Py_UNICODE *keystart;
9289 Py_ssize_t keylen;
9290 PyObject *key;
9291 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009292
Benjamin Peterson29060642009-01-31 22:14:21 +00009293 if (dict == NULL) {
9294 PyErr_SetString(PyExc_TypeError,
9295 "format requires a mapping");
9296 goto onError;
9297 }
9298 ++fmt;
9299 --fmtcnt;
9300 keystart = fmt;
9301 /* Skip over balanced parentheses */
9302 while (pcount > 0 && --fmtcnt >= 0) {
9303 if (*fmt == ')')
9304 --pcount;
9305 else if (*fmt == '(')
9306 ++pcount;
9307 fmt++;
9308 }
9309 keylen = fmt - keystart - 1;
9310 if (fmtcnt < 0 || pcount > 0) {
9311 PyErr_SetString(PyExc_ValueError,
9312 "incomplete format key");
9313 goto onError;
9314 }
9315#if 0
9316 /* keys are converted to strings using UTF-8 and
9317 then looked up since Python uses strings to hold
9318 variables names etc. in its namespaces and we
9319 wouldn't want to break common idioms. */
9320 key = PyUnicode_EncodeUTF8(keystart,
9321 keylen,
9322 NULL);
9323#else
9324 key = PyUnicode_FromUnicode(keystart, keylen);
9325#endif
9326 if (key == NULL)
9327 goto onError;
9328 if (args_owned) {
9329 Py_DECREF(args);
9330 args_owned = 0;
9331 }
9332 args = PyObject_GetItem(dict, key);
9333 Py_DECREF(key);
9334 if (args == NULL) {
9335 goto onError;
9336 }
9337 args_owned = 1;
9338 arglen = -1;
9339 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009340 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009341 while (--fmtcnt >= 0) {
9342 switch (c = *fmt++) {
9343 case '-': flags |= F_LJUST; continue;
9344 case '+': flags |= F_SIGN; continue;
9345 case ' ': flags |= F_BLANK; continue;
9346 case '#': flags |= F_ALT; continue;
9347 case '0': flags |= F_ZERO; continue;
9348 }
9349 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009350 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 if (c == '*') {
9352 v = getnextarg(args, arglen, &argidx);
9353 if (v == NULL)
9354 goto onError;
9355 if (!PyLong_Check(v)) {
9356 PyErr_SetString(PyExc_TypeError,
9357 "* wants int");
9358 goto onError;
9359 }
9360 width = PyLong_AsLong(v);
9361 if (width == -1 && PyErr_Occurred())
9362 goto onError;
9363 if (width < 0) {
9364 flags |= F_LJUST;
9365 width = -width;
9366 }
9367 if (--fmtcnt >= 0)
9368 c = *fmt++;
9369 }
9370 else if (c >= '0' && c <= '9') {
9371 width = c - '0';
9372 while (--fmtcnt >= 0) {
9373 c = *fmt++;
9374 if (c < '0' || c > '9')
9375 break;
9376 if ((width*10) / 10 != width) {
9377 PyErr_SetString(PyExc_ValueError,
9378 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009379 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 }
9381 width = width*10 + (c - '0');
9382 }
9383 }
9384 if (c == '.') {
9385 prec = 0;
9386 if (--fmtcnt >= 0)
9387 c = *fmt++;
9388 if (c == '*') {
9389 v = getnextarg(args, arglen, &argidx);
9390 if (v == NULL)
9391 goto onError;
9392 if (!PyLong_Check(v)) {
9393 PyErr_SetString(PyExc_TypeError,
9394 "* wants int");
9395 goto onError;
9396 }
9397 prec = PyLong_AsLong(v);
9398 if (prec == -1 && PyErr_Occurred())
9399 goto onError;
9400 if (prec < 0)
9401 prec = 0;
9402 if (--fmtcnt >= 0)
9403 c = *fmt++;
9404 }
9405 else if (c >= '0' && c <= '9') {
9406 prec = c - '0';
9407 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009408 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009409 if (c < '0' || c > '9')
9410 break;
9411 if ((prec*10) / 10 != prec) {
9412 PyErr_SetString(PyExc_ValueError,
9413 "prec too big");
9414 goto onError;
9415 }
9416 prec = prec*10 + (c - '0');
9417 }
9418 }
9419 } /* prec */
9420 if (fmtcnt >= 0) {
9421 if (c == 'h' || c == 'l' || c == 'L') {
9422 if (--fmtcnt >= 0)
9423 c = *fmt++;
9424 }
9425 }
9426 if (fmtcnt < 0) {
9427 PyErr_SetString(PyExc_ValueError,
9428 "incomplete format");
9429 goto onError;
9430 }
9431 if (c != '%') {
9432 v = getnextarg(args, arglen, &argidx);
9433 if (v == NULL)
9434 goto onError;
9435 }
9436 sign = 0;
9437 fill = ' ';
9438 switch (c) {
9439
9440 case '%':
9441 pbuf = formatbuf;
9442 /* presume that buffer length is at least 1 */
9443 pbuf[0] = '%';
9444 len = 1;
9445 break;
9446
9447 case 's':
9448 case 'r':
9449 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009450 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009451 temp = v;
9452 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009453 }
9454 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009455 if (c == 's')
9456 temp = PyObject_Str(v);
9457 else if (c == 'r')
9458 temp = PyObject_Repr(v);
9459 else
9460 temp = PyObject_ASCII(v);
9461 if (temp == NULL)
9462 goto onError;
9463 if (PyUnicode_Check(temp))
9464 /* nothing to do */;
9465 else {
9466 Py_DECREF(temp);
9467 PyErr_SetString(PyExc_TypeError,
9468 "%s argument has non-string str()");
9469 goto onError;
9470 }
9471 }
9472 pbuf = PyUnicode_AS_UNICODE(temp);
9473 len = PyUnicode_GET_SIZE(temp);
9474 if (prec >= 0 && len > prec)
9475 len = prec;
9476 break;
9477
9478 case 'i':
9479 case 'd':
9480 case 'u':
9481 case 'o':
9482 case 'x':
9483 case 'X':
9484 if (c == 'i')
9485 c = 'd';
9486 isnumok = 0;
9487 if (PyNumber_Check(v)) {
9488 PyObject *iobj=NULL;
9489
9490 if (PyLong_Check(v)) {
9491 iobj = v;
9492 Py_INCREF(iobj);
9493 }
9494 else {
9495 iobj = PyNumber_Long(v);
9496 }
9497 if (iobj!=NULL) {
9498 if (PyLong_Check(iobj)) {
9499 isnumok = 1;
9500 temp = formatlong(iobj, flags, prec, c);
9501 Py_DECREF(iobj);
9502 if (!temp)
9503 goto onError;
9504 pbuf = PyUnicode_AS_UNICODE(temp);
9505 len = PyUnicode_GET_SIZE(temp);
9506 sign = 1;
9507 }
9508 else {
9509 Py_DECREF(iobj);
9510 }
9511 }
9512 }
9513 if (!isnumok) {
9514 PyErr_Format(PyExc_TypeError,
9515 "%%%c format: a number is required, "
9516 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9517 goto onError;
9518 }
9519 if (flags & F_ZERO)
9520 fill = '0';
9521 break;
9522
9523 case 'e':
9524 case 'E':
9525 case 'f':
9526 case 'F':
9527 case 'g':
9528 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009529 temp = formatfloat(v, flags, prec, c);
9530 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009531 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009532 pbuf = PyUnicode_AS_UNICODE(temp);
9533 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 sign = 1;
9535 if (flags & F_ZERO)
9536 fill = '0';
9537 break;
9538
9539 case 'c':
9540 pbuf = formatbuf;
9541 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9542 if (len < 0)
9543 goto onError;
9544 break;
9545
9546 default:
9547 PyErr_Format(PyExc_ValueError,
9548 "unsupported format character '%c' (0x%x) "
9549 "at index %zd",
9550 (31<=c && c<=126) ? (char)c : '?',
9551 (int)c,
9552 (Py_ssize_t)(fmt - 1 -
9553 PyUnicode_AS_UNICODE(uformat)));
9554 goto onError;
9555 }
9556 if (sign) {
9557 if (*pbuf == '-' || *pbuf == '+') {
9558 sign = *pbuf++;
9559 len--;
9560 }
9561 else if (flags & F_SIGN)
9562 sign = '+';
9563 else if (flags & F_BLANK)
9564 sign = ' ';
9565 else
9566 sign = 0;
9567 }
9568 if (width < len)
9569 width = len;
9570 if (rescnt - (sign != 0) < width) {
9571 reslen -= rescnt;
9572 rescnt = width + fmtcnt + 100;
9573 reslen += rescnt;
9574 if (reslen < 0) {
9575 Py_XDECREF(temp);
9576 PyErr_NoMemory();
9577 goto onError;
9578 }
9579 if (_PyUnicode_Resize(&result, reslen) < 0) {
9580 Py_XDECREF(temp);
9581 goto onError;
9582 }
9583 res = PyUnicode_AS_UNICODE(result)
9584 + reslen - rescnt;
9585 }
9586 if (sign) {
9587 if (fill != ' ')
9588 *res++ = sign;
9589 rescnt--;
9590 if (width > len)
9591 width--;
9592 }
9593 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9594 assert(pbuf[0] == '0');
9595 assert(pbuf[1] == c);
9596 if (fill != ' ') {
9597 *res++ = *pbuf++;
9598 *res++ = *pbuf++;
9599 }
9600 rescnt -= 2;
9601 width -= 2;
9602 if (width < 0)
9603 width = 0;
9604 len -= 2;
9605 }
9606 if (width > len && !(flags & F_LJUST)) {
9607 do {
9608 --rescnt;
9609 *res++ = fill;
9610 } while (--width > len);
9611 }
9612 if (fill == ' ') {
9613 if (sign)
9614 *res++ = sign;
9615 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9616 assert(pbuf[0] == '0');
9617 assert(pbuf[1] == c);
9618 *res++ = *pbuf++;
9619 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009620 }
9621 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009622 Py_UNICODE_COPY(res, pbuf, len);
9623 res += len;
9624 rescnt -= len;
9625 while (--width >= len) {
9626 --rescnt;
9627 *res++ = ' ';
9628 }
9629 if (dict && (argidx < arglen) && c != '%') {
9630 PyErr_SetString(PyExc_TypeError,
9631 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009632 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009633 goto onError;
9634 }
9635 Py_XDECREF(temp);
9636 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637 } /* until end */
9638 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009639 PyErr_SetString(PyExc_TypeError,
9640 "not all arguments converted during string formatting");
9641 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642 }
9643
Thomas Woutersa96affe2006-03-12 00:29:36 +00009644 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009645 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648 }
9649 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650 return (PyObject *)result;
9651
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653 Py_XDECREF(result);
9654 Py_DECREF(uformat);
9655 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009656 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657 }
9658 return NULL;
9659}
9660
Jeremy Hylton938ace62002-07-17 16:30:39 +00009661static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009662unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9663
Tim Peters6d6c1a32001-08-02 04:15:00 +00009664static PyObject *
9665unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9666{
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009668 static char *kwlist[] = {"object", "encoding", "errors", 0};
9669 char *encoding = NULL;
9670 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009671
Benjamin Peterson14339b62009-01-31 16:36:08 +00009672 if (type != &PyUnicode_Type)
9673 return unicode_subtype_new(type, args, kwds);
9674 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009676 return NULL;
9677 if (x == NULL)
9678 return (PyObject *)_PyUnicode_New(0);
9679 if (encoding == NULL && errors == NULL)
9680 return PyObject_Str(x);
9681 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009683}
9684
Guido van Rossume023fe02001-08-30 03:12:59 +00009685static PyObject *
9686unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9687{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009688 PyUnicodeObject *tmp, *pnew;
9689 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009690
Benjamin Peterson14339b62009-01-31 16:36:08 +00009691 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9692 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9693 if (tmp == NULL)
9694 return NULL;
9695 assert(PyUnicode_Check(tmp));
9696 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9697 if (pnew == NULL) {
9698 Py_DECREF(tmp);
9699 return NULL;
9700 }
9701 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9702 if (pnew->str == NULL) {
9703 _Py_ForgetReference((PyObject *)pnew);
9704 PyObject_Del(pnew);
9705 Py_DECREF(tmp);
9706 return PyErr_NoMemory();
9707 }
9708 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9709 pnew->length = n;
9710 pnew->hash = tmp->hash;
9711 Py_DECREF(tmp);
9712 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009713}
9714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009715PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009716 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009717\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009718Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009719encoding defaults to the current default string encoding.\n\
9720errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009721
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009722static PyObject *unicode_iter(PyObject *seq);
9723
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009725 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009726 "str", /* tp_name */
9727 sizeof(PyUnicodeObject), /* tp_size */
9728 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009730 (destructor)unicode_dealloc, /* tp_dealloc */
9731 0, /* tp_print */
9732 0, /* tp_getattr */
9733 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009734 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009735 unicode_repr, /* tp_repr */
9736 &unicode_as_number, /* tp_as_number */
9737 &unicode_as_sequence, /* tp_as_sequence */
9738 &unicode_as_mapping, /* tp_as_mapping */
9739 (hashfunc) unicode_hash, /* tp_hash*/
9740 0, /* tp_call*/
9741 (reprfunc) unicode_str, /* tp_str */
9742 PyObject_GenericGetAttr, /* tp_getattro */
9743 0, /* tp_setattro */
9744 0, /* tp_as_buffer */
9745 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009746 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009747 unicode_doc, /* tp_doc */
9748 0, /* tp_traverse */
9749 0, /* tp_clear */
9750 PyUnicode_RichCompare, /* tp_richcompare */
9751 0, /* tp_weaklistoffset */
9752 unicode_iter, /* tp_iter */
9753 0, /* tp_iternext */
9754 unicode_methods, /* tp_methods */
9755 0, /* tp_members */
9756 0, /* tp_getset */
9757 &PyBaseObject_Type, /* tp_base */
9758 0, /* tp_dict */
9759 0, /* tp_descr_get */
9760 0, /* tp_descr_set */
9761 0, /* tp_dictoffset */
9762 0, /* tp_init */
9763 0, /* tp_alloc */
9764 unicode_new, /* tp_new */
9765 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766};
9767
9768/* Initialize the Unicode implementation */
9769
Thomas Wouters78890102000-07-22 19:25:51 +00009770void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009772 int i;
9773
Thomas Wouters477c8d52006-05-27 19:21:47 +00009774 /* XXX - move this array to unicodectype.c ? */
9775 Py_UNICODE linebreak[] = {
9776 0x000A, /* LINE FEED */
9777 0x000D, /* CARRIAGE RETURN */
9778 0x001C, /* FILE SEPARATOR */
9779 0x001D, /* GROUP SEPARATOR */
9780 0x001E, /* RECORD SEPARATOR */
9781 0x0085, /* NEXT LINE */
9782 0x2028, /* LINE SEPARATOR */
9783 0x2029, /* PARAGRAPH SEPARATOR */
9784 };
9785
Fred Drakee4315f52000-05-09 19:53:39 +00009786 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009787 free_list = NULL;
9788 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009790 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009791 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009792
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009793 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009794 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009795 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009796 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009797
9798 /* initialize the linebreak bloom filter */
9799 bloom_linebreak = make_bloom_mask(
9800 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9801 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009802
9803 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804}
9805
9806/* Finalize the Unicode implementation */
9807
Christian Heimesa156e092008-02-16 07:38:31 +00009808int
9809PyUnicode_ClearFreeList(void)
9810{
9811 int freelist_size = numfree;
9812 PyUnicodeObject *u;
9813
9814 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009815 PyUnicodeObject *v = u;
9816 u = *(PyUnicodeObject **)u;
9817 if (v->str)
9818 PyObject_DEL(v->str);
9819 Py_XDECREF(v->defenc);
9820 PyObject_Del(v);
9821 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009822 }
9823 free_list = NULL;
9824 assert(numfree == 0);
9825 return freelist_size;
9826}
9827
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828void
Thomas Wouters78890102000-07-22 19:25:51 +00009829_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009831 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009833 Py_XDECREF(unicode_empty);
9834 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009835
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009836 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009837 if (unicode_latin1[i]) {
9838 Py_DECREF(unicode_latin1[i]);
9839 unicode_latin1[i] = NULL;
9840 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009841 }
Christian Heimesa156e092008-02-16 07:38:31 +00009842 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009844
Walter Dörwald16807132007-05-25 13:52:07 +00009845void
9846PyUnicode_InternInPlace(PyObject **p)
9847{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009848 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9849 PyObject *t;
9850 if (s == NULL || !PyUnicode_Check(s))
9851 Py_FatalError(
9852 "PyUnicode_InternInPlace: unicode strings only please!");
9853 /* If it's a subclass, we don't really know what putting
9854 it in the interned dict might do. */
9855 if (!PyUnicode_CheckExact(s))
9856 return;
9857 if (PyUnicode_CHECK_INTERNED(s))
9858 return;
9859 if (interned == NULL) {
9860 interned = PyDict_New();
9861 if (interned == NULL) {
9862 PyErr_Clear(); /* Don't leave an exception */
9863 return;
9864 }
9865 }
9866 /* It might be that the GetItem call fails even
9867 though the key is present in the dictionary,
9868 namely when this happens during a stack overflow. */
9869 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009870 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009871 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009872
Benjamin Peterson29060642009-01-31 22:14:21 +00009873 if (t) {
9874 Py_INCREF(t);
9875 Py_DECREF(*p);
9876 *p = t;
9877 return;
9878 }
Walter Dörwald16807132007-05-25 13:52:07 +00009879
Benjamin Peterson14339b62009-01-31 16:36:08 +00009880 PyThreadState_GET()->recursion_critical = 1;
9881 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9882 PyErr_Clear();
9883 PyThreadState_GET()->recursion_critical = 0;
9884 return;
9885 }
9886 PyThreadState_GET()->recursion_critical = 0;
9887 /* The two references in interned are not counted by refcnt.
9888 The deallocator will take care of this */
9889 Py_REFCNT(s) -= 2;
9890 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009891}
9892
9893void
9894PyUnicode_InternImmortal(PyObject **p)
9895{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 PyUnicode_InternInPlace(p);
9897 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9898 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9899 Py_INCREF(*p);
9900 }
Walter Dörwald16807132007-05-25 13:52:07 +00009901}
9902
9903PyObject *
9904PyUnicode_InternFromString(const char *cp)
9905{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009906 PyObject *s = PyUnicode_FromString(cp);
9907 if (s == NULL)
9908 return NULL;
9909 PyUnicode_InternInPlace(&s);
9910 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009911}
9912
9913void _Py_ReleaseInternedUnicodeStrings(void)
9914{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009915 PyObject *keys;
9916 PyUnicodeObject *s;
9917 Py_ssize_t i, n;
9918 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009919
Benjamin Peterson14339b62009-01-31 16:36:08 +00009920 if (interned == NULL || !PyDict_Check(interned))
9921 return;
9922 keys = PyDict_Keys(interned);
9923 if (keys == NULL || !PyList_Check(keys)) {
9924 PyErr_Clear();
9925 return;
9926 }
Walter Dörwald16807132007-05-25 13:52:07 +00009927
Benjamin Peterson14339b62009-01-31 16:36:08 +00009928 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9929 detector, interned unicode strings are not forcibly deallocated;
9930 rather, we give them their stolen references back, and then clear
9931 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009932
Benjamin Peterson14339b62009-01-31 16:36:08 +00009933 n = PyList_GET_SIZE(keys);
9934 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009935 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009936 for (i = 0; i < n; i++) {
9937 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9938 switch (s->state) {
9939 case SSTATE_NOT_INTERNED:
9940 /* XXX Shouldn't happen */
9941 break;
9942 case SSTATE_INTERNED_IMMORTAL:
9943 Py_REFCNT(s) += 1;
9944 immortal_size += s->length;
9945 break;
9946 case SSTATE_INTERNED_MORTAL:
9947 Py_REFCNT(s) += 2;
9948 mortal_size += s->length;
9949 break;
9950 default:
9951 Py_FatalError("Inconsistent interned string state.");
9952 }
9953 s->state = SSTATE_NOT_INTERNED;
9954 }
9955 fprintf(stderr, "total size of all interned strings: "
9956 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9957 "mortal/immortal\n", mortal_size, immortal_size);
9958 Py_DECREF(keys);
9959 PyDict_Clear(interned);
9960 Py_DECREF(interned);
9961 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009962}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009963
9964
9965/********************* Unicode Iterator **************************/
9966
9967typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009968 PyObject_HEAD
9969 Py_ssize_t it_index;
9970 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009971} unicodeiterobject;
9972
9973static void
9974unicodeiter_dealloc(unicodeiterobject *it)
9975{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009976 _PyObject_GC_UNTRACK(it);
9977 Py_XDECREF(it->it_seq);
9978 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009979}
9980
9981static int
9982unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9983{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009984 Py_VISIT(it->it_seq);
9985 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009986}
9987
9988static PyObject *
9989unicodeiter_next(unicodeiterobject *it)
9990{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009991 PyUnicodeObject *seq;
9992 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009993
Benjamin Peterson14339b62009-01-31 16:36:08 +00009994 assert(it != NULL);
9995 seq = it->it_seq;
9996 if (seq == NULL)
9997 return NULL;
9998 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009999
Benjamin Peterson14339b62009-01-31 16:36:08 +000010000 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10001 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010003 if (item != NULL)
10004 ++it->it_index;
10005 return item;
10006 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010007
Benjamin Peterson14339b62009-01-31 16:36:08 +000010008 Py_DECREF(seq);
10009 it->it_seq = NULL;
10010 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010011}
10012
10013static PyObject *
10014unicodeiter_len(unicodeiterobject *it)
10015{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010016 Py_ssize_t len = 0;
10017 if (it->it_seq)
10018 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10019 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010020}
10021
10022PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10023
10024static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010025 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010027 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010028};
10029
10030PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010031 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10032 "str_iterator", /* tp_name */
10033 sizeof(unicodeiterobject), /* tp_basicsize */
10034 0, /* tp_itemsize */
10035 /* methods */
10036 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10037 0, /* tp_print */
10038 0, /* tp_getattr */
10039 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010040 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010041 0, /* tp_repr */
10042 0, /* tp_as_number */
10043 0, /* tp_as_sequence */
10044 0, /* tp_as_mapping */
10045 0, /* tp_hash */
10046 0, /* tp_call */
10047 0, /* tp_str */
10048 PyObject_GenericGetAttr, /* tp_getattro */
10049 0, /* tp_setattro */
10050 0, /* tp_as_buffer */
10051 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10052 0, /* tp_doc */
10053 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10054 0, /* tp_clear */
10055 0, /* tp_richcompare */
10056 0, /* tp_weaklistoffset */
10057 PyObject_SelfIter, /* tp_iter */
10058 (iternextfunc)unicodeiter_next, /* tp_iternext */
10059 unicodeiter_methods, /* tp_methods */
10060 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010061};
10062
10063static PyObject *
10064unicode_iter(PyObject *seq)
10065{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010066 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010067
Benjamin Peterson14339b62009-01-31 16:36:08 +000010068 if (!PyUnicode_Check(seq)) {
10069 PyErr_BadInternalCall();
10070 return NULL;
10071 }
10072 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10073 if (it == NULL)
10074 return NULL;
10075 it->it_index = 0;
10076 Py_INCREF(seq);
10077 it->it_seq = (PyUnicodeObject *)seq;
10078 _PyObject_GC_TRACK(it);
10079 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010080}
10081
Martin v. Löwis5b222132007-06-10 09:51:05 +000010082size_t
10083Py_UNICODE_strlen(const Py_UNICODE *u)
10084{
10085 int res = 0;
10086 while(*u++)
10087 res++;
10088 return res;
10089}
10090
10091Py_UNICODE*
10092Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10093{
10094 Py_UNICODE *u = s1;
10095 while ((*u++ = *s2++));
10096 return s1;
10097}
10098
10099Py_UNICODE*
10100Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10101{
10102 Py_UNICODE *u = s1;
10103 while ((*u++ = *s2++))
10104 if (n-- == 0)
10105 break;
10106 return s1;
10107}
10108
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010109Py_UNICODE*
10110Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10111{
10112 Py_UNICODE *u1 = s1;
10113 u1 += Py_UNICODE_strlen(u1);
10114 Py_UNICODE_strcpy(u1, s2);
10115 return s1;
10116}
10117
Martin v. Löwis5b222132007-06-10 09:51:05 +000010118int
10119Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10120{
10121 while (*s1 && *s2 && *s1 == *s2)
10122 s1++, s2++;
10123 if (*s1 && *s2)
10124 return (*s1 < *s2) ? -1 : +1;
10125 if (*s1)
10126 return 1;
10127 if (*s2)
10128 return -1;
10129 return 0;
10130}
10131
Victor Stinneref8d95c2010-08-16 22:03:11 +000010132int
10133Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10134{
10135 register Py_UNICODE u1, u2;
10136 for (; n != 0; n--) {
10137 u1 = *s1;
10138 u2 = *s2;
10139 if (u1 != u2)
10140 return (u1 < u2) ? -1 : +1;
10141 if (u1 == '\0')
10142 return 0;
10143 s1++;
10144 s2++;
10145 }
10146 return 0;
10147}
10148
Martin v. Löwis5b222132007-06-10 09:51:05 +000010149Py_UNICODE*
10150Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10151{
10152 const Py_UNICODE *p;
10153 for (p = s; *p; p++)
10154 if (*p == c)
10155 return (Py_UNICODE*)p;
10156 return NULL;
10157}
10158
Victor Stinner331ea922010-08-10 16:37:20 +000010159Py_UNICODE*
10160Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10161{
10162 const Py_UNICODE *p;
10163 p = s + Py_UNICODE_strlen(s);
10164 while (p != s) {
10165 p--;
10166 if (*p == c)
10167 return (Py_UNICODE*)p;
10168 }
10169 return NULL;
10170}
10171
Victor Stinner71133ff2010-09-01 23:43:53 +000010172Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010173PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010174{
10175 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10176 Py_UNICODE *copy;
10177 Py_ssize_t size;
10178
10179 /* Ensure we won't overflow the size. */
10180 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10181 PyErr_NoMemory();
10182 return NULL;
10183 }
10184 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10185 size *= sizeof(Py_UNICODE);
10186 copy = PyMem_Malloc(size);
10187 if (copy == NULL) {
10188 PyErr_NoMemory();
10189 return NULL;
10190 }
10191 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10192 return copy;
10193}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010194
Georg Brandl66c221e2010-10-14 07:04:07 +000010195/* A _string module, to export formatter_parser and formatter_field_name_split
10196 to the string.Formatter class implemented in Python. */
10197
10198static PyMethodDef _string_methods[] = {
10199 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10200 METH_O, PyDoc_STR("split the argument as a field name")},
10201 {"formatter_parser", (PyCFunction) formatter_parser,
10202 METH_O, PyDoc_STR("parse the argument as a format string")},
10203 {NULL, NULL}
10204};
10205
10206static struct PyModuleDef _string_module = {
10207 PyModuleDef_HEAD_INIT,
10208 "_string",
10209 PyDoc_STR("string helper module"),
10210 0,
10211 _string_methods,
10212 NULL,
10213 NULL,
10214 NULL,
10215 NULL
10216};
10217
10218PyMODINIT_FUNC
10219PyInit__string(void)
10220{
10221 return PyModule_Create(&_string_module);
10222}
10223
10224
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010225#ifdef __cplusplus
10226}
10227#endif