blob: 7564b67a2157d2f0d608d89b39533c63e3f30afe [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Limit for the Unicode object free list: at most this many deallocated
   objects are kept around for reuse. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list keep-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; a non-zero entry marks whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149 PyObject **errorHandler,const char *encoding, const char *reason,
150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
Victor Stinner31be90b2010-04-22 19:38:16 +0000153static void raise_encode_exception(PyObject **exceptionObject,
154 const char *encoding,
155 const Py_UNICODE *unicode, Py_ssize_t size,
156 Py_ssize_t startpos, Py_ssize_t endpos,
157 const char *reason);
158
/* Fast detection of line break characters in the ASCII range.
   Indexed by code point 0..127; a non-zero entry marks a line break.
   The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrouf068f942010-01-13 14:19:12 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
Antoine Pitrouf068f942010-01-13 14:19:12 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
Benjamin Peterson29060642009-01-31 22:14:21 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252}
253
/* Non-zero if chr is in set: the bloom mask is a cheap pre-filter, the exact
   answer comes from unicode_member().  The whole expansion is parenthesized
   so the macro can be negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000278 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000309 Ux0000 terminated; some code (e.g. new_identifier)
310 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000313 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314
315*/
316
317static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319{
320 register PyUnicodeObject *unicode;
321
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 if (length == 0 && unicode_empty != NULL) {
324 Py_INCREF(unicode_empty);
325 return unicode_empty;
326 }
327
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000328 /* Ensure we won't overflow the size. */
329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330 return (PyUnicodeObject *)PyErr_NoMemory();
331 }
332
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000334 if (free_list) {
335 unicode = free_list;
336 free_list = *(PyUnicodeObject **)unicode;
337 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000338 if (unicode->str) {
339 /* Keep-Alive optimization: we only upsize the buffer,
340 never downsize it. */
341 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000342 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000343 PyObject_DEL(unicode->str);
344 unicode->str = NULL;
345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000346 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000350 }
351 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 if (unicode == NULL)
357 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 }
361
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 PyErr_NoMemory();
364 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000365 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000366 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000367 * the caller fails before initializing str -- unicode_resize()
368 * reads str[0], and the Keep-Alive optimization can keep memory
369 * allocated for str alive across a call to unicode_dealloc(unicode).
370 * We don't want unicode_resize to read uninitialized memory in
371 * that case.
372 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000373 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000377 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000378 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380
Benjamin Peterson29060642009-01-31 22:14:21 +0000381 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000382 /* XXX UNREF/NEWREF interface should be more symmetrical */
383 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000385 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389static
Guido van Rossum9475a232001-10-05 20:51:39 +0000390void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391{
Walter Dörwald16807132007-05-25 13:52:07 +0000392 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_NOT_INTERNED:
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_MORTAL:
397 /* revive dead object temporarily for DelItem */
398 Py_REFCNT(unicode) = 3;
399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400 Py_FatalError(
401 "deletion of interned string failed");
402 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 case SSTATE_INTERNED_IMMORTAL:
405 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406
Benjamin Peterson29060642009-01-31 22:14:21 +0000407 default:
408 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000409 }
410
Guido van Rossum604ddf82001-12-06 20:03:56 +0000411 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000413 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415 PyObject_DEL(unicode->str);
416 unicode->str = NULL;
417 unicode->length = 0;
418 }
419 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000420 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 }
422 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000423 *(PyUnicodeObject **)unicode = free_list;
424 free_list = unicode;
425 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426 }
427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyObject_DEL(unicode->str);
429 Py_XDECREF(unicode->defenc);
430 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432}
433
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436{
437 register PyUnicodeObject *v;
438
439 /* Argument checks */
440 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 PyErr_BadInternalCall();
442 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
449
450 /* Resizing unicode_empty and single character objects is not
451 possible since these are being shared. We simply return a fresh
452 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000453 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000454 (v == unicode_empty || v->length == 1)) {
455 PyUnicodeObject *w = _PyUnicode_New(length);
456 if (w == NULL)
457 return -1;
458 Py_UNICODE_COPY(w->str, v->str,
459 length < v->length ? length : v->length);
460 Py_DECREF(*unicode);
461 *unicode = w;
462 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 }
464
465 /* Note that we don't have to modify *unicode for unshared Unicode
466 objects, since we can modify them in-place. */
467 return unicode_resize(v, length);
468}
469
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477{
478 PyUnicodeObject *unicode;
479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects. */
482 if (u != NULL) {
483
Benjamin Peterson29060642009-01-31 22:14:21 +0000484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000489
490 /* Single character Unicode objects in the Latin-1 range are
491 shared when using this constructor */
492 if (size == 1 && *u < 256) {
493 unicode = unicode_latin1[*u];
494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = *u;
499 unicode_latin1[*u] = unicode;
500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 /* Copy the Unicode data into the new object */
511 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000512 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 return (PyObject *)unicode;
515}
516
Walter Dörwaldd2034312007-05-18 16:29:38 +0000517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518{
519 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000520
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 if (size < 0) {
522 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 return NULL;
525 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000526
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000528 some optimizations which share commonly used objects.
529 Also, this means the input must be UTF-8, so fall back to the
530 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (u != NULL) {
532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533 /* Optimization for empty strings */
534 if (size == 0 && unicode_empty != NULL) {
535 Py_INCREF(unicode_empty);
536 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000537 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000538
539 /* Single characters are shared when using this constructor.
540 Restrict to ASCII, since the input must be UTF-8. */
541 if (size == 1 && Py_CHARMASK(*u) < 128) {
542 unicode = unicode_latin1[Py_CHARMASK(*u)];
543 if (!unicode) {
544 unicode = _PyUnicode_New(1);
545 if (!unicode)
546 return NULL;
547 unicode->str[0] = Py_CHARMASK(*u);
548 unicode_latin1[Py_CHARMASK(*u)] = unicode;
549 }
550 Py_INCREF(unicode);
551 return (PyObject *)unicode;
552 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000553
554 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 }
556
Walter Dörwald55507312007-05-18 13:12:10 +0000557 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 if (!unicode)
559 return NULL;
560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 return (PyObject *)unicode;
562}
563
Walter Dörwaldd2034312007-05-18 16:29:38 +0000564PyObject *PyUnicode_FromString(const char *u)
565{
566 size_t size = strlen(u);
567 if (size > PY_SSIZE_T_MAX) {
568 PyErr_SetString(PyExc_OverflowError, "input too long");
569 return NULL;
570 }
571
572 return PyUnicode_FromStringAndSize(u, size);
573}
574
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575#ifdef HAVE_WCHAR_H
576
Mark Dickinson081dfee2009-03-18 14:47:41 +0000577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584 to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
588{
589 PyUnicodeObject *unicode;
590 register Py_ssize_t i;
591 Py_ssize_t alloc;
592 const wchar_t *orig_w;
593
594 if (w == NULL) {
595 if (size == 0)
596 return PyUnicode_FromStringAndSize(NULL, 0);
597 PyErr_BadInternalCall();
598 return NULL;
599 }
600
601 if (size == -1) {
602 size = wcslen(w);
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000643 if (size == 0)
644 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_BadInternalCall();
646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 }
648
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == -1) {
650 size = wcslen(w);
651 }
652
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 unicode = _PyUnicode_New(size);
654 if (!unicode)
655 return NULL;
656
657 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000660#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000662 register Py_UNICODE *u;
663 register Py_ssize_t i;
664 u = PyUnicode_AS_UNICODE(unicode);
665 for (i = size; i > 0; i--)
666 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668#endif
669
670 return (PyObject *)unicode;
671}
672
Mark Dickinson081dfee2009-03-18 14:47:41 +0000673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
Walter Dörwald346737f2007-05-31 10:44:43 +0000677static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000680{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000681 *fmt++ = '%';
682 if (width) {
683 if (zeropad)
684 *fmt++ = '0';
685 fmt += sprintf(fmt, "%d", width);
686 }
687 if (precision)
688 fmt += sprintf(fmt, ".%d", precision);
689 if (longflag)
690 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000691 else if (longlongflag) {
692 /* longlongflag should only ever be nonzero on machines with
693 HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695 char *f = PY_FORMAT_LONG_LONG;
696 while (*f)
697 *fmt++ = *f++;
698#else
699 /* we shouldn't ever get here */
700 assert(0);
701 *fmt++ = 'l';
702#endif
703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000704 else if (size_tflag) {
705 char *f = PY_FORMAT_SIZE_T;
706 while (*f)
707 *fmt++ = *f++;
708 }
709 *fmt++ = c;
710 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000711}
712
Walter Dörwaldd2034312007-05-18 16:29:38 +0000713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
714
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000715/* size of fixed-size buffer for formatting single arguments */
716#define ITEM_BUFFER_LEN 21
717/* maximum number of characters required for output of %ld. 21 characters
718 allows for 64-bit integers (in decimal) and an optional sign. */
719#define MAX_LONG_CHARS 21
720/* maximum number of characters required for output of %lld.
721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000728 va_list count;
729 Py_ssize_t callcount = 0;
730 PyObject **callresults = NULL;
731 PyObject **callresult = NULL;
732 Py_ssize_t n = 0;
733 int width = 0;
734 int precision = 0;
735 int zeropad;
736 const char* f;
737 Py_UNICODE *s;
738 PyObject *string;
739 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000740 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000741 /* use abuffer instead of buffer, if we need more space
742 * (which can happen if there's a format specifier with width). */
743 char *abuffer = NULL;
744 char *realbuffer;
745 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000749 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000767 else if (128 <= (unsigned char)*f) {
768 PyErr_Format(PyExc_ValueError,
769 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000770 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000771 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000772 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 }
775 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000776 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000777 if (callcount) {
778 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
779 if (!callresults) {
780 PyErr_NoMemory();
781 return NULL;
782 }
783 callresult = callresults;
784 }
785 /* step 3: figure out how large a buffer we need */
786 for (f = format; *f; f++) {
787 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000788#ifdef HAVE_LONG_LONG
789 int longlongflag = 0;
790#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 const char* p = f;
792 width = 0;
793 while (ISDIGIT((unsigned)*f))
794 width = (width*10) + *f++ - '0';
795 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
796 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
799 * they don't affect the amount of space we reserve.
800 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000801 if (*f == 'l') {
802 if (f[1] == 'd' || f[1] == 'u') {
803 ++f;
804 }
805#ifdef HAVE_LONG_LONG
806 else if (f[1] == 'l' &&
807 (f[2] == 'd' || f[2] == 'u')) {
808 longlongflag = 1;
809 f += 2;
810 }
811#endif
812 }
813 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000814 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
Benjamin Peterson14339b62009-01-31 16:36:08 +0000817 switch (*f) {
818 case 'c':
819 (void)va_arg(count, int);
820 /* fall through... */
821 case '%':
822 n++;
823 break;
824 case 'd': case 'u': case 'i': case 'x':
825 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000826#ifdef HAVE_LONG_LONG
827 if (longlongflag) {
828 if (width < MAX_LONG_LONG_CHARS)
829 width = MAX_LONG_LONG_CHARS;
830 }
831 else
832#endif
833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
834 including sign. Decimal takes the most space. This
835 isn't enough for octal. If a width is specified we
836 need more (which we allocate later). */
837 if (width < MAX_LONG_CHARS)
838 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000839 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000840 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 if (abuffersize < width)
842 abuffersize = width;
843 break;
844 case 's':
845 {
846 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000847 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000848 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
849 if (!str)
850 goto fail;
851 n += PyUnicode_GET_SIZE(str);
852 /* Remember the str and switch to the next slot */
853 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000854 break;
855 }
856 case 'U':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 assert(obj && PyUnicode_Check(obj));
860 n += PyUnicode_GET_SIZE(obj);
861 break;
862 }
863 case 'V':
864 {
865 PyObject *obj = va_arg(count, PyObject *);
866 const char *str = va_arg(count, const char *);
867 assert(obj || str);
868 assert(!obj || PyUnicode_Check(obj));
869 if (obj)
870 n += PyUnicode_GET_SIZE(obj);
871 else
872 n += strlen(str);
873 break;
874 }
875 case 'S':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 PyObject *str;
879 assert(obj);
880 str = PyObject_Str(obj);
881 if (!str)
882 goto fail;
883 n += PyUnicode_GET_SIZE(str);
884 /* Remember the str and switch to the next slot */
885 *callresult++ = str;
886 break;
887 }
888 case 'R':
889 {
890 PyObject *obj = va_arg(count, PyObject *);
891 PyObject *repr;
892 assert(obj);
893 repr = PyObject_Repr(obj);
894 if (!repr)
895 goto fail;
896 n += PyUnicode_GET_SIZE(repr);
897 /* Remember the repr and switch to the next slot */
898 *callresult++ = repr;
899 break;
900 }
901 case 'A':
902 {
903 PyObject *obj = va_arg(count, PyObject *);
904 PyObject *ascii;
905 assert(obj);
906 ascii = PyObject_ASCII(obj);
907 if (!ascii)
908 goto fail;
909 n += PyUnicode_GET_SIZE(ascii);
910 /* Remember the repr and switch to the next slot */
911 *callresult++ = ascii;
912 break;
913 }
914 case 'p':
915 (void) va_arg(count, int);
916 /* maximum 64-bit pointer representation:
917 * 0xffffffffffffffff
918 * so 19 characters is enough.
919 * XXX I count 18 -- what's the extra for?
920 */
921 n += 19;
922 break;
923 default:
924 /* if we stumble upon an unknown
925 formatting code, copy the rest of
926 the format string to the output
927 string. (we cannot just skip the
928 code, since there's no way to know
929 what's in the argument list) */
930 n += strlen(p);
931 goto expand;
932 }
933 } else
934 n++;
935 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000937 if (abuffersize > ITEM_BUFFER_LEN) {
938 /* add 1 for sprintf's trailing null byte */
939 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000940 if (!abuffer) {
941 PyErr_NoMemory();
942 goto fail;
943 }
944 realbuffer = abuffer;
945 }
946 else
947 realbuffer = buffer;
948 /* step 4: fill the buffer */
949 /* Since we've analyzed how much space we need for the worst case,
950 we don't have to resize the string.
951 There can be no errors beyond this point. */
952 string = PyUnicode_FromUnicode(NULL, n);
953 if (!string)
954 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000955
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 s = PyUnicode_AS_UNICODE(string);
957 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000958
Benjamin Peterson14339b62009-01-31 16:36:08 +0000959 for (f = format; *f; f++) {
960 if (*f == '%') {
961 const char* p = f++;
962 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000963 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 int size_tflag = 0;
965 zeropad = (*f == '0');
966 /* parse the width.precision part */
967 width = 0;
968 while (ISDIGIT((unsigned)*f))
969 width = (width*10) + *f++ - '0';
970 precision = 0;
971 if (*f == '.') {
972 f++;
973 while (ISDIGIT((unsigned)*f))
974 precision = (precision*10) + *f++ - '0';
975 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000976 /* Handle %ld, %lu, %lld and %llu. */
977 if (*f == 'l') {
978 if (f[1] == 'd' || f[1] == 'u') {
979 longflag = 1;
980 ++f;
981 }
982#ifdef HAVE_LONG_LONG
983 else if (f[1] == 'l' &&
984 (f[2] == 'd' || f[2] == 'u')) {
985 longlongflag = 1;
986 f += 2;
987 }
988#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 }
990 /* handle the size_t flag. */
991 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
992 size_tflag = 1;
993 ++f;
994 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 switch (*f) {
997 case 'c':
998 *s++ = va_arg(vargs, int);
999 break;
1000 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1002 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001003 if (longflag)
1004 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001005#ifdef HAVE_LONG_LONG
1006 else if (longlongflag)
1007 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1008#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001009 else if (size_tflag)
1010 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1011 else
1012 sprintf(realbuffer, fmt, va_arg(vargs, int));
1013 appendstring(realbuffer);
1014 break;
1015 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001016 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1017 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 if (longflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001020#ifdef HAVE_LONG_LONG
1021 else if (longlongflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs,
1023 unsigned PY_LONG_LONG));
1024#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001025 else if (size_tflag)
1026 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1027 else
1028 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1029 appendstring(realbuffer);
1030 break;
1031 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 sprintf(realbuffer, fmt, va_arg(vargs, int));
1034 appendstring(realbuffer);
1035 break;
1036 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001037 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 's':
1042 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001043 /* unused, since we already have the result */
1044 (void) va_arg(vargs, char *);
1045 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1046 PyUnicode_GET_SIZE(*callresult));
1047 s += PyUnicode_GET_SIZE(*callresult);
1048 /* We're done with the unicode()/repr() => forget it */
1049 Py_DECREF(*callresult);
1050 /* switch to next unicode()/repr() result */
1051 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001052 break;
1053 }
1054 case 'U':
1055 {
1056 PyObject *obj = va_arg(vargs, PyObject *);
1057 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1058 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1059 s += size;
1060 break;
1061 }
1062 case 'V':
1063 {
1064 PyObject *obj = va_arg(vargs, PyObject *);
1065 const char *str = va_arg(vargs, const char *);
1066 if (obj) {
1067 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1068 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1069 s += size;
1070 } else {
1071 appendstring(str);
1072 }
1073 break;
1074 }
1075 case 'S':
1076 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001077 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 {
1079 Py_UNICODE *ucopy;
1080 Py_ssize_t usize;
1081 Py_ssize_t upos;
1082 /* unused, since we already have the result */
1083 (void) va_arg(vargs, PyObject *);
1084 ucopy = PyUnicode_AS_UNICODE(*callresult);
1085 usize = PyUnicode_GET_SIZE(*callresult);
1086 for (upos = 0; upos<usize;)
1087 *s++ = ucopy[upos++];
1088 /* We're done with the unicode()/repr() => forget it */
1089 Py_DECREF(*callresult);
1090 /* switch to next unicode()/repr() result */
1091 ++callresult;
1092 break;
1093 }
1094 case 'p':
1095 sprintf(buffer, "%p", va_arg(vargs, void*));
1096 /* %p is ill-defined: ensure leading 0x. */
1097 if (buffer[1] == 'X')
1098 buffer[1] = 'x';
1099 else if (buffer[1] != 'x') {
1100 memmove(buffer+2, buffer, strlen(buffer)+1);
1101 buffer[0] = '0';
1102 buffer[1] = 'x';
1103 }
1104 appendstring(buffer);
1105 break;
1106 case '%':
1107 *s++ = '%';
1108 break;
1109 default:
1110 appendstring(p);
1111 goto end;
1112 }
Victor Stinner1205f272010-09-11 00:54:47 +00001113 }
Victor Stinner1205f272010-09-11 00:54:47 +00001114 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001115 *s++ = *f;
1116 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001117
Benjamin Peterson29060642009-01-31 22:14:21 +00001118 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001119 if (callresults)
1120 PyObject_Free(callresults);
1121 if (abuffer)
1122 PyObject_Free(abuffer);
1123 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1124 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001125 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001126 if (callresults) {
1127 PyObject **callresult2 = callresults;
1128 while (callresult2 < callresult) {
1129 Py_DECREF(*callresult2);
1130 ++callresult2;
1131 }
1132 PyObject_Free(callresults);
1133 }
1134 if (abuffer)
1135 PyObject_Free(abuffer);
1136 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001137}
1138
1139#undef appendstring
1140
1141PyObject *
1142PyUnicode_FromFormat(const char *format, ...)
1143{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 PyObject* ret;
1145 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146
1147#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001148 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001149#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001150 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001151#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 ret = PyUnicode_FromFormatV(format, vargs);
1153 va_end(vargs);
1154 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
Victor Stinner5593d8a2010-10-02 11:11:27 +00001157/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1158 convert a Unicode object to a wide character string.
1159
1160 - If w is NULL: return the number of wide characters (including the nul
1161 character) required to convert the unicode object. Ignore size argument.
1162
1163 - Otherwise: return the number of wide characters (excluding the nul
1164 character) written into w. Write at most size wide characters (including
1165 the nul character). */
1166static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001167unicode_aswidechar(PyUnicodeObject *unicode,
1168 wchar_t *w,
1169 Py_ssize_t size)
1170{
1171#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001172 Py_ssize_t res;
1173 if (w != NULL) {
1174 res = PyUnicode_GET_SIZE(unicode);
1175 if (size > res)
1176 size = res + 1;
1177 else
1178 res = size;
1179 memcpy(w, unicode->str, size * sizeof(wchar_t));
1180 return res;
1181 }
1182 else
1183 return PyUnicode_GET_SIZE(unicode) + 1;
1184#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1185 register const Py_UNICODE *u;
1186 const Py_UNICODE *uend;
1187 const wchar_t *worig, *wend;
1188 Py_ssize_t nchar;
1189
Victor Stinner137c34c2010-09-29 10:25:54 +00001190 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001191 uend = u + PyUnicode_GET_SIZE(unicode);
1192 if (w != NULL) {
1193 worig = w;
1194 wend = w + size;
1195 while (u != uend && w != wend) {
1196 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1197 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1198 {
1199 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1200 u += 2;
1201 }
1202 else {
1203 *w = *u;
1204 u++;
1205 }
1206 w++;
1207 }
1208 if (w != wend)
1209 *w = L'\0';
1210 return w - worig;
1211 }
1212 else {
1213 nchar = 1; /* nul character at the end */
1214 while (u != uend) {
1215 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1216 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1217 u += 2;
1218 else
1219 u++;
1220 nchar++;
1221 }
1222 }
1223 return nchar;
1224#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1225 register Py_UNICODE *u, *uend, ordinal;
1226 register Py_ssize_t i;
1227 wchar_t *worig, *wend;
1228 Py_ssize_t nchar;
1229
1230 u = PyUnicode_AS_UNICODE(unicode);
1231 uend = u + PyUnicode_GET_SIZE(u);
1232 if (w != NULL) {
1233 worig = w;
1234 wend = w + size;
1235 while (u != uend && w != wend) {
1236 ordinal = *u;
1237 if (ordinal > 0xffff) {
1238 ordinal -= 0x10000;
1239 *w++ = 0xD800 | (ordinal >> 10);
1240 *w++ = 0xDC00 | (ordinal & 0x3FF);
1241 }
1242 else
1243 *w++ = ordinal;
1244 u++;
1245 }
1246 if (w != wend)
1247 *w = 0;
1248 return w - worig;
1249 }
1250 else {
1251 nchar = 1; /* nul character */
1252 while (u != uend) {
1253 if (*u > 0xffff)
1254 nchar += 2;
1255 else
1256 nchar++;
1257 u++;
1258 }
1259 return nchar;
1260 }
1261#else
1262# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001263#endif
1264}
1265
1266Py_ssize_t
1267PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1268 wchar_t *w,
1269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270{
1271 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 PyErr_BadInternalCall();
1273 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00001275 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276}
1277
Victor Stinner137c34c2010-09-29 10:25:54 +00001278wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001279PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001280 Py_ssize_t *size)
1281{
1282 wchar_t* buffer;
1283 Py_ssize_t buflen;
1284
1285 if (unicode == NULL) {
1286 PyErr_BadInternalCall();
1287 return NULL;
1288 }
1289
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001290 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001291 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001292 PyErr_NoMemory();
1293 return NULL;
1294 }
1295
Victor Stinner137c34c2010-09-29 10:25:54 +00001296 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1297 if (buffer == NULL) {
1298 PyErr_NoMemory();
1299 return NULL;
1300 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001301 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001302 if (size != NULL)
1303 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001304 return buffer;
1305}
1306
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307#endif
1308
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001309PyObject *PyUnicode_FromOrdinal(int ordinal)
1310{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001311 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001312
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001313 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001314 PyErr_SetString(PyExc_ValueError,
1315 "chr() arg not in range(0x110000)");
1316 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001317 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001318
1319#ifndef Py_UNICODE_WIDE
1320 if (ordinal > 0xffff) {
1321 ordinal -= 0x10000;
1322 s[0] = 0xD800 | (ordinal >> 10);
1323 s[1] = 0xDC00 | (ordinal & 0x3FF);
1324 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001325 }
1326#endif
1327
Hye-Shik Chang40574832004-04-06 07:24:51 +00001328 s[0] = (Py_UNICODE)ordinal;
1329 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001330}
1331
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332PyObject *PyUnicode_FromObject(register PyObject *obj)
1333{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001334 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001336 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001337 Py_INCREF(obj);
1338 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001339 }
1340 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001341 /* For a Unicode subtype that's not a Unicode object,
1342 return a true Unicode object with the same data. */
1343 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1344 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001345 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001346 PyErr_Format(PyExc_TypeError,
1347 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001348 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001349 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001350}
1351
1352PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 const char *encoding,
1354 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001355{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001356 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001357 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001358
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001360 PyErr_BadInternalCall();
1361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001363
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001364 /* Decoding bytes objects is the most common case and should be fast */
1365 if (PyBytes_Check(obj)) {
1366 if (PyBytes_GET_SIZE(obj) == 0) {
1367 Py_INCREF(unicode_empty);
1368 v = (PyObject *) unicode_empty;
1369 }
1370 else {
1371 v = PyUnicode_Decode(
1372 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1373 encoding, errors);
1374 }
1375 return v;
1376 }
1377
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001378 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001379 PyErr_SetString(PyExc_TypeError,
1380 "decoding str is not supported");
1381 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001382 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001383
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001384 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1385 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1386 PyErr_Format(PyExc_TypeError,
1387 "coercing to str: need bytes, bytearray "
1388 "or buffer-like object, %.80s found",
1389 Py_TYPE(obj)->tp_name);
1390 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001391 }
Tim Petersced69f82003-09-16 20:30:58 +00001392
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001393 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001395 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 }
Tim Petersced69f82003-09-16 20:30:58 +00001397 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001398 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001399
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001400 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001401 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001402}
1403
/* Copy `encoding` into `lower`, converting upper case letters to lower
   case and replacing '_' with '-', in order to catch e.g. UTF_8.

   `lower` is a buffer of `lower_len` chars.  Return 1 on success (`lower`
   is then nul-terminated), 0 if the encoding name is longer than
   lower_len-1 characters (`lower` is then left without terminator). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last usable slot, reserved for the nul character */
    char *limit = &lower[lower_len - 1];

    for (; *src != '\0'; src++) {
        if (dst == limit)
            return 0;
        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1436
1437PyObject *PyUnicode_Decode(const char *s,
1438 Py_ssize_t size,
1439 const char *encoding,
1440 const char *errors)
1441{
1442 PyObject *buffer = NULL, *unicode;
1443 Py_buffer info;
1444 char lower[11]; /* Enough for any encoding shortcut */
1445
1446 if (encoding == NULL)
1447 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001448
1449 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001450 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1451 if (strcmp(lower, "utf-8") == 0)
1452 return PyUnicode_DecodeUTF8(s, size, errors);
1453 else if ((strcmp(lower, "latin-1") == 0) ||
1454 (strcmp(lower, "iso-8859-1") == 0))
1455 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001456#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001457 else if (strcmp(lower, "mbcs") == 0)
1458 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001459#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001460 else if (strcmp(lower, "ascii") == 0)
1461 return PyUnicode_DecodeASCII(s, size, errors);
1462 else if (strcmp(lower, "utf-16") == 0)
1463 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1464 else if (strcmp(lower, "utf-32") == 0)
1465 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467
1468 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001469 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001470 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001471 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001472 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473 if (buffer == NULL)
1474 goto onError;
1475 unicode = PyCodec_Decode(buffer, encoding, errors);
1476 if (unicode == NULL)
1477 goto onError;
1478 if (!PyUnicode_Check(unicode)) {
1479 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001480 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001481 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 Py_DECREF(unicode);
1483 goto onError;
1484 }
1485 Py_DECREF(buffer);
1486 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001487
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 Py_XDECREF(buffer);
1490 return NULL;
1491}
1492
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001493PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1494 const char *encoding,
1495 const char *errors)
1496{
1497 PyObject *v;
1498
1499 if (!PyUnicode_Check(unicode)) {
1500 PyErr_BadArgument();
1501 goto onError;
1502 }
1503
1504 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001505 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001506
1507 /* Decode via the codec registry */
1508 v = PyCodec_Decode(unicode, encoding, errors);
1509 if (v == NULL)
1510 goto onError;
1511 return v;
1512
Benjamin Peterson29060642009-01-31 22:14:21 +00001513 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001514 return NULL;
1515}
1516
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001517PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1518 const char *encoding,
1519 const char *errors)
1520{
1521 PyObject *v;
1522
1523 if (!PyUnicode_Check(unicode)) {
1524 PyErr_BadArgument();
1525 goto onError;
1526 }
1527
1528 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001529 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001530
1531 /* Decode via the codec registry */
1532 v = PyCodec_Decode(unicode, encoding, errors);
1533 if (v == NULL)
1534 goto onError;
1535 if (!PyUnicode_Check(v)) {
1536 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001537 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001538 Py_TYPE(v)->tp_name);
1539 Py_DECREF(v);
1540 goto onError;
1541 }
1542 return v;
1543
Benjamin Peterson29060642009-01-31 22:14:21 +00001544 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001545 return NULL;
1546}
1547
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001549 Py_ssize_t size,
1550 const char *encoding,
1551 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001552{
1553 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001554
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 unicode = PyUnicode_FromUnicode(s, size);
1556 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1559 Py_DECREF(unicode);
1560 return v;
1561}
1562
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001563PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1564 const char *encoding,
1565 const char *errors)
1566{
1567 PyObject *v;
1568
1569 if (!PyUnicode_Check(unicode)) {
1570 PyErr_BadArgument();
1571 goto onError;
1572 }
1573
1574 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001576
1577 /* Encode via the codec registry */
1578 v = PyCodec_Encode(unicode, encoding, errors);
1579 if (v == NULL)
1580 goto onError;
1581 return v;
1582
Benjamin Peterson29060642009-01-31 22:14:21 +00001583 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001584 return NULL;
1585}
1586
Victor Stinnerae6265f2010-05-15 16:27:27 +00001587PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1588{
Victor Stinner313a1202010-06-11 23:56:51 +00001589 if (Py_FileSystemDefaultEncoding) {
1590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1591 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1592 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1593 PyUnicode_GET_SIZE(unicode),
1594 NULL);
1595#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001596 return PyUnicode_AsEncodedString(unicode,
1597 Py_FileSystemDefaultEncoding,
1598 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001599 }
1600 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001601 /* locale encoding with surrogateescape */
1602 wchar_t *wchar;
1603 char *bytes;
1604 PyObject *bytes_obj;
1605
1606 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1607 if (wchar == NULL)
1608 return NULL;
1609 bytes = _Py_wchar2char(wchar);
1610 PyMem_Free(wchar);
1611 if (bytes == NULL)
1612 return NULL;
1613
1614 bytes_obj = PyBytes_FromString(bytes);
1615 PyMem_Free(bytes);
1616 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001617 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00001618}
1619
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1621 const char *encoding,
1622 const char *errors)
1623{
1624 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001625 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001626
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 if (!PyUnicode_Check(unicode)) {
1628 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 }
Fred Drakee4315f52000-05-09 19:53:39 +00001631
Tim Petersced69f82003-09-16 20:30:58 +00001632 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001633 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001634
1635 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001636 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1637 if (strcmp(lower, "utf-8") == 0)
1638 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1639 PyUnicode_GET_SIZE(unicode),
1640 errors);
1641 else if ((strcmp(lower, "latin-1") == 0) ||
1642 (strcmp(lower, "iso-8859-1") == 0))
1643 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1644 PyUnicode_GET_SIZE(unicode),
1645 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001646#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001647 else if (strcmp(lower, "mbcs") == 0)
1648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001651#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001652 else if (strcmp(lower, "ascii") == 0)
1653 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1654 PyUnicode_GET_SIZE(unicode),
1655 errors);
1656 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001657 /* During bootstrap, we may need to find the encodings
1658 package, to load the file system encoding, and require the
1659 file system encoding in order to load the encodings
1660 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001661
Victor Stinner59e62db2010-05-15 13:14:32 +00001662 Break out of this dependency by assuming that the path to
1663 the encodings module is ASCII-only. XXX could try wcstombs
1664 instead, if the file system encoding is the locale's
1665 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001666 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001667 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1668 !PyThreadState_GET()->interp->codecs_initialized)
1669 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1670 PyUnicode_GET_SIZE(unicode),
1671 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672
1673 /* Encode via the codec registry */
1674 v = PyCodec_Encode(unicode, encoding, errors);
1675 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001676 return NULL;
1677
1678 /* The normal path */
1679 if (PyBytes_Check(v))
1680 return v;
1681
1682 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001683 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001684 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001685 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001686
1687 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1688 "encoder %s returned bytearray instead of bytes",
1689 encoding);
1690 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001691 Py_DECREF(v);
1692 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001693 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001694
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001695 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1696 Py_DECREF(v);
1697 return b;
1698 }
1699
1700 PyErr_Format(PyExc_TypeError,
1701 "encoder did not return a bytes object (type=%.400s)",
1702 Py_TYPE(v)->tp_name);
1703 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001704 return NULL;
1705}
1706
1707PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1708 const char *encoding,
1709 const char *errors)
1710{
1711 PyObject *v;
1712
1713 if (!PyUnicode_Check(unicode)) {
1714 PyErr_BadArgument();
1715 goto onError;
1716 }
1717
1718 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001719 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001720
1721 /* Encode via the codec registry */
1722 v = PyCodec_Encode(unicode, encoding, errors);
1723 if (v == NULL)
1724 goto onError;
1725 if (!PyUnicode_Check(v)) {
1726 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001727 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001728 Py_TYPE(v)->tp_name);
1729 Py_DECREF(v);
1730 goto onError;
1731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001733
Benjamin Peterson29060642009-01-31 22:14:21 +00001734 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 return NULL;
1736}
1737
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001738PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001739 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001740{
1741 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001742 if (v)
1743 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001744 if (errors != NULL)
1745 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001746 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001747 PyUnicode_GET_SIZE(unicode),
1748 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001749 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001750 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001751 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001752 return v;
1753}
1754
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001755PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001756PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001757 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001758 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1759}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001760
Christian Heimes5894ba72007-11-04 11:43:14 +00001761PyObject*
1762PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1763{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001764 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1765 can be undefined. If it is case, decode using UTF-8. The following assumes
1766 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1767 bootstrapping process where the codecs aren't ready yet.
1768 */
1769 if (Py_FileSystemDefaultEncoding) {
1770#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001771 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001772 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001773 }
1774#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001775 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001776 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001777 }
1778#endif
1779 return PyUnicode_Decode(s, size,
1780 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001781 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001782 }
1783 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001784 /* locale encoding with surrogateescape */
1785 wchar_t *wchar;
1786 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001787 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001788
1789 if (s[size] != '\0' || size != strlen(s)) {
1790 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1791 return NULL;
1792 }
1793
Victor Stinner168e1172010-10-16 23:16:16 +00001794 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001795 if (wchar == NULL)
1796 return NULL;
1797
Victor Stinner168e1172010-10-16 23:16:16 +00001798 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001799 PyMem_Free(wchar);
1800 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001801 }
1802}
1803
Martin v. Löwis011e8422009-05-05 04:43:17 +00001804
1805int
1806PyUnicode_FSConverter(PyObject* arg, void* addr)
1807{
1808 PyObject *output = NULL;
1809 Py_ssize_t size;
1810 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001811 if (arg == NULL) {
1812 Py_DECREF(*(PyObject**)addr);
1813 return 1;
1814 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001815 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001816 output = arg;
1817 Py_INCREF(output);
1818 }
1819 else {
1820 arg = PyUnicode_FromObject(arg);
1821 if (!arg)
1822 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001823 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001824 Py_DECREF(arg);
1825 if (!output)
1826 return 0;
1827 if (!PyBytes_Check(output)) {
1828 Py_DECREF(output);
1829 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1830 return 0;
1831 }
1832 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001833 size = PyBytes_GET_SIZE(output);
1834 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001835 if (size != strlen(data)) {
1836 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1837 Py_DECREF(output);
1838 return 0;
1839 }
1840 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001841 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001842}
1843
1844
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001845int
1846PyUnicode_FSDecoder(PyObject* arg, void* addr)
1847{
1848 PyObject *output = NULL;
1849 Py_ssize_t size;
1850 void *data;
1851 if (arg == NULL) {
1852 Py_DECREF(*(PyObject**)addr);
1853 return 1;
1854 }
1855 if (PyUnicode_Check(arg)) {
1856 output = arg;
1857 Py_INCREF(output);
1858 }
1859 else {
1860 arg = PyBytes_FromObject(arg);
1861 if (!arg)
1862 return 0;
1863 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1864 PyBytes_GET_SIZE(arg));
1865 Py_DECREF(arg);
1866 if (!output)
1867 return 0;
1868 if (!PyUnicode_Check(output)) {
1869 Py_DECREF(output);
1870 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1871 return 0;
1872 }
1873 }
1874 size = PyUnicode_GET_SIZE(output);
1875 data = PyUnicode_AS_UNICODE(output);
1876 if (size != Py_UNICODE_strlen(data)) {
1877 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1878 Py_DECREF(output);
1879 return 0;
1880 }
1881 *(PyObject**)addr = output;
1882 return Py_CLEANUP_SUPPORTED;
1883}
1884
1885
Martin v. Löwis5b222132007-06-10 09:51:05 +00001886char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001887_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001888{
Christian Heimesf3863112007-11-22 07:46:41 +00001889 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001890 if (!PyUnicode_Check(unicode)) {
1891 PyErr_BadArgument();
1892 return NULL;
1893 }
Christian Heimesf3863112007-11-22 07:46:41 +00001894 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1895 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001896 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001897 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001898 *psize = PyBytes_GET_SIZE(bytes);
1899 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001900}
1901
1902char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001903_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001904{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001905 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001906}
1907
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1909{
1910 if (!PyUnicode_Check(unicode)) {
1911 PyErr_BadArgument();
1912 goto onError;
1913 }
1914 return PyUnicode_AS_UNICODE(unicode);
1915
Benjamin Peterson29060642009-01-31 22:14:21 +00001916 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 return NULL;
1918}
1919
Martin v. Löwis18e16552006-02-15 17:27:45 +00001920Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921{
1922 if (!PyUnicode_Check(unicode)) {
1923 PyErr_BadArgument();
1924 goto onError;
1925 }
1926 return PyUnicode_GET_SIZE(unicode);
1927
Benjamin Peterson29060642009-01-31 22:14:21 +00001928 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 return -1;
1930}
1931
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is a constant and must not be modified. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1936
Victor Stinner554f3f02010-06-16 23:33:54 +00001937/* create or adjust a UnicodeDecodeError */
1938static void
1939make_decode_exception(PyObject **exceptionObject,
1940 const char *encoding,
1941 const char *input, Py_ssize_t length,
1942 Py_ssize_t startpos, Py_ssize_t endpos,
1943 const char *reason)
1944{
1945 if (*exceptionObject == NULL) {
1946 *exceptionObject = PyUnicodeDecodeError_Create(
1947 encoding, input, length, startpos, endpos, reason);
1948 }
1949 else {
1950 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1951 goto onError;
1952 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1953 goto onError;
1954 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1955 goto onError;
1956 }
1957 return;
1958
1959onError:
1960 Py_DECREF(*exceptionObject);
1961 *exceptionObject = NULL;
1962}
1963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001964/* error handling callback helper:
1965 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001966 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001967 and adjust various state variables.
1968 return 0 on success, -1 on error
1969*/
1970
1971static
1972int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001973 const char *encoding, const char *reason,
1974 const char **input, const char **inend, Py_ssize_t *startinpos,
1975 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1976 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001977{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001978 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001979
1980 PyObject *restuple = NULL;
1981 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001982 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001983 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001984 Py_ssize_t requiredsize;
1985 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001986 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001987 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001988 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 int res = -1;
1990
1991 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001992 *errorHandler = PyCodec_LookupError(errors);
1993 if (*errorHandler == NULL)
1994 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001995 }
1996
Victor Stinner554f3f02010-06-16 23:33:54 +00001997 make_decode_exception(exceptionObject,
1998 encoding,
1999 *input, *inend - *input,
2000 *startinpos, *endinpos,
2001 reason);
2002 if (*exceptionObject == NULL)
2003 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004
2005 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2006 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002007 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002009 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002010 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 }
2012 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002013 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002014
2015 /* Copy back the bytes variables, which might have been modified by the
2016 callback */
2017 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2018 if (!inputobj)
2019 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002020 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002021 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002022 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002023 *input = PyBytes_AS_STRING(inputobj);
2024 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002025 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002026 /* we can DECREF safely, as the exception has another reference,
2027 so the object won't go away. */
2028 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002032 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002033 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2034 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002036
2037 /* need more space? (at least enough for what we
2038 have+the replacement+the rest of the string (starting
2039 at the new input position), so we won't have to check space
2040 when there are no errors in the rest of the string) */
2041 repptr = PyUnicode_AS_UNICODE(repunicode);
2042 repsize = PyUnicode_GET_SIZE(repunicode);
2043 requiredsize = *outpos + repsize + insize-newpos;
2044 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 if (requiredsize<2*outsize)
2046 requiredsize = 2*outsize;
2047 if (_PyUnicode_Resize(output, requiredsize) < 0)
2048 goto onError;
2049 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 }
2051 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002052 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 Py_UNICODE_COPY(*outptr, repptr, repsize);
2054 *outptr += repsize;
2055 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 /* we made it! */
2058 res = 0;
2059
Benjamin Peterson29060642009-01-31 22:14:21 +00002060 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 Py_XDECREF(restuple);
2062 return res;
2063}
2064
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002065/* --- UTF-7 Codec -------------------------------------------------------- */
2066
Antoine Pitrou244651a2009-05-04 18:56:13 +00002067/* See RFC2152 for details. We encode conservatively and decode liberally. */
2068
/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so pass expressions without side effects. */

/* Is c a base-64 character?  (letters, digits, '+' and '/') */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ((c) >= '0' && (c) <= '9') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= 'A' && (c) <= 'Z'))

/* Given that c is a base-64 character, what is its base-64 value?
   ('A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else -> 63) */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                                       \
     "abcdefghijklmnopqrstuvwxyz"                                       \
     "0123456789+/"[(n) & 63])
2091
/* DECODE_DIRECT: does this byte, encountered in a UTF-7 string outside
 * a base64 section, decode as itself?  We are permissive on decoding:
 * every ASCII byte (0..127) decodes to itself except '+', which begins
 * a base64 string.  The argument may be evaluated more than once. */

#define DECODE_DIRECT(c)                        \
    ((c) != '+' && (c) <= 127)
2099
2100/* The UTF-7 encoder treats ASCII characters differently according to
2101 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
2102 * the above). See RFC2152. This array identifies these different
2103 * sets:
2104 * 0 : "Set D"
2105 * alphanumeric and '(),-./:?
2106 * 1 : "Set O"
2107 * !"#$%&*;<=>@[]^_`{|}
2108 * 2 : "whitespace"
2109 * ht nl cr sp
2110 * 3 : special (must be base64 encoded)
2111 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
2112 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002113
/* Category of each ASCII character for the UTF-7 encoder:
 *   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base64
 *   encoded).  Indexed by character code 0..127. */
static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         character code; only 1..127 can ever be direct
 *   directO   non-zero if Set O characters are encoded as themselves
 *   directWS  non-zero if whitespace is encoded as itself
 *
 * The flag arguments are parenthesized so that flag expressions with
 * low-precedence operators (e.g. `a || b`) keep their meaning.  c may
 * be evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002146PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002147 Py_ssize_t size,
2148 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002149{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002150 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2151}
2152
Antoine Pitrou244651a2009-05-04 18:56:13 +00002153/* The decoder. The only state we preserve is our read position,
2154 * i.e. how many characters we have consumed. So if we end in the
2155 * middle of a shift sequence we have to back off the read position
2156 * and the output to the beginning of the sequence, otherwise we lose
2157 * all the shift state (seen bits, number of bits seen, high
2158 * surrogate). */
2159
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002160PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002161 Py_ssize_t size,
2162 const char *errors,
2163 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002164{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002166 Py_ssize_t startinpos;
2167 Py_ssize_t endinpos;
2168 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169 const char *e;
2170 PyUnicodeObject *unicode;
2171 Py_UNICODE *p;
2172 const char *errmsg = "";
2173 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002174 Py_UNICODE *shiftOutStart;
2175 unsigned int base64bits = 0;
2176 unsigned long base64buffer = 0;
2177 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002178 PyObject *errorHandler = NULL;
2179 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002180
2181 unicode = _PyUnicode_New(size);
2182 if (!unicode)
2183 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002184 if (size == 0) {
2185 if (consumed)
2186 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002187 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002188 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002189
2190 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002191 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002192 e = s + size;
2193
2194 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002196 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002197 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002198
Antoine Pitrou244651a2009-05-04 18:56:13 +00002199 if (inShift) { /* in a base-64 section */
2200 if (IS_BASE64(ch)) { /* consume a base-64 character */
2201 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2202 base64bits += 6;
2203 s++;
2204 if (base64bits >= 16) {
2205 /* we have enough bits for a UTF-16 value */
2206 Py_UNICODE outCh = (Py_UNICODE)
2207 (base64buffer >> (base64bits-16));
2208 base64bits -= 16;
2209 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2210 if (surrogate) {
2211 /* expecting a second surrogate */
2212 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2213#ifdef Py_UNICODE_WIDE
2214 *p++ = (((surrogate & 0x3FF)<<10)
2215 | (outCh & 0x3FF)) + 0x10000;
2216#else
2217 *p++ = surrogate;
2218 *p++ = outCh;
2219#endif
2220 surrogate = 0;
2221 }
2222 else {
2223 surrogate = 0;
2224 errmsg = "second surrogate missing";
2225 goto utf7Error;
2226 }
2227 }
2228 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2229 /* first surrogate */
2230 surrogate = outCh;
2231 }
2232 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2233 errmsg = "unexpected second surrogate";
2234 goto utf7Error;
2235 }
2236 else {
2237 *p++ = outCh;
2238 }
2239 }
2240 }
2241 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002242 inShift = 0;
2243 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002244 if (surrogate) {
2245 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002246 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002247 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002248 if (base64bits > 0) { /* left-over bits */
2249 if (base64bits >= 6) {
2250 /* We've seen at least one base-64 character */
2251 errmsg = "partial character in shift sequence";
2252 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002253 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002254 else {
2255 /* Some bits remain; they should be zero */
2256 if (base64buffer != 0) {
2257 errmsg = "non-zero padding bits in shift sequence";
2258 goto utf7Error;
2259 }
2260 }
2261 }
2262 if (ch != '-') {
2263 /* '-' is absorbed; other terminating
2264 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002265 *p++ = ch;
2266 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002267 }
2268 }
2269 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002271 s++; /* consume '+' */
2272 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002273 s++;
2274 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002275 }
2276 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002277 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002278 shiftOutStart = p;
2279 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002280 }
2281 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002282 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002283 *p++ = ch;
2284 s++;
2285 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002286 else {
2287 startinpos = s-starts;
2288 s++;
2289 errmsg = "unexpected special character";
2290 goto utf7Error;
2291 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002292 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002294 outpos = p-PyUnicode_AS_UNICODE(unicode);
2295 endinpos = s-starts;
2296 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002297 errors, &errorHandler,
2298 "utf7", errmsg,
2299 &starts, &e, &startinpos, &endinpos, &exc, &s,
2300 &unicode, &outpos, &p))
2301 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002302 }
2303
Antoine Pitrou244651a2009-05-04 18:56:13 +00002304 /* end of string */
2305
2306 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2307 /* if we're in an inconsistent state, that's an error */
2308 if (surrogate ||
2309 (base64bits >= 6) ||
2310 (base64bits > 0 && base64buffer != 0)) {
2311 outpos = p-PyUnicode_AS_UNICODE(unicode);
2312 endinpos = size;
2313 if (unicode_decode_call_errorhandler(
2314 errors, &errorHandler,
2315 "utf7", "unterminated shift sequence",
2316 &starts, &e, &startinpos, &endinpos, &exc, &s,
2317 &unicode, &outpos, &p))
2318 goto onError;
2319 if (s < e)
2320 goto restart;
2321 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002322 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002323
2324 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002325 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002326 if (inShift) {
2327 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002328 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002329 }
2330 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002331 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002332 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002333 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002334
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002335 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002336 goto onError;
2337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002338 Py_XDECREF(errorHandler);
2339 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002340 return (PyObject *)unicode;
2341
Benjamin Peterson29060642009-01-31 22:14:21 +00002342 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002343 Py_XDECREF(errorHandler);
2344 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002345 Py_DECREF(unicode);
2346 return NULL;
2347}
2348
2349
2350PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002351 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002352 int base64SetO,
2353 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002354 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002355{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002356 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002357 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002358 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002359 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002360 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002361 unsigned int base64bits = 0;
2362 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002363 char * out;
2364 char * start;
2365
2366 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002367 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002368
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002369 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002370 return PyErr_NoMemory();
2371
Antoine Pitrou244651a2009-05-04 18:56:13 +00002372 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002373 if (v == NULL)
2374 return NULL;
2375
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002376 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002377 for (;i < size; ++i) {
2378 Py_UNICODE ch = s[i];
2379
Antoine Pitrou244651a2009-05-04 18:56:13 +00002380 if (inShift) {
2381 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2382 /* shifting out */
2383 if (base64bits) { /* output remaining bits */
2384 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2385 base64buffer = 0;
2386 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002387 }
2388 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002389 /* Characters not in the BASE64 set implicitly unshift the sequence
2390 so no '-' is required, except if the character is itself a '-' */
2391 if (IS_BASE64(ch) || ch == '-') {
2392 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002394 *out++ = (char) ch;
2395 }
2396 else {
2397 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002400 else { /* not in a shift sequence */
2401 if (ch == '+') {
2402 *out++ = '+';
2403 *out++ = '-';
2404 }
2405 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2406 *out++ = (char) ch;
2407 }
2408 else {
2409 *out++ = '+';
2410 inShift = 1;
2411 goto encode_char;
2412 }
2413 }
2414 continue;
2415encode_char:
2416#ifdef Py_UNICODE_WIDE
2417 if (ch >= 0x10000) {
2418 /* code first surrogate */
2419 base64bits += 16;
2420 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2421 while (base64bits >= 6) {
2422 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2423 base64bits -= 6;
2424 }
2425 /* prepare second surrogate */
2426 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2427 }
2428#endif
2429 base64bits += 16;
2430 base64buffer = (base64buffer << 16) | ch;
2431 while (base64bits >= 6) {
2432 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2433 base64bits -= 6;
2434 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002435 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002436 if (base64bits)
2437 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2438 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002439 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002440 if (_PyBytes_Resize(&v, out - start) < 0)
2441 return NULL;
2442 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002443}
2444
/* The UTF-7 helper macros are not used past this point; drop them. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002450
/* --- UTF-8 Codec -------------------------------------------------------- */
2452
/* Lookup table indexed by the first byte of a UTF-8 sequence. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2474
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 Py_ssize_t size,
2477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478{
Walter Dörwald69652032004-09-07 20:24:22 +00002479 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2480}
2481
Antoine Pitrouab868312009-01-10 15:40:25 +00002482/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2483#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2484
2485/* Mask to quickly check whether a C 'long' contains a
2486 non-ASCII, UTF8-encoded char. */
2487#if (SIZEOF_LONG == 8)
2488# define ASCII_CHAR_MASK 0x8080808080808080L
2489#elif (SIZEOF_LONG == 4)
2490# define ASCII_CHAR_MASK 0x80808080L
2491#else
2492# error C 'long' size should be either 4 or 8!
2493#endif
2494
Walter Dörwald69652032004-09-07 20:24:22 +00002495PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002496 Py_ssize_t size,
2497 const char *errors,
2498 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002499{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002502 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002503 Py_ssize_t startinpos;
2504 Py_ssize_t endinpos;
2505 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002506 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 PyUnicodeObject *unicode;
2508 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002509 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002510 PyObject *errorHandler = NULL;
2511 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512
2513 /* Note: size will always be longer than the resulting Unicode
2514 character count */
2515 unicode = _PyUnicode_New(size);
2516 if (!unicode)
2517 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002518 if (size == 0) {
2519 if (consumed)
2520 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523
2524 /* Unpack UTF-8 encoded data */
2525 p = unicode->str;
2526 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002527 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528
2529 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002530 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531
2532 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002533 /* Fast path for runs of ASCII characters. Given that common UTF-8
2534 input will consist of an overwhelming majority of ASCII
2535 characters, we try to optimize for this case by checking
2536 as many characters as a C 'long' can contain.
2537 First, check if we can do an aligned read, as most CPUs have
2538 a penalty for unaligned reads.
2539 */
2540 if (!((size_t) s & LONG_PTR_MASK)) {
2541 /* Help register allocation */
2542 register const char *_s = s;
2543 register Py_UNICODE *_p = p;
2544 while (_s < aligned_end) {
2545 /* Read a whole long at a time (either 4 or 8 bytes),
2546 and do a fast unrolled copy if it only contains ASCII
2547 characters. */
2548 unsigned long data = *(unsigned long *) _s;
2549 if (data & ASCII_CHAR_MASK)
2550 break;
2551 _p[0] = (unsigned char) _s[0];
2552 _p[1] = (unsigned char) _s[1];
2553 _p[2] = (unsigned char) _s[2];
2554 _p[3] = (unsigned char) _s[3];
2555#if (SIZEOF_LONG == 8)
2556 _p[4] = (unsigned char) _s[4];
2557 _p[5] = (unsigned char) _s[5];
2558 _p[6] = (unsigned char) _s[6];
2559 _p[7] = (unsigned char) _s[7];
2560#endif
2561 _s += SIZEOF_LONG;
2562 _p += SIZEOF_LONG;
2563 }
2564 s = _s;
2565 p = _p;
2566 if (s == e)
2567 break;
2568 ch = (unsigned char)*s;
2569 }
2570 }
2571
2572 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002573 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 s++;
2575 continue;
2576 }
2577
2578 n = utf8_code_length[ch];
2579
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002580 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 if (consumed)
2582 break;
2583 else {
2584 errmsg = "unexpected end of data";
2585 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002586 endinpos = startinpos+1;
2587 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2588 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002589 goto utf8Error;
2590 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592
2593 switch (n) {
2594
2595 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002596 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002597 startinpos = s-starts;
2598 endinpos = startinpos+1;
2599 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600
2601 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002602 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002603 startinpos = s-starts;
2604 endinpos = startinpos+1;
2605 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606
2607 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002608 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002609 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002610 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002611 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002612 goto utf8Error;
2613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002615 assert ((ch > 0x007F) && (ch <= 0x07FF));
2616 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 break;
2618
2619 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002620 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2621 will result in surrogates in range d800-dfff. Surrogates are
2622 not valid UTF-8 so they are rejected.
2623 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2624 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002625 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002626 (s[2] & 0xc0) != 0x80 ||
2627 ((unsigned char)s[0] == 0xE0 &&
2628 (unsigned char)s[1] < 0xA0) ||
2629 ((unsigned char)s[0] == 0xED &&
2630 (unsigned char)s[1] > 0x9F)) {
2631 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002632 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002633 endinpos = startinpos + 1;
2634
2635 /* if s[1] first two bits are 1 and 0, then the invalid
2636 continuation byte is s[2], so increment endinpos by 1,
2637 if not, s[1] is invalid and endinpos doesn't need to
2638 be incremented. */
2639 if ((s[1] & 0xC0) == 0x80)
2640 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002641 goto utf8Error;
2642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002644 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2645 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002646 break;
2647
2648 case 4:
2649 if ((s[1] & 0xc0) != 0x80 ||
2650 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002651 (s[3] & 0xc0) != 0x80 ||
2652 ((unsigned char)s[0] == 0xF0 &&
2653 (unsigned char)s[1] < 0x90) ||
2654 ((unsigned char)s[0] == 0xF4 &&
2655 (unsigned char)s[1] > 0x8F)) {
2656 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002657 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002658 endinpos = startinpos + 1;
2659 if ((s[1] & 0xC0) == 0x80) {
2660 endinpos++;
2661 if ((s[2] & 0xC0) == 0x80)
2662 endinpos++;
2663 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002664 goto utf8Error;
2665 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002666 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002667 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2668 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2669
Fredrik Lundh8f455852001-06-27 18:59:43 +00002670#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002672#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002673 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002674
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002675 /* translate from 10000..10FFFF to 0..FFFF */
2676 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002677
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002678 /* high surrogate = top 10 bits added to D800 */
2679 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002680
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002681 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002682 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002683#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 }
2686 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002688
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 utf8Error:
2690 outpos = p-PyUnicode_AS_UNICODE(unicode);
2691 if (unicode_decode_call_errorhandler(
2692 errors, &errorHandler,
2693 "utf8", errmsg,
2694 &starts, &e, &startinpos, &endinpos, &exc, &s,
2695 &unicode, &outpos, &p))
2696 goto onError;
2697 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 }
Walter Dörwald69652032004-09-07 20:24:22 +00002699 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701
2702 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002703 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 goto onError;
2705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 Py_XDECREF(errorHandler);
2707 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 return (PyObject *)unicode;
2709
Benjamin Peterson29060642009-01-31 22:14:21 +00002710 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 Py_DECREF(unicode);
2714 return NULL;
2715}
2716
/* ASCII_CHAR_MASK is only needed by the UTF-8 decoder above. */
#undef ASCII_CHAR_MASK
2718
2719
Tim Peters602f7402002-04-27 18:03:26 +00002720/* Allocation strategy: if the string is short, convert into a stack buffer
2721 and allocate exactly as much space needed at the end. Else allocate the
2722 maximum possible needed (4 result bytes per Unicode character), and return
2723 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002724*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002725PyObject *
2726PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002727 Py_ssize_t size,
2728 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729{
Tim Peters602f7402002-04-27 18:03:26 +00002730#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002731
Guido van Rossum98297ee2007-11-06 21:34:58 +00002732 Py_ssize_t i; /* index into s of next input byte */
2733 PyObject *result; /* result string object */
2734 char *p; /* next free byte in output buffer */
2735 Py_ssize_t nallocated; /* number of result bytes allocated */
2736 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002737 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002738 PyObject *errorHandler = NULL;
2739 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002740
Tim Peters602f7402002-04-27 18:03:26 +00002741 assert(s != NULL);
2742 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743
Tim Peters602f7402002-04-27 18:03:26 +00002744 if (size <= MAX_SHORT_UNICHARS) {
2745 /* Write into the stack buffer; nallocated can't overflow.
2746 * At the end, we'll allocate exactly as much heap space as it
2747 * turns out we need.
2748 */
2749 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002750 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002751 p = stackbuf;
2752 }
2753 else {
2754 /* Overallocate on the heap, and give the excess back at the end. */
2755 nallocated = size * 4;
2756 if (nallocated / 4 != size) /* overflow! */
2757 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002758 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002759 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002760 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002761 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002762 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002763
Tim Peters602f7402002-04-27 18:03:26 +00002764 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002765 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002766
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002767 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002768 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002770
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002772 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002773 *p++ = (char)(0xc0 | (ch >> 6));
2774 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002775 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002776#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002777 /* Special case: check for high and low surrogate */
2778 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2779 Py_UCS4 ch2 = s[i];
2780 /* Combine the two surrogates to form a UCS4 value */
2781 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2782 i++;
2783
2784 /* Encode UCS4 Unicode ordinals */
2785 *p++ = (char)(0xf0 | (ch >> 18));
2786 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002787 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2788 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002789 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002790#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002791 Py_ssize_t newpos;
2792 PyObject *rep;
2793 Py_ssize_t repsize, k;
2794 rep = unicode_encode_call_errorhandler
2795 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2796 s, size, &exc, i-1, i, &newpos);
2797 if (!rep)
2798 goto error;
2799
2800 if (PyBytes_Check(rep))
2801 repsize = PyBytes_GET_SIZE(rep);
2802 else
2803 repsize = PyUnicode_GET_SIZE(rep);
2804
2805 if (repsize > 4) {
2806 Py_ssize_t offset;
2807
2808 if (result == NULL)
2809 offset = p - stackbuf;
2810 else
2811 offset = p - PyBytes_AS_STRING(result);
2812
2813 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2814 /* integer overflow */
2815 PyErr_NoMemory();
2816 goto error;
2817 }
2818 nallocated += repsize - 4;
2819 if (result != NULL) {
2820 if (_PyBytes_Resize(&result, nallocated) < 0)
2821 goto error;
2822 } else {
2823 result = PyBytes_FromStringAndSize(NULL, nallocated);
2824 if (result == NULL)
2825 goto error;
2826 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2827 }
2828 p = PyBytes_AS_STRING(result) + offset;
2829 }
2830
2831 if (PyBytes_Check(rep)) {
2832 char *prep = PyBytes_AS_STRING(rep);
2833 for(k = repsize; k > 0; k--)
2834 *p++ = *prep++;
2835 } else /* rep is unicode */ {
2836 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2837 Py_UNICODE c;
2838
2839 for(k=0; k<repsize; k++) {
2840 c = prep[k];
2841 if (0x80 <= c) {
2842 raise_encode_exception(&exc, "utf-8", s, size,
2843 i-1, i, "surrogates not allowed");
2844 goto error;
2845 }
2846 *p++ = (char)prep[k];
2847 }
2848 }
2849 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002850#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002851 }
Victor Stinner445a6232010-04-22 20:01:57 +00002852#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002853 } else if (ch < 0x10000) {
2854 *p++ = (char)(0xe0 | (ch >> 12));
2855 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2856 *p++ = (char)(0x80 | (ch & 0x3f));
2857 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002858 /* Encode UCS4 Unicode ordinals */
2859 *p++ = (char)(0xf0 | (ch >> 18));
2860 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2861 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2862 *p++ = (char)(0x80 | (ch & 0x3f));
2863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002865
Guido van Rossum98297ee2007-11-06 21:34:58 +00002866 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002867 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002868 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002869 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002870 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002871 }
2872 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002873 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002874 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002875 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002876 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002877 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002878 Py_XDECREF(errorHandler);
2879 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002880 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002881 error:
2882 Py_XDECREF(errorHandler);
2883 Py_XDECREF(exc);
2884 Py_XDECREF(result);
2885 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002886
Tim Peters602f7402002-04-27 18:03:26 +00002887#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888}
2889
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2891{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 if (!PyUnicode_Check(unicode)) {
2893 PyErr_BadArgument();
2894 return NULL;
2895 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002896 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 PyUnicode_GET_SIZE(unicode),
2898 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899}
2900
/* --- UTF-32 Codec ------------------------------------------------------- */
2902
2903PyObject *
2904PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002905 Py_ssize_t size,
2906 const char *errors,
2907 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002908{
2909 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2910}
2911
2912PyObject *
2913PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002914 Py_ssize_t size,
2915 const char *errors,
2916 int *byteorder,
2917 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002918{
2919 const char *starts = s;
2920 Py_ssize_t startinpos;
2921 Py_ssize_t endinpos;
2922 Py_ssize_t outpos;
2923 PyUnicodeObject *unicode;
2924 Py_UNICODE *p;
2925#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002926 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002927 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002928#else
2929 const int pairs = 0;
2930#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002931 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002932 int bo = 0; /* assume native ordering by default */
2933 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002934 /* Offsets from q for retrieving bytes in the right order. */
2935#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2936 int iorder[] = {0, 1, 2, 3};
2937#else
2938 int iorder[] = {3, 2, 1, 0};
2939#endif
2940 PyObject *errorHandler = NULL;
2941 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002942
Walter Dörwald41980ca2007-08-16 21:55:45 +00002943 q = (unsigned char *)s;
2944 e = q + size;
2945
2946 if (byteorder)
2947 bo = *byteorder;
2948
2949 /* Check for BOM marks (U+FEFF) in the input and adjust current
2950 byte order setting accordingly. In native mode, the leading BOM
2951 mark is skipped, in all other modes, it is copied to the output
2952 stream as-is (giving a ZWNBSP character). */
2953 if (bo == 0) {
2954 if (size >= 4) {
2955 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002956 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002957#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002958 if (bom == 0x0000FEFF) {
2959 q += 4;
2960 bo = -1;
2961 }
2962 else if (bom == 0xFFFE0000) {
2963 q += 4;
2964 bo = 1;
2965 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002966#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 if (bom == 0x0000FEFF) {
2968 q += 4;
2969 bo = 1;
2970 }
2971 else if (bom == 0xFFFE0000) {
2972 q += 4;
2973 bo = -1;
2974 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002975#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002976 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002977 }
2978
2979 if (bo == -1) {
2980 /* force LE */
2981 iorder[0] = 0;
2982 iorder[1] = 1;
2983 iorder[2] = 2;
2984 iorder[3] = 3;
2985 }
2986 else if (bo == 1) {
2987 /* force BE */
2988 iorder[0] = 3;
2989 iorder[1] = 2;
2990 iorder[2] = 1;
2991 iorder[3] = 0;
2992 }
2993
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002994 /* On narrow builds we split characters outside the BMP into two
2995 codepoints => count how much extra space we need. */
2996#ifndef Py_UNICODE_WIDE
2997 for (qq = q; qq < e; qq += 4)
2998 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2999 pairs++;
3000#endif
3001
3002 /* This might be one to much, because of a BOM */
3003 unicode = _PyUnicode_New((size+3)/4+pairs);
3004 if (!unicode)
3005 return NULL;
3006 if (size == 0)
3007 return (PyObject *)unicode;
3008
3009 /* Unpack UTF-32 encoded data */
3010 p = unicode->str;
3011
Walter Dörwald41980ca2007-08-16 21:55:45 +00003012 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 Py_UCS4 ch;
3014 /* remaining bytes at the end? (size should be divisible by 4) */
3015 if (e-q<4) {
3016 if (consumed)
3017 break;
3018 errmsg = "truncated data";
3019 startinpos = ((const char *)q)-starts;
3020 endinpos = ((const char *)e)-starts;
3021 goto utf32Error;
3022 /* The remaining input chars are ignored if the callback
3023 chooses to skip the input */
3024 }
3025 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3026 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003027
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 if (ch >= 0x110000)
3029 {
3030 errmsg = "codepoint not in range(0x110000)";
3031 startinpos = ((const char *)q)-starts;
3032 endinpos = startinpos+4;
3033 goto utf32Error;
3034 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003035#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 if (ch >= 0x10000)
3037 {
3038 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3039 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3040 }
3041 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003042#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 *p++ = ch;
3044 q += 4;
3045 continue;
3046 utf32Error:
3047 outpos = p-PyUnicode_AS_UNICODE(unicode);
3048 if (unicode_decode_call_errorhandler(
3049 errors, &errorHandler,
3050 "utf32", errmsg,
3051 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3052 &unicode, &outpos, &p))
3053 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003054 }
3055
3056 if (byteorder)
3057 *byteorder = bo;
3058
3059 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003061
3062 /* Adjust length */
3063 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3064 goto onError;
3065
3066 Py_XDECREF(errorHandler);
3067 Py_XDECREF(exc);
3068 return (PyObject *)unicode;
3069
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003071 Py_DECREF(unicode);
3072 Py_XDECREF(errorHandler);
3073 Py_XDECREF(exc);
3074 return NULL;
3075}
3076
3077PyObject *
3078PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003079 Py_ssize_t size,
3080 const char *errors,
3081 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003082{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003083 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003084 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003085 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003086#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003087 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003088#else
3089 const int pairs = 0;
3090#endif
3091 /* Offsets from p for storing byte pairs in the right order. */
3092#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3093 int iorder[] = {0, 1, 2, 3};
3094#else
3095 int iorder[] = {3, 2, 1, 0};
3096#endif
3097
Benjamin Peterson29060642009-01-31 22:14:21 +00003098#define STORECHAR(CH) \
3099 do { \
3100 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3101 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3102 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3103 p[iorder[0]] = (CH) & 0xff; \
3104 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003105 } while(0)
3106
3107 /* In narrow builds we can output surrogate pairs as one codepoint,
3108 so we need less space. */
3109#ifndef Py_UNICODE_WIDE
3110 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3112 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3113 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003114#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003115 nsize = (size - pairs + (byteorder == 0));
3116 bytesize = nsize * 4;
3117 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003118 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003119 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003120 if (v == NULL)
3121 return NULL;
3122
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003123 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003124 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003126 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003127 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003128
3129 if (byteorder == -1) {
3130 /* force LE */
3131 iorder[0] = 0;
3132 iorder[1] = 1;
3133 iorder[2] = 2;
3134 iorder[3] = 3;
3135 }
3136 else if (byteorder == 1) {
3137 /* force BE */
3138 iorder[0] = 3;
3139 iorder[1] = 2;
3140 iorder[2] = 1;
3141 iorder[3] = 0;
3142 }
3143
3144 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003146#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003147 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3148 Py_UCS4 ch2 = *s;
3149 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3150 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3151 s++;
3152 size--;
3153 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003154 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003155#endif
3156 STORECHAR(ch);
3157 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003158
3159 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003160 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003161#undef STORECHAR
3162}
3163
3164PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3165{
3166 if (!PyUnicode_Check(unicode)) {
3167 PyErr_BadArgument();
3168 return NULL;
3169 }
3170 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 PyUnicode_GET_SIZE(unicode),
3172 NULL,
3173 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003174}
3175
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176/* --- UTF-16 Codec ------------------------------------------------------- */
3177
Tim Peters772747b2001-08-09 22:21:55 +00003178PyObject *
3179PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003180 Py_ssize_t size,
3181 const char *errors,
3182 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183{
Walter Dörwald69652032004-09-07 20:24:22 +00003184 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3185}
3186
Antoine Pitrouab868312009-01-10 15:40:25 +00003187/* Two masks for fast checking of whether a C 'long' may contain
3188 UTF16-encoded surrogate characters. This is an efficient heuristic,
3189 assuming that non-surrogate characters with a code point >= 0x8000 are
3190 rare in most input.
3191 FAST_CHAR_MASK is used when the input is in native byte ordering,
3192 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003193*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003194#if (SIZEOF_LONG == 8)
3195# define FAST_CHAR_MASK 0x8000800080008000L
3196# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3197#elif (SIZEOF_LONG == 4)
3198# define FAST_CHAR_MASK 0x80008000L
3199# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3200#else
3201# error C 'long' size should be either 4 or 8!
3202#endif
3203
Walter Dörwald69652032004-09-07 20:24:22 +00003204PyObject *
3205PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 Py_ssize_t size,
3207 const char *errors,
3208 int *byteorder,
3209 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003210{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003212 Py_ssize_t startinpos;
3213 Py_ssize_t endinpos;
3214 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 PyUnicodeObject *unicode;
3216 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003217 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003218 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003219 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003220 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003221 /* Offsets from q for retrieving byte pairs in the right order. */
3222#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3223 int ihi = 1, ilo = 0;
3224#else
3225 int ihi = 0, ilo = 1;
3226#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227 PyObject *errorHandler = NULL;
3228 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229
3230 /* Note: size will always be longer than the resulting Unicode
3231 character count */
3232 unicode = _PyUnicode_New(size);
3233 if (!unicode)
3234 return NULL;
3235 if (size == 0)
3236 return (PyObject *)unicode;
3237
3238 /* Unpack UTF-16 encoded data */
3239 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003240 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003241 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242
3243 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003244 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003246 /* Check for BOM marks (U+FEFF) in the input and adjust current
3247 byte order setting accordingly. In native mode, the leading BOM
3248 mark is skipped, in all other modes, it is copied to the output
3249 stream as-is (giving a ZWNBSP character). */
3250 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003251 if (size >= 2) {
3252 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003253#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003254 if (bom == 0xFEFF) {
3255 q += 2;
3256 bo = -1;
3257 }
3258 else if (bom == 0xFFFE) {
3259 q += 2;
3260 bo = 1;
3261 }
Tim Petersced69f82003-09-16 20:30:58 +00003262#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003263 if (bom == 0xFEFF) {
3264 q += 2;
3265 bo = 1;
3266 }
3267 else if (bom == 0xFFFE) {
3268 q += 2;
3269 bo = -1;
3270 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003271#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003272 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003274
Tim Peters772747b2001-08-09 22:21:55 +00003275 if (bo == -1) {
3276 /* force LE */
3277 ihi = 1;
3278 ilo = 0;
3279 }
3280 else if (bo == 1) {
3281 /* force BE */
3282 ihi = 0;
3283 ilo = 1;
3284 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003285#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3286 native_ordering = ilo < ihi;
3287#else
3288 native_ordering = ilo > ihi;
3289#endif
Tim Peters772747b2001-08-09 22:21:55 +00003290
Antoine Pitrouab868312009-01-10 15:40:25 +00003291 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003292 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003293 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003294 /* First check for possible aligned read of a C 'long'. Unaligned
3295 reads are more expensive, better to defer to another iteration. */
3296 if (!((size_t) q & LONG_PTR_MASK)) {
3297 /* Fast path for runs of non-surrogate chars. */
3298 register const unsigned char *_q = q;
3299 Py_UNICODE *_p = p;
3300 if (native_ordering) {
3301 /* Native ordering is simple: as long as the input cannot
3302 possibly contain a surrogate char, do an unrolled copy
3303 of several 16-bit code points to the target object.
3304 The non-surrogate check is done on several input bytes
3305 at a time (as many as a C 'long' can contain). */
3306 while (_q < aligned_end) {
3307 unsigned long data = * (unsigned long *) _q;
3308 if (data & FAST_CHAR_MASK)
3309 break;
3310 _p[0] = ((unsigned short *) _q)[0];
3311 _p[1] = ((unsigned short *) _q)[1];
3312#if (SIZEOF_LONG == 8)
3313 _p[2] = ((unsigned short *) _q)[2];
3314 _p[3] = ((unsigned short *) _q)[3];
3315#endif
3316 _q += SIZEOF_LONG;
3317 _p += SIZEOF_LONG / 2;
3318 }
3319 }
3320 else {
3321 /* Byteswapped ordering is similar, but we must decompose
3322 the copy bytewise, and take care of zero'ing out the
3323 upper bytes if the target object is in 32-bit units
3324 (that is, in UCS-4 builds). */
3325 while (_q < aligned_end) {
3326 unsigned long data = * (unsigned long *) _q;
3327 if (data & SWAPPED_FAST_CHAR_MASK)
3328 break;
3329 /* Zero upper bytes in UCS-4 builds */
3330#if (Py_UNICODE_SIZE > 2)
3331 _p[0] = 0;
3332 _p[1] = 0;
3333#if (SIZEOF_LONG == 8)
3334 _p[2] = 0;
3335 _p[3] = 0;
3336#endif
3337#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003338 /* Issue #4916; UCS-4 builds on big endian machines must
3339 fill the two last bytes of each 4-byte unit. */
3340#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3341# define OFF 2
3342#else
3343# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003344#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003345 ((unsigned char *) _p)[OFF + 1] = _q[0];
3346 ((unsigned char *) _p)[OFF + 0] = _q[1];
3347 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3348 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3349#if (SIZEOF_LONG == 8)
3350 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3351 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3352 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3353 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3354#endif
3355#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003356 _q += SIZEOF_LONG;
3357 _p += SIZEOF_LONG / 2;
3358 }
3359 }
3360 p = _p;
3361 q = _q;
3362 if (q >= e)
3363 break;
3364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003365 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366
Benjamin Peterson14339b62009-01-31 16:36:08 +00003367 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003368
3369 if (ch < 0xD800 || ch > 0xDFFF) {
3370 *p++ = ch;
3371 continue;
3372 }
3373
3374 /* UTF-16 code pair: */
3375 if (q > e) {
3376 errmsg = "unexpected end of data";
3377 startinpos = (((const char *)q) - 2) - starts;
3378 endinpos = ((const char *)e) + 1 - starts;
3379 goto utf16Error;
3380 }
3381 if (0xD800 <= ch && ch <= 0xDBFF) {
3382 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3383 q += 2;
3384 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003385#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003386 *p++ = ch;
3387 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003388#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003389 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003390#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 continue;
3392 }
3393 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003394 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 startinpos = (((const char *)q)-4)-starts;
3396 endinpos = startinpos+2;
3397 goto utf16Error;
3398 }
3399
Benjamin Peterson14339b62009-01-31 16:36:08 +00003400 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003401 errmsg = "illegal encoding";
3402 startinpos = (((const char *)q)-2)-starts;
3403 endinpos = startinpos+2;
3404 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003405
Benjamin Peterson29060642009-01-31 22:14:21 +00003406 utf16Error:
3407 outpos = p - PyUnicode_AS_UNICODE(unicode);
3408 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003409 errors,
3410 &errorHandler,
3411 "utf16", errmsg,
3412 &starts,
3413 (const char **)&e,
3414 &startinpos,
3415 &endinpos,
3416 &exc,
3417 (const char **)&q,
3418 &unicode,
3419 &outpos,
3420 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003421 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003423 /* remaining byte at the end? (size should be even) */
3424 if (e == q) {
3425 if (!consumed) {
3426 errmsg = "truncated data";
3427 startinpos = ((const char *)q) - starts;
3428 endinpos = ((const char *)e) + 1 - starts;
3429 outpos = p - PyUnicode_AS_UNICODE(unicode);
3430 if (unicode_decode_call_errorhandler(
3431 errors,
3432 &errorHandler,
3433 "utf16", errmsg,
3434 &starts,
3435 (const char **)&e,
3436 &startinpos,
3437 &endinpos,
3438 &exc,
3439 (const char **)&q,
3440 &unicode,
3441 &outpos,
3442 &p))
3443 goto onError;
3444 /* The remaining input chars are ignored if the callback
3445 chooses to skip the input */
3446 }
3447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448
3449 if (byteorder)
3450 *byteorder = bo;
3451
Walter Dörwald69652032004-09-07 20:24:22 +00003452 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003453 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003454
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003456 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 goto onError;
3458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 Py_XDECREF(errorHandler);
3460 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 return (PyObject *)unicode;
3462
Benjamin Peterson29060642009-01-31 22:14:21 +00003463 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 Py_XDECREF(errorHandler);
3466 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 return NULL;
3468}
3469
Antoine Pitrouab868312009-01-10 15:40:25 +00003470#undef FAST_CHAR_MASK
3471#undef SWAPPED_FAST_CHAR_MASK
3472
Tim Peters772747b2001-08-09 22:21:55 +00003473PyObject *
3474PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003475 Py_ssize_t size,
3476 const char *errors,
3477 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003479 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003480 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003481 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003482#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003483 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003484#else
3485 const int pairs = 0;
3486#endif
Tim Peters772747b2001-08-09 22:21:55 +00003487 /* Offsets from p for storing byte pairs in the right order. */
3488#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3489 int ihi = 1, ilo = 0;
3490#else
3491 int ihi = 0, ilo = 1;
3492#endif
3493
Benjamin Peterson29060642009-01-31 22:14:21 +00003494#define STORECHAR(CH) \
3495 do { \
3496 p[ihi] = ((CH) >> 8) & 0xff; \
3497 p[ilo] = (CH) & 0xff; \
3498 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003499 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003501#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003502 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 if (s[i] >= 0x10000)
3504 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003505#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003506 /* 2 * (size + pairs + (byteorder == 0)) */
3507 if (size > PY_SSIZE_T_MAX ||
3508 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003509 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003510 nsize = size + pairs + (byteorder == 0);
3511 bytesize = nsize * 2;
3512 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003513 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003514 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 if (v == NULL)
3516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003518 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003520 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003521 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003522 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003523
3524 if (byteorder == -1) {
3525 /* force LE */
3526 ihi = 1;
3527 ilo = 0;
3528 }
3529 else if (byteorder == 1) {
3530 /* force BE */
3531 ihi = 0;
3532 ilo = 1;
3533 }
3534
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003535 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003536 Py_UNICODE ch = *s++;
3537 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003538#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 if (ch >= 0x10000) {
3540 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3541 ch = 0xD800 | ((ch-0x10000) >> 10);
3542 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003543#endif
Tim Peters772747b2001-08-09 22:21:55 +00003544 STORECHAR(ch);
3545 if (ch2)
3546 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003547 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003548
3549 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003550 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003551#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552}
3553
3554PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3555{
3556 if (!PyUnicode_Check(unicode)) {
3557 PyErr_BadArgument();
3558 return NULL;
3559 }
3560 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003561 PyUnicode_GET_SIZE(unicode),
3562 NULL,
3563 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564}
3565
3566/* --- Unicode Escape Codec ----------------------------------------------- */
3567
Fredrik Lundh06d12682001-01-24 07:59:11 +00003568static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003569
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 Py_ssize_t size,
3572 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t startinpos;
3576 Py_ssize_t endinpos;
3577 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003580 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003582 char* message;
3583 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 PyObject *errorHandler = NULL;
3585 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003586
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 /* Escaped strings will always be longer than the resulting
3588 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 length after conversion to the true value.
3590 (but if the error callback returns a long replacement string
3591 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 v = _PyUnicode_New(size);
3593 if (v == NULL)
3594 goto onError;
3595 if (size == 0)
3596 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003597
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003600
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 while (s < end) {
3602 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003603 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605
3606 /* Non-escape characters are interpreted as Unicode ordinals */
3607 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003608 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 continue;
3610 }
3611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 /* \ - Escapes */
3614 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003615 c = *s++;
3616 if (s > end)
3617 c = '\0'; /* Invalid after \ */
3618 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 case '\n': break;
3622 case '\\': *p++ = '\\'; break;
3623 case '\'': *p++ = '\''; break;
3624 case '\"': *p++ = '\"'; break;
3625 case 'b': *p++ = '\b'; break;
3626 case 'f': *p++ = '\014'; break; /* FF */
3627 case 't': *p++ = '\t'; break;
3628 case 'n': *p++ = '\n'; break;
3629 case 'r': *p++ = '\r'; break;
3630 case 'v': *p++ = '\013'; break; /* VT */
3631 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3632
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 case '0': case '1': case '2': case '3':
3635 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003636 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003637 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003638 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003639 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003640 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003642 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003643 break;
3644
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 /* hex escapes */
3646 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003648 digits = 2;
3649 message = "truncated \\xXX escape";
3650 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003654 digits = 4;
3655 message = "truncated \\uXXXX escape";
3656 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003659 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003660 digits = 8;
3661 message = "truncated \\UXXXXXXXX escape";
3662 hexescape:
3663 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 outpos = p-PyUnicode_AS_UNICODE(v);
3665 if (s+digits>end) {
3666 endinpos = size;
3667 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 errors, &errorHandler,
3669 "unicodeescape", "end of string in escape sequence",
3670 &starts, &end, &startinpos, &endinpos, &exc, &s,
3671 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 goto onError;
3673 goto nextByte;
3674 }
3675 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003676 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003677 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 endinpos = (s+i+1)-starts;
3679 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 errors, &errorHandler,
3681 "unicodeescape", message,
3682 &starts, &end, &startinpos, &endinpos, &exc, &s,
3683 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003686 }
3687 chr = (chr<<4) & ~0xF;
3688 if (c >= '0' && c <= '9')
3689 chr += c - '0';
3690 else if (c >= 'a' && c <= 'f')
3691 chr += 10 + c - 'a';
3692 else
3693 chr += 10 + c - 'A';
3694 }
3695 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003696 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003697 /* _decoding_error will have already written into the
3698 target buffer. */
3699 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003700 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003701 /* when we get here, chr is a 32-bit unicode character */
3702 if (chr <= 0xffff)
3703 /* UCS-2 character */
3704 *p++ = (Py_UNICODE) chr;
3705 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003706 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003707 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003708#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003709 *p++ = chr;
3710#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003711 chr -= 0x10000L;
3712 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003713 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003714#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003715 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 endinpos = s-starts;
3717 outpos = p-PyUnicode_AS_UNICODE(v);
3718 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003719 errors, &errorHandler,
3720 "unicodeescape", "illegal Unicode character",
3721 &starts, &end, &startinpos, &endinpos, &exc, &s,
3722 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003723 goto onError;
3724 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003725 break;
3726
Benjamin Peterson29060642009-01-31 22:14:21 +00003727 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003728 case 'N':
3729 message = "malformed \\N character escape";
3730 if (ucnhash_CAPI == NULL) {
3731 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003732 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003733 if (ucnhash_CAPI == NULL)
3734 goto ucnhashError;
3735 }
3736 if (*s == '{') {
3737 const char *start = s+1;
3738 /* look for the closing brace */
3739 while (*s != '}' && s < end)
3740 s++;
3741 if (s > start && s < end && *s == '}') {
3742 /* found a name. look it up in the unicode database */
3743 message = "unknown Unicode character name";
3744 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003745 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003746 goto store;
3747 }
3748 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003749 endinpos = s-starts;
3750 outpos = p-PyUnicode_AS_UNICODE(v);
3751 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003752 errors, &errorHandler,
3753 "unicodeescape", message,
3754 &starts, &end, &startinpos, &endinpos, &exc, &s,
3755 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003756 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003757 break;
3758
3759 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003760 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 message = "\\ at end of string";
3762 s--;
3763 endinpos = s-starts;
3764 outpos = p-PyUnicode_AS_UNICODE(v);
3765 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003766 errors, &errorHandler,
3767 "unicodeescape", message,
3768 &starts, &end, &startinpos, &endinpos, &exc, &s,
3769 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003770 goto onError;
3771 }
3772 else {
3773 *p++ = '\\';
3774 *p++ = (unsigned char)s[-1];
3775 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003776 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003781 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003783 Py_XDECREF(errorHandler);
3784 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003786
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003788 PyErr_SetString(
3789 PyExc_UnicodeError,
3790 "\\N escapes not supported (can't load unicodedata module)"
3791 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003792 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 Py_XDECREF(errorHandler);
3794 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003795 return NULL;
3796
Benjamin Peterson29060642009-01-31 22:14:21 +00003797 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003799 Py_XDECREF(errorHandler);
3800 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 return NULL;
3802}
3803
/* Return a Unicode-Escape string version of the Unicode object.

   The result is not enclosed in quotes: PyUnicode_EncodeUnicodeEscape()
   takes no quotes argument.

*/
3810
Thomas Wouters477c8d52006-05-27 19:21:47 +00003811Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003812 Py_ssize_t size,
3813 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003814{
3815 /* like wcschr, but doesn't stop at NULL characters */
3816
3817 while (size-- > 0) {
3818 if (*s == ch)
3819 return s;
3820 s++;
3821 }
3822
3823 return NULL;
3824}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003825
Walter Dörwald79e913e2007-05-12 11:08:06 +00003826static const char *hexdigits = "0123456789abcdef";
3827
3828PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003829 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003831 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003834#ifdef Py_UNICODE_WIDE
3835 const Py_ssize_t expandsize = 10;
3836#else
3837 const Py_ssize_t expandsize = 6;
3838#endif
3839
Thomas Wouters89f507f2006-12-13 04:49:30 +00003840 /* XXX(nnorwitz): rather than over-allocating, it would be
3841 better to choose a different scheme. Perhaps scan the
3842 first N-chars of the string and allocate based on that size.
3843 */
3844 /* Initial allocation is based on the longest-possible unichr
3845 escape.
3846
3847 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3848 unichr, so in this case it's the longest unichr escape. In
3849 narrow (UTF-16) builds this is five chars per source unichr
3850 since there are two unichrs in the surrogate pair, so in narrow
3851 (UTF-16) builds it's not the longest unichr escape.
3852
3853 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3854 so in the narrow (UTF-16) build case it's the longest unichr
3855 escape.
3856 */
3857
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003858 if (size == 0)
3859 return PyBytes_FromStringAndSize(NULL, 0);
3860
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003861 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003862 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003863
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003864 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003865 2
3866 + expandsize*size
3867 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 if (repr == NULL)
3869 return NULL;
3870
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003871 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873 while (size-- > 0) {
3874 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003875
Walter Dörwald79e913e2007-05-12 11:08:06 +00003876 /* Escape backslashes */
3877 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878 *p++ = '\\';
3879 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003880 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003881 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003882
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003883#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003884 /* Map 21-bit characters to '\U00xxxxxx' */
3885 else if (ch >= 0x10000) {
3886 *p++ = '\\';
3887 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003888 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3889 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3890 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3891 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3892 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3893 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3894 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3895 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003896 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003897 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003898#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3900 else if (ch >= 0xD800 && ch < 0xDC00) {
3901 Py_UNICODE ch2;
3902 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003903
Benjamin Peterson29060642009-01-31 22:14:21 +00003904 ch2 = *s++;
3905 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003906 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003907 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3908 *p++ = '\\';
3909 *p++ = 'U';
3910 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3911 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3912 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3913 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3914 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3915 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3916 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3917 *p++ = hexdigits[ucs & 0x0000000F];
3918 continue;
3919 }
3920 /* Fall through: isolated surrogates are copied as-is */
3921 s--;
3922 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003923 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003924#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003925
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003927 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 *p++ = '\\';
3929 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003930 *p++ = hexdigits[(ch >> 12) & 0x000F];
3931 *p++ = hexdigits[(ch >> 8) & 0x000F];
3932 *p++ = hexdigits[(ch >> 4) & 0x000F];
3933 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003935
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003936 /* Map special whitespace to '\t', \n', '\r' */
3937 else if (ch == '\t') {
3938 *p++ = '\\';
3939 *p++ = 't';
3940 }
3941 else if (ch == '\n') {
3942 *p++ = '\\';
3943 *p++ = 'n';
3944 }
3945 else if (ch == '\r') {
3946 *p++ = '\\';
3947 *p++ = 'r';
3948 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003949
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003950 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003951 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003953 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003954 *p++ = hexdigits[(ch >> 4) & 0x000F];
3955 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003956 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003957
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 /* Copy everything else as-is */
3959 else
3960 *p++ = (char) ch;
3961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003963 assert(p - PyBytes_AS_STRING(repr) > 0);
3964 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3965 return NULL;
3966 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967}
3968
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003969PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003971 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 if (!PyUnicode_Check(unicode)) {
3973 PyErr_BadArgument();
3974 return NULL;
3975 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003976 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3977 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003978 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979}
3980
3981/* --- Raw Unicode Escape Codec ------------------------------------------- */
3982
3983PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003984 Py_ssize_t size,
3985 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003988 Py_ssize_t startinpos;
3989 Py_ssize_t endinpos;
3990 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 const char *end;
3994 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 PyObject *errorHandler = NULL;
3996 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003997
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 /* Escaped strings will always be longer than the resulting
3999 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 length after conversion to the true value. (But decoding error
4001 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002 v = _PyUnicode_New(size);
4003 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 end = s + size;
4009 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004010 unsigned char c;
4011 Py_UCS4 x;
4012 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004013 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 /* Non-escape characters are interpreted as Unicode ordinals */
4016 if (*s != '\\') {
4017 *p++ = (unsigned char)*s++;
4018 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004019 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004020 startinpos = s-starts;
4021
4022 /* \u-escapes are only interpreted iff the number of leading
4023 backslashes if odd */
4024 bs = s;
4025 for (;s < end;) {
4026 if (*s != '\\')
4027 break;
4028 *p++ = (unsigned char)*s++;
4029 }
4030 if (((s - bs) & 1) == 0 ||
4031 s >= end ||
4032 (*s != 'u' && *s != 'U')) {
4033 continue;
4034 }
4035 p--;
4036 count = *s=='u' ? 4 : 8;
4037 s++;
4038
4039 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4040 outpos = p-PyUnicode_AS_UNICODE(v);
4041 for (x = 0, i = 0; i < count; ++i, ++s) {
4042 c = (unsigned char)*s;
4043 if (!ISXDIGIT(c)) {
4044 endinpos = s-starts;
4045 if (unicode_decode_call_errorhandler(
4046 errors, &errorHandler,
4047 "rawunicodeescape", "truncated \\uXXXX",
4048 &starts, &end, &startinpos, &endinpos, &exc, &s,
4049 &v, &outpos, &p))
4050 goto onError;
4051 goto nextByte;
4052 }
4053 x = (x<<4) & ~0xF;
4054 if (c >= '0' && c <= '9')
4055 x += c - '0';
4056 else if (c >= 'a' && c <= 'f')
4057 x += 10 + c - 'a';
4058 else
4059 x += 10 + c - 'A';
4060 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004061 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 /* UCS-2 character */
4063 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004064 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004065 /* UCS-4 character. Either store directly, or as
4066 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004067#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004068 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004069#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 x -= 0x10000L;
4071 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4072 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004073#endif
4074 } else {
4075 endinpos = s-starts;
4076 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004077 if (unicode_decode_call_errorhandler(
4078 errors, &errorHandler,
4079 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004080 &starts, &end, &startinpos, &endinpos, &exc, &s,
4081 &v, &outpos, &p))
4082 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004083 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 nextByte:
4085 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004087 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089 Py_XDECREF(errorHandler);
4090 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004092
Benjamin Peterson29060642009-01-31 22:14:21 +00004093 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004095 Py_XDECREF(errorHandler);
4096 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 return NULL;
4098}
4099
4100PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004103 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 char *p;
4105 char *q;
4106
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004107#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004108 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004109#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004110 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004111#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004112
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004113 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004115
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004116 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 if (repr == NULL)
4118 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004119 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004120 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004122 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 while (size-- > 0) {
4124 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004125#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 /* Map 32-bit characters to '\Uxxxxxxxx' */
4127 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004128 *p++ = '\\';
4129 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004130 *p++ = hexdigits[(ch >> 28) & 0xf];
4131 *p++ = hexdigits[(ch >> 24) & 0xf];
4132 *p++ = hexdigits[(ch >> 20) & 0xf];
4133 *p++ = hexdigits[(ch >> 16) & 0xf];
4134 *p++ = hexdigits[(ch >> 12) & 0xf];
4135 *p++ = hexdigits[(ch >> 8) & 0xf];
4136 *p++ = hexdigits[(ch >> 4) & 0xf];
4137 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004138 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004139 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004140#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4142 if (ch >= 0xD800 && ch < 0xDC00) {
4143 Py_UNICODE ch2;
4144 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004145
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 ch2 = *s++;
4147 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004148 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4150 *p++ = '\\';
4151 *p++ = 'U';
4152 *p++ = hexdigits[(ucs >> 28) & 0xf];
4153 *p++ = hexdigits[(ucs >> 24) & 0xf];
4154 *p++ = hexdigits[(ucs >> 20) & 0xf];
4155 *p++ = hexdigits[(ucs >> 16) & 0xf];
4156 *p++ = hexdigits[(ucs >> 12) & 0xf];
4157 *p++ = hexdigits[(ucs >> 8) & 0xf];
4158 *p++ = hexdigits[(ucs >> 4) & 0xf];
4159 *p++ = hexdigits[ucs & 0xf];
4160 continue;
4161 }
4162 /* Fall through: isolated surrogates are copied as-is */
4163 s--;
4164 size++;
4165 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004166#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 /* Map 16-bit characters to '\uxxxx' */
4168 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 *p++ = '\\';
4170 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004171 *p++ = hexdigits[(ch >> 12) & 0xf];
4172 *p++ = hexdigits[(ch >> 8) & 0xf];
4173 *p++ = hexdigits[(ch >> 4) & 0xf];
4174 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 /* Copy everything else as-is */
4177 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 *p++ = (char) ch;
4179 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004180 size = p - q;
4181
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004182 assert(size > 0);
4183 if (_PyBytes_Resize(&repr, size) < 0)
4184 return NULL;
4185 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186}
4187
4188PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4189{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004190 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004192 PyErr_BadArgument();
4193 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004195 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4196 PyUnicode_GET_SIZE(unicode));
4197
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004198 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199}
4200
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004201/* --- Unicode Internal Codec ------------------------------------------- */
4202
4203PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 Py_ssize_t size,
4205 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004206{
4207 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004208 Py_ssize_t startinpos;
4209 Py_ssize_t endinpos;
4210 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004211 PyUnicodeObject *v;
4212 Py_UNICODE *p;
4213 const char *end;
4214 const char *reason;
4215 PyObject *errorHandler = NULL;
4216 PyObject *exc = NULL;
4217
Neal Norwitzd43069c2006-01-08 01:12:10 +00004218#ifdef Py_UNICODE_WIDE
4219 Py_UNICODE unimax = PyUnicode_GetMax();
4220#endif
4221
Thomas Wouters89f507f2006-12-13 04:49:30 +00004222 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004223 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4224 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004226 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004228 p = PyUnicode_AS_UNICODE(v);
4229 end = s + size;
4230
4231 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004232 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004233 /* We have to sanity check the raw data, otherwise doom looms for
4234 some malformed UCS-4 data. */
4235 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004236#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004237 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004238#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004239 end-s < Py_UNICODE_SIZE
4240 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004241 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004242 startinpos = s - starts;
4243 if (end-s < Py_UNICODE_SIZE) {
4244 endinpos = end-starts;
4245 reason = "truncated input";
4246 }
4247 else {
4248 endinpos = s - starts + Py_UNICODE_SIZE;
4249 reason = "illegal code point (> 0x10FFFF)";
4250 }
4251 outpos = p - PyUnicode_AS_UNICODE(v);
4252 if (unicode_decode_call_errorhandler(
4253 errors, &errorHandler,
4254 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004255 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004256 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004257 goto onError;
4258 }
4259 }
4260 else {
4261 p++;
4262 s += Py_UNICODE_SIZE;
4263 }
4264 }
4265
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004266 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004267 goto onError;
4268 Py_XDECREF(errorHandler);
4269 Py_XDECREF(exc);
4270 return (PyObject *)v;
4271
Benjamin Peterson29060642009-01-31 22:14:21 +00004272 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004273 Py_XDECREF(v);
4274 Py_XDECREF(errorHandler);
4275 Py_XDECREF(exc);
4276 return NULL;
4277}
4278
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279/* --- Latin-1 Codec ------------------------------------------------------ */
4280
4281PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 Py_ssize_t size,
4283 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284{
4285 PyUnicodeObject *v;
4286 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004287 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004288
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004290 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 Py_UNICODE r = *(unsigned char*)s;
4292 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004293 }
4294
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295 v = _PyUnicode_New(size);
4296 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004297 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004301 e = s + size;
4302 /* Unrolling the copy makes it much faster by reducing the looping
4303 overhead. This is similar to what many memcpy() implementations do. */
4304 unrolled_end = e - 4;
4305 while (s < unrolled_end) {
4306 p[0] = (unsigned char) s[0];
4307 p[1] = (unsigned char) s[1];
4308 p[2] = (unsigned char) s[2];
4309 p[3] = (unsigned char) s[3];
4310 s += 4;
4311 p += 4;
4312 }
4313 while (s < e)
4314 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004316
Benjamin Peterson29060642009-01-31 22:14:21 +00004317 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 Py_XDECREF(v);
4319 return NULL;
4320}
4321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322/* create or adjust a UnicodeEncodeError */
4323static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 const char *encoding,
4325 const Py_UNICODE *unicode, Py_ssize_t size,
4326 Py_ssize_t startpos, Py_ssize_t endpos,
4327 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004330 *exceptionObject = PyUnicodeEncodeError_Create(
4331 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 }
4333 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4335 goto onError;
4336 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4337 goto onError;
4338 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4339 goto onError;
4340 return;
4341 onError:
4342 Py_DECREF(*exceptionObject);
4343 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344 }
4345}
4346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347/* raises a UnicodeEncodeError */
4348static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 const char *encoding,
4350 const Py_UNICODE *unicode, Py_ssize_t size,
4351 Py_ssize_t startpos, Py_ssize_t endpos,
4352 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353{
4354 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004355 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004357 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004358}
4359
4360/* error handling callback helper:
4361 build arguments, call the callback and check the arguments,
4362 put the result into newpos and return the replacement string, which
4363 has to be freed by the caller */
4364static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 PyObject **errorHandler,
4366 const char *encoding, const char *reason,
4367 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4368 Py_ssize_t startpos, Py_ssize_t endpos,
4369 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004371 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372
4373 PyObject *restuple;
4374 PyObject *resunicode;
4375
4376 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004379 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 }
4381
4382 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386
4387 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004392 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 Py_DECREF(restuple);
4394 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004396 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 &resunicode, newpos)) {
4398 Py_DECREF(restuple);
4399 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004401 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4402 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4403 Py_DECREF(restuple);
4404 return NULL;
4405 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004408 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4410 Py_DECREF(restuple);
4411 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004412 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004413 Py_INCREF(resunicode);
4414 Py_DECREF(restuple);
4415 return resunicode;
4416}
4417
4418static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 Py_ssize_t size,
4420 const char *errors,
4421 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422{
4423 /* output object */
4424 PyObject *res;
4425 /* pointers to the beginning and end+1 of input */
4426 const Py_UNICODE *startp = p;
4427 const Py_UNICODE *endp = p + size;
4428 /* pointer to the beginning of the unencodable characters */
4429 /* const Py_UNICODE *badp = NULL; */
4430 /* pointer into the output */
4431 char *str;
4432 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004433 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004434 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4435 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 PyObject *errorHandler = NULL;
4437 PyObject *exc = NULL;
4438 /* the following variable is used for caching string comparisons
4439 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4440 int known_errorHandler = -1;
4441
4442 /* allocate enough for a simple encoding without
4443 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004444 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004445 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004446 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004448 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004449 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 ressize = size;
4451
4452 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 /* can we encode this? */
4456 if (c<limit) {
4457 /* no overflow check, because we know that the space is enough */
4458 *str++ = (char)c;
4459 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004460 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 else {
4462 Py_ssize_t unicodepos = p-startp;
4463 Py_ssize_t requiredsize;
4464 PyObject *repunicode;
4465 Py_ssize_t repsize;
4466 Py_ssize_t newpos;
4467 Py_ssize_t respos;
4468 Py_UNICODE *uni2;
4469 /* startpos for collecting unencodable chars */
4470 const Py_UNICODE *collstart = p;
4471 const Py_UNICODE *collend = p;
4472 /* find all unecodable characters */
4473 while ((collend < endp) && ((*collend)>=limit))
4474 ++collend;
4475 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4476 if (known_errorHandler==-1) {
4477 if ((errors==NULL) || (!strcmp(errors, "strict")))
4478 known_errorHandler = 1;
4479 else if (!strcmp(errors, "replace"))
4480 known_errorHandler = 2;
4481 else if (!strcmp(errors, "ignore"))
4482 known_errorHandler = 3;
4483 else if (!strcmp(errors, "xmlcharrefreplace"))
4484 known_errorHandler = 4;
4485 else
4486 known_errorHandler = 0;
4487 }
4488 switch (known_errorHandler) {
4489 case 1: /* strict */
4490 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4491 goto onError;
4492 case 2: /* replace */
4493 while (collstart++<collend)
4494 *str++ = '?'; /* fall through */
4495 case 3: /* ignore */
4496 p = collend;
4497 break;
4498 case 4: /* xmlcharrefreplace */
4499 respos = str - PyBytes_AS_STRING(res);
4500 /* determine replacement size (temporarily (mis)uses p) */
4501 for (p = collstart, repsize = 0; p < collend; ++p) {
4502 if (*p<10)
4503 repsize += 2+1+1;
4504 else if (*p<100)
4505 repsize += 2+2+1;
4506 else if (*p<1000)
4507 repsize += 2+3+1;
4508 else if (*p<10000)
4509 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004510#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 else
4512 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004513#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 else if (*p<100000)
4515 repsize += 2+5+1;
4516 else if (*p<1000000)
4517 repsize += 2+6+1;
4518 else
4519 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004520#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 }
4522 requiredsize = respos+repsize+(endp-collend);
4523 if (requiredsize > ressize) {
4524 if (requiredsize<2*ressize)
4525 requiredsize = 2*ressize;
4526 if (_PyBytes_Resize(&res, requiredsize))
4527 goto onError;
4528 str = PyBytes_AS_STRING(res) + respos;
4529 ressize = requiredsize;
4530 }
4531 /* generate replacement (temporarily (mis)uses p) */
4532 for (p = collstart; p < collend; ++p) {
4533 str += sprintf(str, "&#%d;", (int)*p);
4534 }
4535 p = collend;
4536 break;
4537 default:
4538 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4539 encoding, reason, startp, size, &exc,
4540 collstart-startp, collend-startp, &newpos);
4541 if (repunicode == NULL)
4542 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004543 if (PyBytes_Check(repunicode)) {
4544 /* Directly copy bytes result to output. */
4545 repsize = PyBytes_Size(repunicode);
4546 if (repsize > 1) {
4547 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004548 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004549 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4550 Py_DECREF(repunicode);
4551 goto onError;
4552 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004553 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004554 ressize += repsize-1;
4555 }
4556 memcpy(str, PyBytes_AsString(repunicode), repsize);
4557 str += repsize;
4558 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004559 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004560 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004561 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 /* need more space? (at least enough for what we
4563 have+the replacement+the rest of the string, so
4564 we won't have to check space for encodable characters) */
4565 respos = str - PyBytes_AS_STRING(res);
4566 repsize = PyUnicode_GET_SIZE(repunicode);
4567 requiredsize = respos+repsize+(endp-collend);
4568 if (requiredsize > ressize) {
4569 if (requiredsize<2*ressize)
4570 requiredsize = 2*ressize;
4571 if (_PyBytes_Resize(&res, requiredsize)) {
4572 Py_DECREF(repunicode);
4573 goto onError;
4574 }
4575 str = PyBytes_AS_STRING(res) + respos;
4576 ressize = requiredsize;
4577 }
4578 /* check if there is anything unencodable in the replacement
4579 and copy it to the output */
4580 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4581 c = *uni2;
4582 if (c >= limit) {
4583 raise_encode_exception(&exc, encoding, startp, size,
4584 unicodepos, unicodepos+1, reason);
4585 Py_DECREF(repunicode);
4586 goto onError;
4587 }
4588 *str = (char)c;
4589 }
4590 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004591 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004592 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004593 }
4594 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004595 /* Resize if we allocated to much */
4596 size = str - PyBytes_AS_STRING(res);
4597 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004598 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004599 if (_PyBytes_Resize(&res, size) < 0)
4600 goto onError;
4601 }
4602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 Py_XDECREF(errorHandler);
4604 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004605 return res;
4606
4607 onError:
4608 Py_XDECREF(res);
4609 Py_XDECREF(errorHandler);
4610 Py_XDECREF(exc);
4611 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612}
4613
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 Py_ssize_t size,
4616 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004617{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619}
4620
4621PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4622{
4623 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 PyErr_BadArgument();
4625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 }
4627 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004628 PyUnicode_GET_SIZE(unicode),
4629 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630}
4631
4632/* --- 7-bit ASCII Codec -------------------------------------------------- */
4633
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 Py_ssize_t size,
4636 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004638 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639 PyUnicodeObject *v;
4640 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004641 Py_ssize_t startinpos;
4642 Py_ssize_t endinpos;
4643 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 const char *e;
4645 PyObject *errorHandler = NULL;
4646 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004647
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004649 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 Py_UNICODE r = *(unsigned char*)s;
4651 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004652 }
Tim Petersced69f82003-09-16 20:30:58 +00004653
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654 v = _PyUnicode_New(size);
4655 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004660 e = s + size;
4661 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 register unsigned char c = (unsigned char)*s;
4663 if (c < 128) {
4664 *p++ = c;
4665 ++s;
4666 }
4667 else {
4668 startinpos = s-starts;
4669 endinpos = startinpos + 1;
4670 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4671 if (unicode_decode_call_errorhandler(
4672 errors, &errorHandler,
4673 "ascii", "ordinal not in range(128)",
4674 &starts, &e, &startinpos, &endinpos, &exc, &s,
4675 &v, &outpos, &p))
4676 goto onError;
4677 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004679 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4681 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004682 Py_XDECREF(errorHandler);
4683 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004685
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004688 Py_XDECREF(errorHandler);
4689 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 return NULL;
4691}
4692
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 Py_ssize_t size,
4695 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698}
4699
4700PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4701{
4702 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 PyErr_BadArgument();
4704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 }
4706 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004707 PyUnicode_GET_SIZE(unicode),
4708 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709}
4710
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004711#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004712
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004713/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004714
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004715#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004716#define NEED_RETRY
4717#endif
4718
4719/* XXX This code is limited to "true" double-byte encodings, as
4720 a) it assumes an incomplete character consists of a single byte, and
4721 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004723
/* Return non-zero when the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   IsDBCSLeadByte() alone is not sufficient because it only looks at the byte
   value, so the character preceding the byte (as found by CharPrev()) is
   consulted as well. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() did not move: nothing precedes curr */
    if (prev == curr)
        return 1;
    /* the preceding character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the preceding character is a complete two-byte character */
    return curr - prev == 2;
}
4734
4735/*
4736 * Decode MBCS string into unicode object. If 'final' is set, converts
4737 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4738 */
4739static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 const char *s, /* MBCS string */
4741 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004742 int final,
4743 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004744{
4745 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004746 Py_ssize_t n;
4747 DWORD usize;
4748 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004749
4750 assert(size >= 0);
4751
Victor Stinner554f3f02010-06-16 23:33:54 +00004752 /* check and handle 'errors' arg */
4753 if (errors==NULL || strcmp(errors, "strict")==0)
4754 flags = MB_ERR_INVALID_CHARS;
4755 else if (strcmp(errors, "ignore")==0)
4756 flags = 0;
4757 else {
4758 PyErr_Format(PyExc_ValueError,
4759 "mbcs encoding does not support errors='%s'",
4760 errors);
4761 return -1;
4762 }
4763
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004764 /* Skip trailing lead-byte unless 'final' is set */
4765 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004767
4768 /* First get the size of the result */
4769 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004770 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4771 if (usize==0)
4772 goto mbcs_decode_error;
4773 } else
4774 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004775
4776 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 /* Create unicode object */
4778 *v = _PyUnicode_New(usize);
4779 if (*v == NULL)
4780 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004781 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004782 }
4783 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 /* Extend unicode object */
4785 n = PyUnicode_GET_SIZE(*v);
4786 if (_PyUnicode_Resize(v, n + usize) < 0)
4787 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004788 }
4789
4790 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004791 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004793 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4794 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004795 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004796 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004797 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004798
4799mbcs_decode_error:
4800 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4801 we raise a UnicodeDecodeError - else it is a 'generic'
4802 windows error
4803 */
4804 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4805 /* Ideally, we should get reason from FormatMessage - this
4806 is the Windows 2000 English version of the message
4807 */
4808 PyObject *exc = NULL;
4809 const char *reason = "No mapping for the Unicode character exists "
4810 "in the target multi-byte code page.";
4811 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4812 if (exc != NULL) {
4813 PyCodec_StrictErrors(exc);
4814 Py_DECREF(exc);
4815 }
4816 } else {
4817 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4818 }
4819 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004820}
4821
4822PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004823 Py_ssize_t size,
4824 const char *errors,
4825 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004826{
4827 PyUnicodeObject *v = NULL;
4828 int done;
4829
4830 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004832
4833#ifdef NEED_RETRY
4834 retry:
4835 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004836 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004837 else
4838#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004839 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004840
4841 if (done < 0) {
4842 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004844 }
4845
4846 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004848
4849#ifdef NEED_RETRY
4850 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 s += done;
4852 size -= done;
4853 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004854 }
4855#endif
4856
4857 return (PyObject *)v;
4858}
4859
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004860PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 Py_ssize_t size,
4862 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004863{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004864 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4865}
4866
4867/*
4868 * Convert unicode into string object (MBCS).
4869 * Returns 0 if succeed, -1 otherwise.
4870 */
4871static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004873 int size, /* size of unicode */
4874 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004875{
Victor Stinner554f3f02010-06-16 23:33:54 +00004876 BOOL usedDefaultChar = FALSE;
4877 BOOL *pusedDefaultChar;
4878 int mbcssize;
4879 Py_ssize_t n;
4880 PyObject *exc = NULL;
4881 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004882
4883 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004884
Victor Stinner554f3f02010-06-16 23:33:54 +00004885 /* check and handle 'errors' arg */
4886 if (errors==NULL || strcmp(errors, "strict")==0) {
4887 flags = WC_NO_BEST_FIT_CHARS;
4888 pusedDefaultChar = &usedDefaultChar;
4889 } else if (strcmp(errors, "replace")==0) {
4890 flags = 0;
4891 pusedDefaultChar = NULL;
4892 } else {
4893 PyErr_Format(PyExc_ValueError,
4894 "mbcs encoding does not support errors='%s'",
4895 errors);
4896 return -1;
4897 }
4898
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004899 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004900 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004901 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4902 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 if (mbcssize == 0) {
4904 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4905 return -1;
4906 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004907 /* If we used a default char, then we failed! */
4908 if (pusedDefaultChar && *pusedDefaultChar)
4909 goto mbcs_encode_error;
4910 } else {
4911 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004912 }
4913
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004914 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004915 /* Create string object */
4916 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4917 if (*repr == NULL)
4918 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004919 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004920 }
4921 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 /* Extend string object */
4923 n = PyBytes_Size(*repr);
4924 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4925 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004926 }
4927
4928 /* Do the conversion */
4929 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004931 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4932 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4934 return -1;
4935 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004936 if (pusedDefaultChar && *pusedDefaultChar)
4937 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004938 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004939 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004940
4941mbcs_encode_error:
4942 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4943 Py_XDECREF(exc);
4944 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004945}
4946
4947PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 Py_ssize_t size,
4949 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004950{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004951 PyObject *repr = NULL;
4952 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004953
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004954#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004956 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004957 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004958 else
4959#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004960 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004961
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004962 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 Py_XDECREF(repr);
4964 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004965 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004966
4967#ifdef NEED_RETRY
4968 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 p += INT_MAX;
4970 size -= INT_MAX;
4971 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004972 }
4973#endif
4974
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004975 return repr;
4976}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004977
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004978PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4979{
4980 if (!PyUnicode_Check(unicode)) {
4981 PyErr_BadArgument();
4982 return NULL;
4983 }
4984 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 PyUnicode_GET_SIZE(unicode),
4986 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004987}
4988
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004989#undef NEED_RETRY
4990
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004991#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004992
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993/* --- Character Mapping Codec -------------------------------------------- */
4994
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 Py_ssize_t size,
4997 PyObject *mapping,
4998 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005001 Py_ssize_t startinpos;
5002 Py_ssize_t endinpos;
5003 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005004 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 PyUnicodeObject *v;
5006 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005007 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005008 PyObject *errorHandler = NULL;
5009 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005010 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005011 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005012
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 /* Default to Latin-1 */
5014 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016
5017 v = _PyUnicode_New(size);
5018 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005023 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005024 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 mapstring = PyUnicode_AS_UNICODE(mapping);
5026 maplen = PyUnicode_GET_SIZE(mapping);
5027 while (s < e) {
5028 unsigned char ch = *s;
5029 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 if (ch < maplen)
5032 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 if (x == 0xfffe) {
5035 /* undefined mapping */
5036 outpos = p-PyUnicode_AS_UNICODE(v);
5037 startinpos = s-starts;
5038 endinpos = startinpos+1;
5039 if (unicode_decode_call_errorhandler(
5040 errors, &errorHandler,
5041 "charmap", "character maps to <undefined>",
5042 &starts, &e, &startinpos, &endinpos, &exc, &s,
5043 &v, &outpos, &p)) {
5044 goto onError;
5045 }
5046 continue;
5047 }
5048 *p++ = x;
5049 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005050 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005051 }
5052 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 while (s < e) {
5054 unsigned char ch = *s;
5055 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005056
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5058 w = PyLong_FromLong((long)ch);
5059 if (w == NULL)
5060 goto onError;
5061 x = PyObject_GetItem(mapping, w);
5062 Py_DECREF(w);
5063 if (x == NULL) {
5064 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5065 /* No mapping found means: mapping is undefined. */
5066 PyErr_Clear();
5067 x = Py_None;
5068 Py_INCREF(x);
5069 } else
5070 goto onError;
5071 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005072
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 /* Apply mapping */
5074 if (PyLong_Check(x)) {
5075 long value = PyLong_AS_LONG(x);
5076 if (value < 0 || value > 65535) {
5077 PyErr_SetString(PyExc_TypeError,
5078 "character mapping must be in range(65536)");
5079 Py_DECREF(x);
5080 goto onError;
5081 }
5082 *p++ = (Py_UNICODE)value;
5083 }
5084 else if (x == Py_None) {
5085 /* undefined mapping */
5086 outpos = p-PyUnicode_AS_UNICODE(v);
5087 startinpos = s-starts;
5088 endinpos = startinpos+1;
5089 if (unicode_decode_call_errorhandler(
5090 errors, &errorHandler,
5091 "charmap", "character maps to <undefined>",
5092 &starts, &e, &startinpos, &endinpos, &exc, &s,
5093 &v, &outpos, &p)) {
5094 Py_DECREF(x);
5095 goto onError;
5096 }
5097 Py_DECREF(x);
5098 continue;
5099 }
5100 else if (PyUnicode_Check(x)) {
5101 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005102
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 if (targetsize == 1)
5104 /* 1-1 mapping */
5105 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005106
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 else if (targetsize > 1) {
5108 /* 1-n mapping */
5109 if (targetsize > extrachars) {
5110 /* resize first */
5111 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5112 Py_ssize_t needed = (targetsize - extrachars) + \
5113 (targetsize << 2);
5114 extrachars += needed;
5115 /* XXX overflow detection missing */
5116 if (_PyUnicode_Resize(&v,
5117 PyUnicode_GET_SIZE(v) + needed) < 0) {
5118 Py_DECREF(x);
5119 goto onError;
5120 }
5121 p = PyUnicode_AS_UNICODE(v) + oldpos;
5122 }
5123 Py_UNICODE_COPY(p,
5124 PyUnicode_AS_UNICODE(x),
5125 targetsize);
5126 p += targetsize;
5127 extrachars -= targetsize;
5128 }
5129 /* 1-0 mapping: skip the character */
5130 }
5131 else {
5132 /* wrong return value */
5133 PyErr_SetString(PyExc_TypeError,
5134 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005135 Py_DECREF(x);
5136 goto onError;
5137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 Py_DECREF(x);
5139 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 }
5142 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5144 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005145 Py_XDECREF(errorHandler);
5146 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005148
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150 Py_XDECREF(errorHandler);
5151 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 Py_XDECREF(v);
5153 return NULL;
5154}
5155
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005156/* Charmap encoding: the lookup table */
5157
5158struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 PyObject_HEAD
5160 unsigned char level1[32];
5161 int count2, count3;
5162 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005163};
5164
5165static PyObject*
5166encoding_map_size(PyObject *obj, PyObject* args)
5167{
5168 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005169 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005171}
5172
5173static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005174 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 PyDoc_STR("Return the size (in bytes) of this object") },
5176 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005177};
5178
5179static void
5180encoding_map_dealloc(PyObject* o)
5181{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005182 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005183}
5184
5185static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005186 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 "EncodingMap", /*tp_name*/
5188 sizeof(struct encoding_map), /*tp_basicsize*/
5189 0, /*tp_itemsize*/
5190 /* methods */
5191 encoding_map_dealloc, /*tp_dealloc*/
5192 0, /*tp_print*/
5193 0, /*tp_getattr*/
5194 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005195 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 0, /*tp_repr*/
5197 0, /*tp_as_number*/
5198 0, /*tp_as_sequence*/
5199 0, /*tp_as_mapping*/
5200 0, /*tp_hash*/
5201 0, /*tp_call*/
5202 0, /*tp_str*/
5203 0, /*tp_getattro*/
5204 0, /*tp_setattro*/
5205 0, /*tp_as_buffer*/
5206 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5207 0, /*tp_doc*/
5208 0, /*tp_traverse*/
5209 0, /*tp_clear*/
5210 0, /*tp_richcompare*/
5211 0, /*tp_weaklistoffset*/
5212 0, /*tp_iter*/
5213 0, /*tp_iternext*/
5214 encoding_map_methods, /*tp_methods*/
5215 0, /*tp_members*/
5216 0, /*tp_getset*/
5217 0, /*tp_base*/
5218 0, /*tp_dict*/
5219 0, /*tp_descr_get*/
5220 0, /*tp_descr_set*/
5221 0, /*tp_dictoffset*/
5222 0, /*tp_init*/
5223 0, /*tp_alloc*/
5224 0, /*tp_new*/
5225 0, /*tp_free*/
5226 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005227};
5228
5229PyObject*
5230PyUnicode_BuildEncodingMap(PyObject* string)
5231{
5232 Py_UNICODE *decode;
5233 PyObject *result;
5234 struct encoding_map *mresult;
5235 int i;
5236 int need_dict = 0;
5237 unsigned char level1[32];
5238 unsigned char level2[512];
5239 unsigned char *mlevel1, *mlevel2, *mlevel3;
5240 int count2 = 0, count3 = 0;
5241
5242 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5243 PyErr_BadArgument();
5244 return NULL;
5245 }
5246 decode = PyUnicode_AS_UNICODE(string);
5247 memset(level1, 0xFF, sizeof level1);
5248 memset(level2, 0xFF, sizeof level2);
5249
5250 /* If there isn't a one-to-one mapping of NULL to \0,
5251 or if there are non-BMP characters, we need to use
5252 a mapping dictionary. */
5253 if (decode[0] != 0)
5254 need_dict = 1;
5255 for (i = 1; i < 256; i++) {
5256 int l1, l2;
5257 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005258#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005259 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005260#endif
5261 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005262 need_dict = 1;
5263 break;
5264 }
5265 if (decode[i] == 0xFFFE)
5266 /* unmapped character */
5267 continue;
5268 l1 = decode[i] >> 11;
5269 l2 = decode[i] >> 7;
5270 if (level1[l1] == 0xFF)
5271 level1[l1] = count2++;
5272 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005273 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005274 }
5275
5276 if (count2 >= 0xFF || count3 >= 0xFF)
5277 need_dict = 1;
5278
5279 if (need_dict) {
5280 PyObject *result = PyDict_New();
5281 PyObject *key, *value;
5282 if (!result)
5283 return NULL;
5284 for (i = 0; i < 256; i++) {
5285 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005286 key = PyLong_FromLong(decode[i]);
5287 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005288 if (!key || !value)
5289 goto failed1;
5290 if (PyDict_SetItem(result, key, value) == -1)
5291 goto failed1;
5292 Py_DECREF(key);
5293 Py_DECREF(value);
5294 }
5295 return result;
5296 failed1:
5297 Py_XDECREF(key);
5298 Py_XDECREF(value);
5299 Py_DECREF(result);
5300 return NULL;
5301 }
5302
5303 /* Create a three-level trie */
5304 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5305 16*count2 + 128*count3 - 1);
5306 if (!result)
5307 return PyErr_NoMemory();
5308 PyObject_Init(result, &EncodingMapType);
5309 mresult = (struct encoding_map*)result;
5310 mresult->count2 = count2;
5311 mresult->count3 = count3;
5312 mlevel1 = mresult->level1;
5313 mlevel2 = mresult->level23;
5314 mlevel3 = mresult->level23 + 16*count2;
5315 memcpy(mlevel1, level1, 32);
5316 memset(mlevel2, 0xFF, 16*count2);
5317 memset(mlevel3, 0, 128*count3);
5318 count3 = 0;
5319 for (i = 1; i < 256; i++) {
5320 int o1, o2, o3, i2, i3;
5321 if (decode[i] == 0xFFFE)
5322 /* unmapped character */
5323 continue;
5324 o1 = decode[i]>>11;
5325 o2 = (decode[i]>>7) & 0xF;
5326 i2 = 16*mlevel1[o1] + o2;
5327 if (mlevel2[i2] == 0xFF)
5328 mlevel2[i2] = count3++;
5329 o3 = decode[i] & 0x7F;
5330 i3 = 128*mlevel2[i2] + o3;
5331 mlevel3[i3] = i;
5332 }
5333 return result;
5334}
5335
5336static int
5337encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5338{
5339 struct encoding_map *map = (struct encoding_map*)mapping;
5340 int l1 = c>>11;
5341 int l2 = (c>>7) & 0xF;
5342 int l3 = c & 0x7F;
5343 int i;
5344
5345#ifdef Py_UNICODE_WIDE
5346 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005348 }
5349#endif
5350 if (c == 0)
5351 return 0;
5352 /* level 1*/
5353 i = map->level1[l1];
5354 if (i == 0xFF) {
5355 return -1;
5356 }
5357 /* level 2*/
5358 i = map->level23[16*i+l2];
5359 if (i == 0xFF) {
5360 return -1;
5361 }
5362 /* level 3 */
5363 i = map->level23[16*map->count2 + 128*i + l3];
5364 if (i == 0) {
5365 return -1;
5366 }
5367 return i;
5368}
5369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370/* Lookup the character ch in the mapping. If the character
5371 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005372 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374{
Christian Heimes217cfd12007-12-02 14:31:20 +00005375 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376 PyObject *x;
5377
5378 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380 x = PyObject_GetItem(mapping, w);
5381 Py_DECREF(w);
5382 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5384 /* No mapping found means: mapping is undefined. */
5385 PyErr_Clear();
5386 x = Py_None;
5387 Py_INCREF(x);
5388 return x;
5389 } else
5390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005392 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005394 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 long value = PyLong_AS_LONG(x);
5396 if (value < 0 || value > 255) {
5397 PyErr_SetString(PyExc_TypeError,
5398 "character mapping must be in range(256)");
5399 Py_DECREF(x);
5400 return NULL;
5401 }
5402 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005404 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 /* wrong return value */
5408 PyErr_Format(PyExc_TypeError,
5409 "character mapping must return integer, bytes or None, not %.400s",
5410 x->ob_type->tp_name);
5411 Py_DECREF(x);
5412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 }
5414}
5415
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005416static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005417charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005418{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005419 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5420 /* exponentially overallocate to minimize reallocations */
5421 if (requiredsize < 2*outsize)
5422 requiredsize = 2*outsize;
5423 if (_PyBytes_Resize(outobj, requiredsize))
5424 return -1;
5425 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005426}
5427
/* Outcome of encoding a single character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,      /* the character was encoded and written */
    enc_FAILED,       /* the mapping has no entry for the character */
    enc_EXCEPTION     /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005432 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005433 space is available. Return a new reference to the object that
5434 was put in the output buffer, or Py_None, if the mapping was undefined
5435 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005436 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005438charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005441 PyObject *rep;
5442 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005443 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444
Christian Heimes90aa7642007-12-19 02:45:37 +00005445 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005446 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005448 if (res == -1)
5449 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 if (outsize<requiredsize)
5451 if (charmapencode_resize(outobj, outpos, requiredsize))
5452 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005453 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 outstart[(*outpos)++] = (char)res;
5455 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005456 }
5457
5458 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005461 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 Py_DECREF(rep);
5463 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005464 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 if (PyLong_Check(rep)) {
5466 Py_ssize_t requiredsize = *outpos+1;
5467 if (outsize<requiredsize)
5468 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5469 Py_DECREF(rep);
5470 return enc_EXCEPTION;
5471 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005472 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005474 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 else {
5476 const char *repchars = PyBytes_AS_STRING(rep);
5477 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5478 Py_ssize_t requiredsize = *outpos+repsize;
5479 if (outsize<requiredsize)
5480 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5481 Py_DECREF(rep);
5482 return enc_EXCEPTION;
5483 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005484 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 memcpy(outstart + *outpos, repchars, repsize);
5486 *outpos += repsize;
5487 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005489 Py_DECREF(rep);
5490 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005491}
5492
5493/* handle an error in PyUnicode_EncodeCharmap
5494 Return 0 on success, -1 on error */
5495static
5496int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005497 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005499 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005500 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501{
5502 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005503 Py_ssize_t repsize;
5504 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 Py_UNICODE *uni2;
5506 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005507 Py_ssize_t collstartpos = *inpos;
5508 Py_ssize_t collendpos = *inpos+1;
5509 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 char *encoding = "charmap";
5511 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005512 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514 /* find all unencodable characters */
5515 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005516 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005517 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 int res = encoding_map_lookup(p[collendpos], mapping);
5519 if (res != -1)
5520 break;
5521 ++collendpos;
5522 continue;
5523 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005524
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 rep = charmapencode_lookup(p[collendpos], mapping);
5526 if (rep==NULL)
5527 return -1;
5528 else if (rep!=Py_None) {
5529 Py_DECREF(rep);
5530 break;
5531 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005532 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005534 }
5535 /* cache callback name lookup
5536 * (if not done yet, i.e. it's the first error) */
5537 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 if ((errors==NULL) || (!strcmp(errors, "strict")))
5539 *known_errorHandler = 1;
5540 else if (!strcmp(errors, "replace"))
5541 *known_errorHandler = 2;
5542 else if (!strcmp(errors, "ignore"))
5543 *known_errorHandler = 3;
5544 else if (!strcmp(errors, "xmlcharrefreplace"))
5545 *known_errorHandler = 4;
5546 else
5547 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005548 }
5549 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005550 case 1: /* strict */
5551 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5552 return -1;
5553 case 2: /* replace */
5554 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 x = charmapencode_output('?', mapping, res, respos);
5556 if (x==enc_EXCEPTION) {
5557 return -1;
5558 }
5559 else if (x==enc_FAILED) {
5560 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5561 return -1;
5562 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005563 }
5564 /* fall through */
5565 case 3: /* ignore */
5566 *inpos = collendpos;
5567 break;
5568 case 4: /* xmlcharrefreplace */
5569 /* generate replacement (temporarily (mis)uses p) */
5570 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 char buffer[2+29+1+1];
5572 char *cp;
5573 sprintf(buffer, "&#%d;", (int)p[collpos]);
5574 for (cp = buffer; *cp; ++cp) {
5575 x = charmapencode_output(*cp, mapping, res, respos);
5576 if (x==enc_EXCEPTION)
5577 return -1;
5578 else if (x==enc_FAILED) {
5579 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5580 return -1;
5581 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005582 }
5583 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005584 *inpos = collendpos;
5585 break;
5586 default:
5587 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 encoding, reason, p, size, exceptionObject,
5589 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005590 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005592 if (PyBytes_Check(repunicode)) {
5593 /* Directly copy bytes result to output. */
5594 Py_ssize_t outsize = PyBytes_Size(*res);
5595 Py_ssize_t requiredsize;
5596 repsize = PyBytes_Size(repunicode);
5597 requiredsize = *respos + repsize;
5598 if (requiredsize > outsize)
5599 /* Make room for all additional bytes. */
5600 if (charmapencode_resize(res, respos, requiredsize)) {
5601 Py_DECREF(repunicode);
5602 return -1;
5603 }
5604 memcpy(PyBytes_AsString(*res) + *respos,
5605 PyBytes_AsString(repunicode), repsize);
5606 *respos += repsize;
5607 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005608 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005609 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005611 /* generate replacement */
5612 repsize = PyUnicode_GET_SIZE(repunicode);
5613 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 x = charmapencode_output(*uni2, mapping, res, respos);
5615 if (x==enc_EXCEPTION) {
5616 return -1;
5617 }
5618 else if (x==enc_FAILED) {
5619 Py_DECREF(repunicode);
5620 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5621 return -1;
5622 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005623 }
5624 *inpos = newpos;
5625 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005626 }
5627 return 0;
5628}
5629
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 Py_ssize_t size,
5632 PyObject *mapping,
5633 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005635 /* output object */
5636 PyObject *res = NULL;
5637 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005638 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 PyObject *errorHandler = NULL;
5642 PyObject *exc = NULL;
5643 /* the following variable is used for caching string comparisons
5644 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5645 * 3=ignore, 4=xmlcharrefreplace */
5646 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647
5648 /* Default to Latin-1 */
5649 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005652 /* allocate enough for a simple encoding without
5653 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005654 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 if (res == NULL)
5656 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005657 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 /* try to encode it */
5662 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5663 if (x==enc_EXCEPTION) /* error */
5664 goto onError;
5665 if (x==enc_FAILED) { /* unencodable character */
5666 if (charmap_encoding_error(p, size, &inpos, mapping,
5667 &exc,
5668 &known_errorHandler, &errorHandler, errors,
5669 &res, &respos)) {
5670 goto onError;
5671 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005672 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005673 else
5674 /* done with this character => adjust input position */
5675 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005678 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005679 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005680 if (_PyBytes_Resize(&res, respos) < 0)
5681 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005682
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 Py_XDECREF(exc);
5684 Py_XDECREF(errorHandler);
5685 return res;
5686
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 Py_XDECREF(res);
5689 Py_XDECREF(exc);
5690 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 return NULL;
5692}
5693
5694PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696{
5697 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 PyErr_BadArgument();
5699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 }
5701 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 PyUnicode_GET_SIZE(unicode),
5703 mapping,
5704 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705}
5706
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707/* create or adjust a UnicodeTranslateError */
5708static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 const Py_UNICODE *unicode, Py_ssize_t size,
5710 Py_ssize_t startpos, Py_ssize_t endpos,
5711 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005714 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 }
5717 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5719 goto onError;
5720 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5721 goto onError;
5722 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5723 goto onError;
5724 return;
5725 onError:
5726 Py_DECREF(*exceptionObject);
5727 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 }
5729}
5730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731/* raises a UnicodeTranslateError */
5732static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 const Py_UNICODE *unicode, Py_ssize_t size,
5734 Py_ssize_t startpos, Py_ssize_t endpos,
5735 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736{
5737 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741}
5742
5743/* error handling callback helper:
5744 build arguments, call the callback and check the arguments,
5745 put the result into newpos and return the replacement string, which
5746 has to be freed by the caller */
5747static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 PyObject **errorHandler,
5749 const char *reason,
5750 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5751 Py_ssize_t startpos, Py_ssize_t endpos,
5752 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005754 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005756 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 PyObject *restuple;
5758 PyObject *resunicode;
5759
5760 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 }
5765
5766 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770
5771 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005776 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 Py_DECREF(restuple);
5778 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779 }
5780 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 &resunicode, &i_newpos)) {
5782 Py_DECREF(restuple);
5783 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005785 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005787 else
5788 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005789 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5791 Py_DECREF(restuple);
5792 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005793 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794 Py_INCREF(resunicode);
5795 Py_DECREF(restuple);
5796 return resunicode;
5797}
5798
5799/* Lookup the character ch in the mapping and put the result in result,
5800 which must be decrefed by the caller.
5801 Return 0 on success, -1 on error */
5802static
5803int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5804{
Christian Heimes217cfd12007-12-02 14:31:20 +00005805 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 PyObject *x;
5807
5808 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 x = PyObject_GetItem(mapping, w);
5811 Py_DECREF(w);
5812 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5814 /* No mapping found means: use 1:1 mapping. */
5815 PyErr_Clear();
5816 *result = NULL;
5817 return 0;
5818 } else
5819 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 }
5821 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 *result = x;
5823 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005825 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 long value = PyLong_AS_LONG(x);
5827 long max = PyUnicode_GetMax();
5828 if (value < 0 || value > max) {
5829 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005830 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 Py_DECREF(x);
5832 return -1;
5833 }
5834 *result = x;
5835 return 0;
5836 }
5837 else if (PyUnicode_Check(x)) {
5838 *result = x;
5839 return 0;
5840 }
5841 else {
5842 /* wrong return value */
5843 PyErr_SetString(PyExc_TypeError,
5844 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005845 Py_DECREF(x);
5846 return -1;
5847 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848}
5849/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 if not reallocate and adjust various state variables.
5851 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852static
Walter Dörwald4894c302003-10-24 14:25:28 +00005853int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005856 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005857 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 /* remember old output position */
5859 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5860 /* exponentially overallocate to minimize reallocations */
5861 if (requiredsize < 2 * oldsize)
5862 requiredsize = 2 * oldsize;
5863 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5864 return -1;
5865 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 }
5867 return 0;
5868}
5869/* lookup the character, put the result in the output string and adjust
5870 various state variables. Return a new reference to the object that
5871 was put in the output buffer in *result, or Py_None, if the mapping was
5872 undefined (in which case no character was written).
5873 The called must decref result.
5874 Return 0 on success, -1 on error. */
5875static
Walter Dörwald4894c302003-10-24 14:25:28 +00005876int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5878 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879{
Walter Dörwald4894c302003-10-24 14:25:28 +00005880 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 /* not found => default to 1:1 mapping */
5884 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885 }
5886 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005888 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 /* no overflow check, because we know that the space is enough */
5890 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005891 }
5892 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5894 if (repsize==1) {
5895 /* no overflow check, because we know that the space is enough */
5896 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5897 }
5898 else if (repsize!=0) {
5899 /* more than one character */
5900 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5901 (insize - (curinp-startinp)) +
5902 repsize - 1;
5903 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5904 return -1;
5905 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5906 *outp += repsize;
5907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 }
5909 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 return 0;
5912}
5913
5914PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 Py_ssize_t size,
5916 PyObject *mapping,
5917 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 /* output object */
5920 PyObject *res = NULL;
5921 /* pointers to the beginning and end+1 of input */
5922 const Py_UNICODE *startp = p;
5923 const Py_UNICODE *endp = p + size;
5924 /* pointer into the output */
5925 Py_UNICODE *str;
5926 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005927 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005928 char *reason = "character maps to <undefined>";
5929 PyObject *errorHandler = NULL;
5930 PyObject *exc = NULL;
5931 /* the following variable is used for caching string comparisons
5932 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5933 * 3=ignore, 4=xmlcharrefreplace */
5934 int known_errorHandler = -1;
5935
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 PyErr_BadArgument();
5938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005940
5941 /* allocate enough for a simple 1:1 translation without
5942 replacements, if we need more, we'll resize */
5943 res = PyUnicode_FromUnicode(NULL, size);
5944 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005948 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005950 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 /* try to encode it */
5952 PyObject *x = NULL;
5953 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5954 Py_XDECREF(x);
5955 goto onError;
5956 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005957 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 if (x!=Py_None) /* it worked => adjust input pointer */
5959 ++p;
5960 else { /* untranslatable character */
5961 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5962 Py_ssize_t repsize;
5963 Py_ssize_t newpos;
5964 Py_UNICODE *uni2;
5965 /* startpos for collecting untranslatable chars */
5966 const Py_UNICODE *collstart = p;
5967 const Py_UNICODE *collend = p+1;
5968 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 /* find all untranslatable characters */
5971 while (collend < endp) {
5972 if (charmaptranslate_lookup(*collend, mapping, &x))
5973 goto onError;
5974 Py_XDECREF(x);
5975 if (x!=Py_None)
5976 break;
5977 ++collend;
5978 }
5979 /* cache callback name lookup
5980 * (if not done yet, i.e. it's the first error) */
5981 if (known_errorHandler==-1) {
5982 if ((errors==NULL) || (!strcmp(errors, "strict")))
5983 known_errorHandler = 1;
5984 else if (!strcmp(errors, "replace"))
5985 known_errorHandler = 2;
5986 else if (!strcmp(errors, "ignore"))
5987 known_errorHandler = 3;
5988 else if (!strcmp(errors, "xmlcharrefreplace"))
5989 known_errorHandler = 4;
5990 else
5991 known_errorHandler = 0;
5992 }
5993 switch (known_errorHandler) {
5994 case 1: /* strict */
5995 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005996 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 case 2: /* replace */
5998 /* No need to check for space, this is a 1:1 replacement */
5999 for (coll = collstart; coll<collend; ++coll)
6000 *str++ = '?';
6001 /* fall through */
6002 case 3: /* ignore */
6003 p = collend;
6004 break;
6005 case 4: /* xmlcharrefreplace */
6006 /* generate replacement (temporarily (mis)uses p) */
6007 for (p = collstart; p < collend; ++p) {
6008 char buffer[2+29+1+1];
6009 char *cp;
6010 sprintf(buffer, "&#%d;", (int)*p);
6011 if (charmaptranslate_makespace(&res, &str,
6012 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6013 goto onError;
6014 for (cp = buffer; *cp; ++cp)
6015 *str++ = *cp;
6016 }
6017 p = collend;
6018 break;
6019 default:
6020 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6021 reason, startp, size, &exc,
6022 collstart-startp, collend-startp, &newpos);
6023 if (repunicode == NULL)
6024 goto onError;
6025 /* generate replacement */
6026 repsize = PyUnicode_GET_SIZE(repunicode);
6027 if (charmaptranslate_makespace(&res, &str,
6028 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6029 Py_DECREF(repunicode);
6030 goto onError;
6031 }
6032 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6033 *str++ = *uni2;
6034 p = startp + newpos;
6035 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006036 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006037 }
6038 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039 /* Resize if we allocated to much */
6040 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006041 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 if (PyUnicode_Resize(&res, respos) < 0)
6043 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 }
6045 Py_XDECREF(exc);
6046 Py_XDECREF(errorHandler);
6047 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 Py_XDECREF(res);
6051 Py_XDECREF(exc);
6052 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 return NULL;
6054}
6055
6056PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 PyObject *mapping,
6058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059{
6060 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 str = PyUnicode_FromObject(str);
6063 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 PyUnicode_GET_SIZE(str),
6067 mapping,
6068 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 Py_DECREF(str);
6070 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006071
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 Py_XDECREF(str);
6074 return NULL;
6075}
Tim Petersced69f82003-09-16 20:30:58 +00006076
Guido van Rossum9e896b32000-04-05 20:11:21 +00006077/* --- Decimal Encoder ---------------------------------------------------- */
6078
6079int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 Py_ssize_t length,
6081 char *output,
6082 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006083{
6084 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085 PyObject *errorHandler = NULL;
6086 PyObject *exc = NULL;
6087 const char *encoding = "decimal";
6088 const char *reason = "invalid decimal Unicode string";
6089 /* the following variable is used for caching string comparisons
6090 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6091 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006092
6093 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 PyErr_BadArgument();
6095 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006096 }
6097
6098 p = s;
6099 end = s + length;
6100 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 register Py_UNICODE ch = *p;
6102 int decimal;
6103 PyObject *repunicode;
6104 Py_ssize_t repsize;
6105 Py_ssize_t newpos;
6106 Py_UNICODE *uni2;
6107 Py_UNICODE *collstart;
6108 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006109
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006111 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 ++p;
6113 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006114 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 decimal = Py_UNICODE_TODECIMAL(ch);
6116 if (decimal >= 0) {
6117 *output++ = '0' + decimal;
6118 ++p;
6119 continue;
6120 }
6121 if (0 < ch && ch < 256) {
6122 *output++ = (char)ch;
6123 ++p;
6124 continue;
6125 }
6126 /* All other characters are considered unencodable */
6127 collstart = p;
6128 collend = p+1;
6129 while (collend < end) {
6130 if ((0 < *collend && *collend < 256) ||
6131 !Py_UNICODE_ISSPACE(*collend) ||
6132 Py_UNICODE_TODECIMAL(*collend))
6133 break;
6134 }
6135 /* cache callback name lookup
6136 * (if not done yet, i.e. it's the first error) */
6137 if (known_errorHandler==-1) {
6138 if ((errors==NULL) || (!strcmp(errors, "strict")))
6139 known_errorHandler = 1;
6140 else if (!strcmp(errors, "replace"))
6141 known_errorHandler = 2;
6142 else if (!strcmp(errors, "ignore"))
6143 known_errorHandler = 3;
6144 else if (!strcmp(errors, "xmlcharrefreplace"))
6145 known_errorHandler = 4;
6146 else
6147 known_errorHandler = 0;
6148 }
6149 switch (known_errorHandler) {
6150 case 1: /* strict */
6151 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6152 goto onError;
6153 case 2: /* replace */
6154 for (p = collstart; p < collend; ++p)
6155 *output++ = '?';
6156 /* fall through */
6157 case 3: /* ignore */
6158 p = collend;
6159 break;
6160 case 4: /* xmlcharrefreplace */
6161 /* generate replacement (temporarily (mis)uses p) */
6162 for (p = collstart; p < collend; ++p)
6163 output += sprintf(output, "&#%d;", (int)*p);
6164 p = collend;
6165 break;
6166 default:
6167 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6168 encoding, reason, s, length, &exc,
6169 collstart-s, collend-s, &newpos);
6170 if (repunicode == NULL)
6171 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006172 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006173 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006174 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6175 Py_DECREF(repunicode);
6176 goto onError;
6177 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 /* generate replacement */
6179 repsize = PyUnicode_GET_SIZE(repunicode);
6180 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6181 Py_UNICODE ch = *uni2;
6182 if (Py_UNICODE_ISSPACE(ch))
6183 *output++ = ' ';
6184 else {
6185 decimal = Py_UNICODE_TODECIMAL(ch);
6186 if (decimal >= 0)
6187 *output++ = '0' + decimal;
6188 else if (0 < ch && ch < 256)
6189 *output++ = (char)ch;
6190 else {
6191 Py_DECREF(repunicode);
6192 raise_encode_exception(&exc, encoding,
6193 s, length, collstart-s, collend-s, reason);
6194 goto onError;
6195 }
6196 }
6197 }
6198 p = s + newpos;
6199 Py_DECREF(repunicode);
6200 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006201 }
6202 /* 0-terminate the output string */
6203 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006204 Py_XDECREF(exc);
6205 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006206 return 0;
6207
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209 Py_XDECREF(exc);
6210 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006211 return -1;
6212}
6213
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214/* --- Helpers ------------------------------------------------------------ */
6215
Eric Smith8c663262007-08-25 02:26:07 +00006216#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006217#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006218
Thomas Wouters477c8d52006-05-27 19:21:47 +00006219#include "stringlib/count.h"
6220#include "stringlib/find.h"
6221#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006222#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006223
Eric Smith5807c412008-05-11 21:00:57 +00006224#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006225#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006226#include "stringlib/localeutil.h"
6227
/* Helper macro to fix up start/end slice values in place against a
   sequence of length len:
     - end greater than len is clamped to len;
     - a negative end or start is taken relative to len and, if still
       negative, clamped to 0;
     - a start greater than len is left unchanged.
   start and end must be modifiable lvalues; each argument is evaluated
   more than once.  The body is wrapped in do { } while (0) so that the
   macro acts as a single statement (safe in an unbraced if/else); invoke
   it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006242
Martin v. Löwis18e16552006-02-15 17:27:45 +00006243Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006244 PyObject *substr,
6245 Py_ssize_t start,
6246 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006248 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006249 PyUnicodeObject* str_obj;
6250 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006251
Thomas Wouters477c8d52006-05-27 19:21:47 +00006252 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6253 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006255 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6256 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 Py_DECREF(str_obj);
6258 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 }
Tim Petersced69f82003-09-16 20:30:58 +00006260
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006261 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006262 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006263 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6264 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006265 );
6266
6267 Py_DECREF(sub_obj);
6268 Py_DECREF(str_obj);
6269
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 return result;
6271}
6272
Martin v. Löwis18e16552006-02-15 17:27:45 +00006273Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006274 PyObject *sub,
6275 Py_ssize_t start,
6276 Py_ssize_t end,
6277 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006279 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006280
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006282 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006284 sub = PyUnicode_FromObject(sub);
6285 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 Py_DECREF(str);
6287 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 }
Tim Petersced69f82003-09-16 20:30:58 +00006289
Thomas Wouters477c8d52006-05-27 19:21:47 +00006290 if (direction > 0)
6291 result = stringlib_find_slice(
6292 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6293 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6294 start, end
6295 );
6296 else
6297 result = stringlib_rfind_slice(
6298 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6299 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6300 start, end
6301 );
6302
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006304 Py_DECREF(sub);
6305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 return result;
6307}
6308
Tim Petersced69f82003-09-16 20:30:58 +00006309static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 PyUnicodeObject *substring,
6312 Py_ssize_t start,
6313 Py_ssize_t end,
6314 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 if (substring->length == 0)
6317 return 1;
6318
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006319 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 end -= substring->length;
6321 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323
6324 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006325 if (Py_UNICODE_MATCH(self, end, substring))
6326 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 } else {
6328 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 }
6331
6332 return 0;
6333}
6334
Martin v. Löwis18e16552006-02-15 17:27:45 +00006335Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 PyObject *substr,
6337 Py_ssize_t start,
6338 Py_ssize_t end,
6339 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006341 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006342
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 str = PyUnicode_FromObject(str);
6344 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 substr = PyUnicode_FromObject(substr);
6347 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 Py_DECREF(str);
6349 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 }
Tim Petersced69f82003-09-16 20:30:58 +00006351
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 (PyUnicodeObject *)substr,
6354 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355 Py_DECREF(str);
6356 Py_DECREF(substr);
6357 return result;
6358}
6359
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360/* Apply fixfct filter to the Unicode object self and return a
6361 reference to the modified object */
6362
Tim Petersced69f82003-09-16 20:30:58 +00006363static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366{
6367
6368 PyUnicodeObject *u;
6369
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006370 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006373
6374 Py_UNICODE_COPY(u->str, self->str, self->length);
6375
Tim Peters7a29bd52001-09-12 03:03:31 +00006376 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 /* fixfct should return TRUE if it modified the buffer. If
6378 FALSE, return a reference to the original buffer instead
6379 (to save space, not time) */
6380 Py_INCREF(self);
6381 Py_DECREF(u);
6382 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 }
6384 return (PyObject*) u;
6385}
6386
Tim Petersced69f82003-09-16 20:30:58 +00006387static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388int fixupper(PyUnicodeObject *self)
6389{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006390 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 Py_UNICODE *s = self->str;
6392 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006393
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006396
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 ch = Py_UNICODE_TOUPPER(*s);
6398 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 *s = ch;
6401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 s++;
6403 }
6404
6405 return status;
6406}
6407
Tim Petersced69f82003-09-16 20:30:58 +00006408static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409int fixlower(PyUnicodeObject *self)
6410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006411 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 Py_UNICODE *s = self->str;
6413 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006414
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006417
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 ch = Py_UNICODE_TOLOWER(*s);
6419 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 *s = ch;
6422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 s++;
6424 }
6425
6426 return status;
6427}
6428
Tim Petersced69f82003-09-16 20:30:58 +00006429static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430int fixswapcase(PyUnicodeObject *self)
6431{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006432 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 Py_UNICODE *s = self->str;
6434 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006435
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 while (len-- > 0) {
6437 if (Py_UNICODE_ISUPPER(*s)) {
6438 *s = Py_UNICODE_TOLOWER(*s);
6439 status = 1;
6440 } else if (Py_UNICODE_ISLOWER(*s)) {
6441 *s = Py_UNICODE_TOUPPER(*s);
6442 status = 1;
6443 }
6444 s++;
6445 }
6446
6447 return status;
6448}
6449
Tim Petersced69f82003-09-16 20:30:58 +00006450static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451int fixcapitalize(PyUnicodeObject *self)
6452{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006453 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006454 Py_UNICODE *s = self->str;
6455 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006456
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006457 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006459 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 *s = Py_UNICODE_TOUPPER(*s);
6461 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006463 s++;
6464 while (--len > 0) {
6465 if (Py_UNICODE_ISUPPER(*s)) {
6466 *s = Py_UNICODE_TOLOWER(*s);
6467 status = 1;
6468 }
6469 s++;
6470 }
6471 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472}
6473
6474static
6475int fixtitle(PyUnicodeObject *self)
6476{
6477 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6478 register Py_UNICODE *e;
6479 int previous_is_cased;
6480
6481 /* Shortcut for single character strings */
6482 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6484 if (*p != ch) {
6485 *p = ch;
6486 return 1;
6487 }
6488 else
6489 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 }
Tim Petersced69f82003-09-16 20:30:58 +00006491
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 e = p + PyUnicode_GET_SIZE(self);
6493 previous_is_cased = 0;
6494 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006496
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 if (previous_is_cased)
6498 *p = Py_UNICODE_TOLOWER(ch);
6499 else
6500 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006501
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 if (Py_UNICODE_ISLOWER(ch) ||
6503 Py_UNICODE_ISUPPER(ch) ||
6504 Py_UNICODE_ISTITLE(ch))
6505 previous_is_cased = 1;
6506 else
6507 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 }
6509 return 1;
6510}
6511
Tim Peters8ce9f162004-08-27 01:49:32 +00006512PyObject *
6513PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514{
Skip Montanaro6543b452004-09-16 03:28:13 +00006515 const Py_UNICODE blank = ' ';
6516 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006517 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006518 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006519 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6520 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006521 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6522 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006523 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006524 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
Tim Peters05eba1f2004-08-27 21:32:02 +00006526 fseq = PySequence_Fast(seq, "");
6527 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006528 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006529 }
6530
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006531 /* NOTE: the following code can't call back into Python code,
6532 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006533 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006534
Tim Peters05eba1f2004-08-27 21:32:02 +00006535 seqlen = PySequence_Fast_GET_SIZE(fseq);
6536 /* If empty sequence, return u"". */
6537 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006538 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6539 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006540 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006541 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006542 /* If singleton sequence with an exact Unicode, return that. */
6543 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 item = items[0];
6545 if (PyUnicode_CheckExact(item)) {
6546 Py_INCREF(item);
6547 res = (PyUnicodeObject *)item;
6548 goto Done;
6549 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006550 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006551 else {
6552 /* Set up sep and seplen */
6553 if (separator == NULL) {
6554 sep = &blank;
6555 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006556 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006557 else {
6558 if (!PyUnicode_Check(separator)) {
6559 PyErr_Format(PyExc_TypeError,
6560 "separator: expected str instance,"
6561 " %.80s found",
6562 Py_TYPE(separator)->tp_name);
6563 goto onError;
6564 }
6565 sep = PyUnicode_AS_UNICODE(separator);
6566 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006567 }
6568 }
6569
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006570 /* There are at least two things to join, or else we have a subclass
6571 * of str in the sequence.
6572 * Do a pre-pass to figure out the total amount of space we'll
6573 * need (sz), and see whether all argument are strings.
6574 */
6575 sz = 0;
6576 for (i = 0; i < seqlen; i++) {
6577 const Py_ssize_t old_sz = sz;
6578 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 if (!PyUnicode_Check(item)) {
6580 PyErr_Format(PyExc_TypeError,
6581 "sequence item %zd: expected str instance,"
6582 " %.80s found",
6583 i, Py_TYPE(item)->tp_name);
6584 goto onError;
6585 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006586 sz += PyUnicode_GET_SIZE(item);
6587 if (i != 0)
6588 sz += seplen;
6589 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6590 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006592 goto onError;
6593 }
6594 }
Tim Petersced69f82003-09-16 20:30:58 +00006595
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006596 res = _PyUnicode_New(sz);
6597 if (res == NULL)
6598 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006599
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006600 /* Catenate everything. */
6601 res_p = PyUnicode_AS_UNICODE(res);
6602 for (i = 0; i < seqlen; ++i) {
6603 Py_ssize_t itemlen;
6604 item = items[i];
6605 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 /* Copy item, and maybe the separator. */
6607 if (i) {
6608 Py_UNICODE_COPY(res_p, sep, seplen);
6609 res_p += seplen;
6610 }
6611 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6612 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006613 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006614
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006616 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 return (PyObject *)res;
6618
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006620 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006621 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 return NULL;
6623}
6624
Tim Petersced69f82003-09-16 20:30:58 +00006625static
6626PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 Py_ssize_t left,
6628 Py_ssize_t right,
6629 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630{
6631 PyUnicodeObject *u;
6632
6633 if (left < 0)
6634 left = 0;
6635 if (right < 0)
6636 right = 0;
6637
Tim Peters7a29bd52001-09-12 03:03:31 +00006638 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 Py_INCREF(self);
6640 return self;
6641 }
6642
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006643 if (left > PY_SSIZE_T_MAX - self->length ||
6644 right > PY_SSIZE_T_MAX - (left + self->length)) {
6645 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6646 return NULL;
6647 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 u = _PyUnicode_New(left + self->length + right);
6649 if (u) {
6650 if (left)
6651 Py_UNICODE_FILL(u->str, fill, left);
6652 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6653 if (right)
6654 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6655 }
6656
6657 return u;
6658}
6659
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006660PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663
6664 string = PyUnicode_FromObject(string);
6665 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006668 list = stringlib_splitlines(
6669 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6670 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672 Py_DECREF(string);
6673 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674}
6675
Tim Petersced69f82003-09-16 20:30:58 +00006676static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 PyUnicodeObject *substring,
6679 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006682 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006685 return stringlib_split_whitespace(
6686 (PyObject*) self, self->str, self->length, maxcount
6687 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006689 return stringlib_split(
6690 (PyObject*) self, self->str, self->length,
6691 substring->str, substring->length,
6692 maxcount
6693 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694}
6695
Tim Petersced69f82003-09-16 20:30:58 +00006696static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006697PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 PyUnicodeObject *substring,
6699 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006700{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006701 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006702 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006703
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006704 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006705 return stringlib_rsplit_whitespace(
6706 (PyObject*) self, self->str, self->length, maxcount
6707 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006708
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006709 return stringlib_rsplit(
6710 (PyObject*) self, self->str, self->length,
6711 substring->str, substring->length,
6712 maxcount
6713 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006714}
6715
6716static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 PyUnicodeObject *str1,
6719 PyUnicodeObject *str2,
6720 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721{
6722 PyUnicodeObject *u;
6723
6724 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006726 else if (maxcount == 0 || self->length == 0)
6727 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
Thomas Wouters477c8d52006-05-27 19:21:47 +00006729 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006730 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006731 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006732 if (str1->length == 0)
6733 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006734 if (str1->length == 1) {
6735 /* replace characters */
6736 Py_UNICODE u1, u2;
6737 if (!findchar(self->str, self->length, str1->str[0]))
6738 goto nothing;
6739 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6740 if (!u)
6741 return NULL;
6742 Py_UNICODE_COPY(u->str, self->str, self->length);
6743 u1 = str1->str[0];
6744 u2 = str2->str[0];
6745 for (i = 0; i < u->length; i++)
6746 if (u->str[i] == u1) {
6747 if (--maxcount < 0)
6748 break;
6749 u->str[i] = u2;
6750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006752 i = stringlib_find(
6753 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006755 if (i < 0)
6756 goto nothing;
6757 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6758 if (!u)
6759 return NULL;
6760 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006761
6762 /* change everything in-place, starting with this one */
6763 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6764 i += str1->length;
6765
6766 while ( --maxcount > 0) {
6767 i = stringlib_find(self->str+i, self->length-i,
6768 str1->str, str1->length,
6769 i);
6770 if (i == -1)
6771 break;
6772 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6773 i += str1->length;
6774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006777
6778 Py_ssize_t n, i, j, e;
6779 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 Py_UNICODE *p;
6781
6782 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006783 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6784 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006785 if (n == 0)
6786 goto nothing;
6787 /* new_size = self->length + n * (str2->length - str1->length)); */
6788 delta = (str2->length - str1->length);
6789 if (delta == 0) {
6790 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006792 product = n * (str2->length - str1->length);
6793 if ((product / (str2->length - str1->length)) != n) {
6794 PyErr_SetString(PyExc_OverflowError,
6795 "replace string is too long");
6796 return NULL;
6797 }
6798 new_size = self->length + product;
6799 if (new_size < 0) {
6800 PyErr_SetString(PyExc_OverflowError,
6801 "replace string is too long");
6802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 }
6804 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006805 u = _PyUnicode_New(new_size);
6806 if (!u)
6807 return NULL;
6808 i = 0;
6809 p = u->str;
6810 e = self->length - str1->length;
6811 if (str1->length > 0) {
6812 while (n-- > 0) {
6813 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006814 j = stringlib_find(self->str+i, self->length-i,
6815 str1->str, str1->length,
6816 i);
6817 if (j == -1)
6818 break;
6819 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006820 /* copy unchanged part [i:j] */
6821 Py_UNICODE_COPY(p, self->str+i, j-i);
6822 p += j - i;
6823 }
6824 /* copy substitution string */
6825 if (str2->length > 0) {
6826 Py_UNICODE_COPY(p, str2->str, str2->length);
6827 p += str2->length;
6828 }
6829 i = j + str1->length;
6830 }
6831 if (i < self->length)
6832 /* copy tail [i:] */
6833 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6834 } else {
6835 /* interleave */
6836 while (n > 0) {
6837 Py_UNICODE_COPY(p, str2->str, str2->length);
6838 p += str2->length;
6839 if (--n <= 0)
6840 break;
6841 *p++ = self->str[i++];
6842 }
6843 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006847
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006849 /* nothing to replace; return original string (when possible) */
6850 if (PyUnicode_CheckExact(self)) {
6851 Py_INCREF(self);
6852 return (PyObject *) self;
6853 }
6854 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855}
6856
6857/* --- Unicode Object Methods --------------------------------------------- */
6858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006859PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861\n\
6862Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006863characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864
6865static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006866unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 return fixup(self, fixtitle);
6869}
6870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006871PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873\n\
6874Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006875have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876
6877static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006878unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 return fixup(self, fixcapitalize);
6881}
6882
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the enclosing #if 0).  Splits self on
   whitespace, replaces every list item by its fixcapitalize fixup and
   joins the list with PyUnicode_Join(NULL, ...).  Returns NULL as soon as
   split() or fixup() fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* release the old item before the slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6920
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006921/* Argument converter. Coerces to a single unicode character */
6922
6923static int
6924convert_uc(PyObject *obj, void *addr)
6925{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006926 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6927 PyObject *uniobj;
6928 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006929
Benjamin Peterson14339b62009-01-31 16:36:08 +00006930 uniobj = PyUnicode_FromObject(obj);
6931 if (uniobj == NULL) {
6932 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006934 return 0;
6935 }
6936 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6937 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006939 Py_DECREF(uniobj);
6940 return 0;
6941 }
6942 unistr = PyUnicode_AS_UNICODE(uniobj);
6943 *fillcharloc = unistr[0];
6944 Py_DECREF(uniobj);
6945 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006946}
6947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006948PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006951Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006952done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953
6954static PyObject *
6955unicode_center(PyUnicodeObject *self, PyObject *args)
6956{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006957 Py_ssize_t marg, left;
6958 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006959 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
Thomas Woutersde017742006-02-16 19:34:37 +00006961 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 return NULL;
6963
Tim Peters7a29bd52001-09-12 03:03:31 +00006964 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 Py_INCREF(self);
6966 return (PyObject*) self;
6967 }
6968
6969 marg = width - self->length;
6970 left = marg / 2 + (marg & width & 1);
6971
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006972 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973}
6974
Marc-André Lemburge5034372000-08-08 08:04:29 +00006975#if 0
6976
6977/* This code should go into some future Unicode collation support
6978 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006979 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006980
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006981/* speedy UTF-16 code point order comparison */
6982/* gleaned from: */
6983/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6984
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006985static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006986{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006987 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006988 0, 0, 0, 0, 0, 0, 0, 0,
6989 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006990 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006991};
6992
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993static int
6994unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6995{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006996 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006997
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 Py_UNICODE *s1 = str1->str;
6999 Py_UNICODE *s2 = str2->str;
7000
7001 len1 = str1->length;
7002 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007003
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007005 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007006
7007 c1 = *s1++;
7008 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007009
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 if (c1 > (1<<11) * 26)
7011 c1 += utf16Fixup[c1>>11];
7012 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007013 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007014 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007015
7016 if (c1 != c2)
7017 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007018
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007019 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 }
7021
7022 return (len1 < len2) ? -1 : (len1 != len2);
7023}
7024
Marc-André Lemburge5034372000-08-08 08:04:29 +00007025#else
7026
7027static int
7028unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7029{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007030 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007031
7032 Py_UNICODE *s1 = str1->str;
7033 Py_UNICODE *s2 = str2->str;
7034
7035 len1 = str1->length;
7036 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007037
Marc-André Lemburge5034372000-08-08 08:04:29 +00007038 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007039 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007040
Fredrik Lundh45714e92001-06-26 16:39:36 +00007041 c1 = *s1++;
7042 c2 = *s2++;
7043
7044 if (c1 != c2)
7045 return (c1 < c2) ? -1 : 1;
7046
Marc-André Lemburge5034372000-08-08 08:04:29 +00007047 len1--; len2--;
7048 }
7049
7050 return (len1 < len2) ? -1 : (len1 != len2);
7051}
7052
7053#endif
7054
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007058 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7059 return unicode_compare((PyUnicodeObject *)left,
7060 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007061 PyErr_Format(PyExc_TypeError,
7062 "Can't compare %.100s and %.100s",
7063 left->ob_type->tp_name,
7064 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 return -1;
7066}
7067
Martin v. Löwis5b222132007-06-10 09:51:05 +00007068int
7069PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7070{
7071 int i;
7072 Py_UNICODE *id;
7073 assert(PyUnicode_Check(uni));
7074 id = PyUnicode_AS_UNICODE(uni);
7075 /* Compare Unicode string and source character set string */
7076 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 if (id[i] != str[i])
7078 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007079 /* This check keeps Python strings that end in '\0' from comparing equal
7080 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007081 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007083 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007085 return 0;
7086}
7087
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007088
Benjamin Peterson29060642009-01-31 22:14:21 +00007089#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007090 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007091
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007092PyObject *PyUnicode_RichCompare(PyObject *left,
7093 PyObject *right,
7094 int op)
7095{
7096 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007097
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007098 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7099 PyObject *v;
7100 if (((PyUnicodeObject *) left)->length !=
7101 ((PyUnicodeObject *) right)->length) {
7102 if (op == Py_EQ) {
7103 Py_INCREF(Py_False);
7104 return Py_False;
7105 }
7106 if (op == Py_NE) {
7107 Py_INCREF(Py_True);
7108 return Py_True;
7109 }
7110 }
7111 if (left == right)
7112 result = 0;
7113 else
7114 result = unicode_compare((PyUnicodeObject *)left,
7115 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007116
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007117 /* Convert the return value to a Boolean */
7118 switch (op) {
7119 case Py_EQ:
7120 v = TEST_COND(result == 0);
7121 break;
7122 case Py_NE:
7123 v = TEST_COND(result != 0);
7124 break;
7125 case Py_LE:
7126 v = TEST_COND(result <= 0);
7127 break;
7128 case Py_GE:
7129 v = TEST_COND(result >= 0);
7130 break;
7131 case Py_LT:
7132 v = TEST_COND(result == -1);
7133 break;
7134 case Py_GT:
7135 v = TEST_COND(result == 1);
7136 break;
7137 default:
7138 PyErr_BadArgument();
7139 return NULL;
7140 }
7141 Py_INCREF(v);
7142 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007143 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007144
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007145 Py_INCREF(Py_NotImplemented);
7146 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007147}
7148
Guido van Rossum403d68b2000-03-13 15:55:09 +00007149int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007151{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007152 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007153 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007154
7155 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007156 sub = PyUnicode_FromObject(element);
7157 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 PyErr_Format(PyExc_TypeError,
7159 "'in <string>' requires string as left operand, not %s",
7160 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007161 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007162 }
7163
Thomas Wouters477c8d52006-05-27 19:21:47 +00007164 str = PyUnicode_FromObject(container);
7165 if (!str) {
7166 Py_DECREF(sub);
7167 return -1;
7168 }
7169
7170 result = stringlib_contains_obj(str, sub);
7171
7172 Py_DECREF(str);
7173 Py_DECREF(sub);
7174
Guido van Rossum403d68b2000-03-13 15:55:09 +00007175 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007176}
7177
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178/* Concat to string or Unicode object giving a new Unicode object. */
7179
7180PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182{
7183 PyUnicodeObject *u = NULL, *v = NULL, *w;
7184
7185 /* Coerce the two arguments */
7186 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7187 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7190 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192
7193 /* Shortcuts */
7194 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 Py_DECREF(v);
7196 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 }
7198 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 Py_DECREF(u);
7200 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 }
7202
7203 /* Concat the two Unicode strings */
7204 w = _PyUnicode_New(u->length + v->length);
7205 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 Py_UNICODE_COPY(w->str, u->str, u->length);
7208 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7209
7210 Py_DECREF(u);
7211 Py_DECREF(v);
7212 return (PyObject *)w;
7213
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 Py_XDECREF(u);
7216 Py_XDECREF(v);
7217 return NULL;
7218}
7219
Walter Dörwald1ab83302007-05-18 17:15:44 +00007220void
7221PyUnicode_Append(PyObject **pleft, PyObject *right)
7222{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007223 PyObject *new;
7224 if (*pleft == NULL)
7225 return;
7226 if (right == NULL || !PyUnicode_Check(*pleft)) {
7227 Py_DECREF(*pleft);
7228 *pleft = NULL;
7229 return;
7230 }
7231 new = PyUnicode_Concat(*pleft, right);
7232 Py_DECREF(*pleft);
7233 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007234}
7235
7236void
7237PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7238{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007239 PyUnicode_Append(pleft, right);
7240 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007241}
7242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007243PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007246Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007247string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007248interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249
7250static PyObject *
7251unicode_count(PyUnicodeObject *self, PyObject *args)
7252{
7253 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007255 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 PyObject *result;
7257
Guido van Rossumb8872e62000-05-09 14:14:27 +00007258 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 return NULL;
7261
7262 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007263 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007265 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007266
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007267 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007268 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007269 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007270 substring->str, substring->length,
7271 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007272 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273
7274 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007275
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 return result;
7277}
7278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007279PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007280 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007282Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007283to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007284handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007285a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7286'xmlcharrefreplace' as well as any other name registered with\n\
7287codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007288
7289static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007290unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007292 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 char *encoding = NULL;
7294 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007295 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007296
Benjamin Peterson308d6372009-09-18 21:42:35 +00007297 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7298 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007300 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007301 if (v == NULL)
7302 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007303 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007304 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007305 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007306 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007307 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007308 Py_DECREF(v);
7309 return NULL;
7310 }
7311 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007312
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007314 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007315}
7316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007317PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319\n\
7320Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007321If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322
7323static PyObject*
7324unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7325{
7326 Py_UNICODE *e;
7327 Py_UNICODE *p;
7328 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007329 Py_UNICODE *qe;
7330 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331 PyUnicodeObject *u;
7332 int tabsize = 8;
7333
7334 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007336
Thomas Wouters7e474022000-07-16 12:04:32 +00007337 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007338 i = 0; /* chars up to and including most recent \n or \r */
7339 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7340 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 for (p = self->str; p < e; p++)
7342 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 if (tabsize > 0) {
7344 incr = tabsize - (j % tabsize); /* cannot overflow */
7345 if (j > PY_SSIZE_T_MAX - incr)
7346 goto overflow1;
7347 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007348 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 if (j > PY_SSIZE_T_MAX - 1)
7352 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 j++;
7354 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 if (i > PY_SSIZE_T_MAX - j)
7356 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007358 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359 }
7360 }
7361
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007362 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007364
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 /* Second pass: create output string and fill it */
7366 u = _PyUnicode_New(i + j);
7367 if (!u)
7368 return NULL;
7369
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007370 j = 0; /* same as in first pass */
7371 q = u->str; /* next output char */
7372 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373
7374 for (p = self->str; p < e; p++)
7375 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 if (tabsize > 0) {
7377 i = tabsize - (j % tabsize);
7378 j += i;
7379 while (i--) {
7380 if (q >= qe)
7381 goto overflow2;
7382 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007383 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007385 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 else {
7387 if (q >= qe)
7388 goto overflow2;
7389 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007390 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391 if (*p == '\n' || *p == '\r')
7392 j = 0;
7393 }
7394
7395 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007396
7397 overflow2:
7398 Py_DECREF(u);
7399 overflow1:
7400 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402}
7403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007404PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406\n\
7407Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007408such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409arguments start and end are interpreted as in slice notation.\n\
7410\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007411Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
7413static PyObject *
7414unicode_find(PyUnicodeObject *self, PyObject *args)
7415{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007416 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007417 Py_ssize_t start;
7418 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007419 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420
Christian Heimes9cd17752007-11-18 19:35:23 +00007421 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423
Thomas Wouters477c8d52006-05-27 19:21:47 +00007424 result = stringlib_find_slice(
7425 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7426 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7427 start, end
7428 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
7430 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007431
Christian Heimes217cfd12007-12-02 14:31:20 +00007432 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433}
7434
7435static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007436unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437{
7438 if (index < 0 || index >= self->length) {
7439 PyErr_SetString(PyExc_IndexError, "string index out of range");
7440 return NULL;
7441 }
7442
7443 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7444}
7445
Guido van Rossumc2504932007-09-18 19:42:40 +00007446/* Believe it or not, this produces the same value for ASCII strings
7447 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007448static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007449unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450{
Guido van Rossumc2504932007-09-18 19:42:40 +00007451 Py_ssize_t len;
7452 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007453 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007454
7455 if (self->hash != -1)
7456 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007457 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007458 p = self->str;
7459 x = *p << 7;
7460 while (--len >= 0)
7461 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007462 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007463 if (x == -1)
7464 x = -2;
7465 self->hash = x;
7466 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467}
7468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007469PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007472Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473
7474static PyObject *
7475unicode_index(PyUnicodeObject *self, PyObject *args)
7476{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007477 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007478 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007479 Py_ssize_t start;
7480 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481
Christian Heimes9cd17752007-11-18 19:35:23 +00007482 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484
Thomas Wouters477c8d52006-05-27 19:21:47 +00007485 result = stringlib_find_slice(
7486 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7487 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7488 start, end
7489 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490
7491 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007492
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 if (result < 0) {
7494 PyErr_SetString(PyExc_ValueError, "substring not found");
7495 return NULL;
7496 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007497
Christian Heimes217cfd12007-12-02 14:31:20 +00007498 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499}
7500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007501PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007504Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007505at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506
7507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007508unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509{
7510 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7511 register const Py_UNICODE *e;
7512 int cased;
7513
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 /* Shortcut for single character strings */
7515 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007518 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007519 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007521
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 e = p + PyUnicode_GET_SIZE(self);
7523 cased = 0;
7524 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007526
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7528 return PyBool_FromLong(0);
7529 else if (!cased && Py_UNICODE_ISLOWER(ch))
7530 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007532 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533}
7534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007535PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007538Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007539at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540
7541static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007542unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543{
7544 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7545 register const Py_UNICODE *e;
7546 int cased;
7547
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 /* Shortcut for single character strings */
7549 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007552 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007553 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007555
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 e = p + PyUnicode_GET_SIZE(self);
7557 cased = 0;
7558 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007560
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7562 return PyBool_FromLong(0);
7563 else if (!cased && Py_UNICODE_ISUPPER(ch))
7564 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007566 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567}
7568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007569PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007572Return True if S is a titlecased string and there is at least one\n\
7573character in S, i.e. upper- and titlecase characters may only\n\
7574follow uncased characters and lowercase characters only cased ones.\n\
7575Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
7577static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007578unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579{
7580 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7581 register const Py_UNICODE *e;
7582 int cased, previous_is_cased;
7583
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 /* Shortcut for single character strings */
7585 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7587 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007589 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007590 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007592
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 e = p + PyUnicode_GET_SIZE(self);
7594 cased = 0;
7595 previous_is_cased = 0;
7596 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007598
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7600 if (previous_is_cased)
7601 return PyBool_FromLong(0);
7602 previous_is_cased = 1;
7603 cased = 1;
7604 }
7605 else if (Py_UNICODE_ISLOWER(ch)) {
7606 if (!previous_is_cased)
7607 return PyBool_FromLong(0);
7608 previous_is_cased = 1;
7609 cased = 1;
7610 }
7611 else
7612 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007614 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615}
7616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007617PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007620Return True if all characters in S are whitespace\n\
7621and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622
7623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007624unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625{
7626 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7627 register const Py_UNICODE *e;
7628
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 /* Shortcut for single character strings */
7630 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 Py_UNICODE_ISSPACE(*p))
7632 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007634 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007635 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007637
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 e = p + PyUnicode_GET_SIZE(self);
7639 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 if (!Py_UNICODE_ISSPACE(*p))
7641 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007643 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644}
7645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007646PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007648\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007649Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007650and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007651
7652static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007653unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007654{
7655 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7656 register const Py_UNICODE *e;
7657
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007658 /* Shortcut for single character strings */
7659 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 Py_UNICODE_ISALPHA(*p))
7661 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007662
7663 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007664 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007666
7667 e = p + PyUnicode_GET_SIZE(self);
7668 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 if (!Py_UNICODE_ISALPHA(*p))
7670 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007671 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007672 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007673}
7674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007675PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007677\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007678Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007679and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007680
7681static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007682unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007683{
7684 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7685 register const Py_UNICODE *e;
7686
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007687 /* Shortcut for single character strings */
7688 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 Py_UNICODE_ISALNUM(*p))
7690 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007691
7692 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007693 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007695
7696 e = p + PyUnicode_GET_SIZE(self);
7697 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 if (!Py_UNICODE_ISALNUM(*p))
7699 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007700 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007701 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007702}
7703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007704PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007707Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007708False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709
7710static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007711unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712{
7713 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7714 register const Py_UNICODE *e;
7715
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 /* Shortcut for single character strings */
7717 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 Py_UNICODE_ISDECIMAL(*p))
7719 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007721 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007722 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007724
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 e = p + PyUnicode_GET_SIZE(self);
7726 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 if (!Py_UNICODE_ISDECIMAL(*p))
7728 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007730 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731}
7732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007733PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007736Return True if all characters in S are digits\n\
7737and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738
7739static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007740unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741{
7742 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7743 register const Py_UNICODE *e;
7744
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 /* Shortcut for single character strings */
7746 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 Py_UNICODE_ISDIGIT(*p))
7748 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007750 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007751 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007753
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 e = p + PyUnicode_GET_SIZE(self);
7755 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 if (!Py_UNICODE_ISDIGIT(*p))
7757 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007759 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760}
7761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007762PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007765Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007766False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767
7768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007769unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770{
7771 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7772 register const Py_UNICODE *e;
7773
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774 /* Shortcut for single character strings */
7775 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 Py_UNICODE_ISNUMERIC(*p))
7777 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007779 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007780 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007782
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 e = p + PyUnicode_GET_SIZE(self);
7784 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 if (!Py_UNICODE_ISNUMERIC(*p))
7786 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007788 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789}
7790
Martin v. Löwis47383402007-08-15 07:32:56 +00007791int
7792PyUnicode_IsIdentifier(PyObject *self)
7793{
7794 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7795 register const Py_UNICODE *e;
7796
7797 /* Special case for empty strings */
7798 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007800
7801 /* PEP 3131 says that the first character must be in
7802 XID_Start and subsequent characters in XID_Continue,
7803 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007804 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007805 letters, digits, underscore). However, given the current
7806 definition of XID_Start and XID_Continue, it is sufficient
7807 to check just for these, except that _ must be allowed
7808 as starting an identifier. */
7809 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7810 return 0;
7811
7812 e = p + PyUnicode_GET_SIZE(self);
7813 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 if (!_PyUnicode_IsXidContinue(*p))
7815 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007816 }
7817 return 1;
7818}
7819
7820PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007822\n\
7823Return True if S is a valid identifier according\n\
7824to the language definition.");
7825
7826static PyObject*
7827unicode_isidentifier(PyObject *self)
7828{
7829 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7830}
7831
Georg Brandl559e5d72008-06-11 18:37:52 +00007832PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007834\n\
7835Return True if all characters in S are considered\n\
7836printable in repr() or S is empty, False otherwise.");
7837
7838static PyObject*
7839unicode_isprintable(PyObject *self)
7840{
7841 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7842 register const Py_UNICODE *e;
7843
7844 /* Shortcut for single character strings */
7845 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7846 Py_RETURN_TRUE;
7847 }
7848
7849 e = p + PyUnicode_GET_SIZE(self);
7850 for (; p < e; p++) {
7851 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7852 Py_RETURN_FALSE;
7853 }
7854 }
7855 Py_RETURN_TRUE;
7856}
7857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007858PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007859 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860\n\
7861Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007862iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863
7864static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007865unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007867 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868}
7869
Martin v. Löwis18e16552006-02-15 17:27:45 +00007870static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871unicode_length(PyUnicodeObject *self)
7872{
7873 return self->length;
7874}
7875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007876PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007879Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007880done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881
7882static PyObject *
7883unicode_ljust(PyUnicodeObject *self, PyObject *args)
7884{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007885 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007886 Py_UNICODE fillchar = ' ';
7887
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007888 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889 return NULL;
7890
Tim Peters7a29bd52001-09-12 03:03:31 +00007891 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892 Py_INCREF(self);
7893 return (PyObject*) self;
7894 }
7895
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007896 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897}
7898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007899PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007902Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903
7904static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007905unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907 return fixup(self, fixlower);
7908}
7909
/* Strip types: which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip types above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string with its three
   leading characters ("|O:") skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
7918
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007919/* externally visible for str.strip(unicode) */
7920PyObject *
7921_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7922{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007923 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7924 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7925 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7926 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7927 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007928
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007930
Benjamin Peterson14339b62009-01-31 16:36:08 +00007931 i = 0;
7932 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7934 i++;
7935 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007936 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007937
Benjamin Peterson14339b62009-01-31 16:36:08 +00007938 j = len;
7939 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 do {
7941 j--;
7942 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7943 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007944 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007945
Benjamin Peterson14339b62009-01-31 16:36:08 +00007946 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 Py_INCREF(self);
7948 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007949 }
7950 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007952}
7953
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954
7955static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007956do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007958 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7959 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007960
Benjamin Peterson14339b62009-01-31 16:36:08 +00007961 i = 0;
7962 if (striptype != RIGHTSTRIP) {
7963 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7964 i++;
7965 }
7966 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007967
Benjamin Peterson14339b62009-01-31 16:36:08 +00007968 j = len;
7969 if (striptype != LEFTSTRIP) {
7970 do {
7971 j--;
7972 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7973 j++;
7974 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007975
Benjamin Peterson14339b62009-01-31 16:36:08 +00007976 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7977 Py_INCREF(self);
7978 return (PyObject*)self;
7979 }
7980 else
7981 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982}
7983
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007984
7985static PyObject *
7986do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7987{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007988 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007989
Benjamin Peterson14339b62009-01-31 16:36:08 +00007990 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7991 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007992
Benjamin Peterson14339b62009-01-31 16:36:08 +00007993 if (sep != NULL && sep != Py_None) {
7994 if (PyUnicode_Check(sep))
7995 return _PyUnicode_XStrip(self, striptype, sep);
7996 else {
7997 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 "%s arg must be None or str",
7999 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008000 return NULL;
8001 }
8002 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008003
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008005}
8006
8007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008008PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008010\n\
8011Return a copy of the string S with leading and trailing\n\
8012whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008013If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008014
8015static PyObject *
8016unicode_strip(PyUnicodeObject *self, PyObject *args)
8017{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008018 if (PyTuple_GET_SIZE(args) == 0)
8019 return do_strip(self, BOTHSTRIP); /* Common case */
8020 else
8021 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008022}
8023
8024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008025PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008027\n\
8028Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008029If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008030
8031static PyObject *
8032unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8033{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008034 if (PyTuple_GET_SIZE(args) == 0)
8035 return do_strip(self, LEFTSTRIP); /* Common case */
8036 else
8037 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008038}
8039
8040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008041PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008043\n\
8044Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008045If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008046
8047static PyObject *
8048unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8049{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008050 if (PyTuple_GET_SIZE(args) == 0)
8051 return do_strip(self, RIGHTSTRIP); /* Common case */
8052 else
8053 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008054}
8055
8056
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008058unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059{
8060 PyUnicodeObject *u;
8061 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008062 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008063 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064
Georg Brandl222de0f2009-04-12 12:01:50 +00008065 if (len < 1) {
8066 Py_INCREF(unicode_empty);
8067 return (PyObject *)unicode_empty;
8068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069
Tim Peters7a29bd52001-09-12 03:03:31 +00008070 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 /* no repeat, return original string */
8072 Py_INCREF(str);
8073 return (PyObject*) str;
8074 }
Tim Peters8f422462000-09-09 06:13:41 +00008075
8076 /* ensure # of chars needed doesn't overflow int and # of bytes
8077 * needed doesn't overflow size_t
8078 */
8079 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008080 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008081 PyErr_SetString(PyExc_OverflowError,
8082 "repeated string is too long");
8083 return NULL;
8084 }
8085 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8086 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8087 PyErr_SetString(PyExc_OverflowError,
8088 "repeated string is too long");
8089 return NULL;
8090 }
8091 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092 if (!u)
8093 return NULL;
8094
8095 p = u->str;
8096
Georg Brandl222de0f2009-04-12 12:01:50 +00008097 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008098 Py_UNICODE_FILL(p, str->str[0], len);
8099 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008100 Py_ssize_t done = str->length; /* number of characters copied this far */
8101 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008103 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008104 Py_UNICODE_COPY(p+done, p, n);
8105 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107 }
8108
8109 return (PyObject*) u;
8110}
8111
8112PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 PyObject *subobj,
8114 PyObject *replobj,
8115 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116{
8117 PyObject *self;
8118 PyObject *str1;
8119 PyObject *str2;
8120 PyObject *result;
8121
8122 self = PyUnicode_FromObject(obj);
8123 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125 str1 = PyUnicode_FromObject(subobj);
8126 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 Py_DECREF(self);
8128 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 }
8130 str2 = PyUnicode_FromObject(replobj);
8131 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 Py_DECREF(self);
8133 Py_DECREF(str1);
8134 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 }
Tim Petersced69f82003-09-16 20:30:58 +00008136 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 (PyUnicodeObject *)str1,
8138 (PyUnicodeObject *)str2,
8139 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 Py_DECREF(self);
8141 Py_DECREF(str1);
8142 Py_DECREF(str2);
8143 return result;
8144}
8145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008146PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008147 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148\n\
8149Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008150old replaced by new. If the optional argument count is\n\
8151given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152
8153static PyObject*
8154unicode_replace(PyUnicodeObject *self, PyObject *args)
8155{
8156 PyUnicodeObject *str1;
8157 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008158 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 PyObject *result;
8160
Martin v. Löwis18e16552006-02-15 17:27:45 +00008161 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 return NULL;
8163 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8164 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008167 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 Py_DECREF(str1);
8169 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171
8172 result = replace(self, str1, str2, maxcount);
8173
8174 Py_DECREF(str1);
8175 Py_DECREF(str2);
8176 return result;
8177}
8178
8179static
8180PyObject *unicode_repr(PyObject *unicode)
8181{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008182 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008183 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008184 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8185 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8186
8187 /* XXX(nnorwitz): rather than over-allocating, it would be
8188 better to choose a different scheme. Perhaps scan the
8189 first N-chars of the string and allocate based on that size.
8190 */
8191 /* Initial allocation is based on the longest-possible unichr
8192 escape.
8193
8194 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8195 unichr, so in this case it's the longest unichr escape. In
8196 narrow (UTF-16) builds this is five chars per source unichr
8197 since there are two unichrs in the surrogate pair, so in narrow
8198 (UTF-16) builds it's not the longest unichr escape.
8199
8200 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8201 so in the narrow (UTF-16) build case it's the longest unichr
8202 escape.
8203 */
8204
Walter Dörwald1ab83302007-05-18 17:15:44 +00008205 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008207#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008209#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008211#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008213 if (repr == NULL)
8214 return NULL;
8215
Walter Dörwald1ab83302007-05-18 17:15:44 +00008216 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008217
8218 /* Add quote */
8219 *p++ = (findchar(s, size, '\'') &&
8220 !findchar(s, size, '"')) ? '"' : '\'';
8221 while (size-- > 0) {
8222 Py_UNICODE ch = *s++;
8223
8224 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008225 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008226 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008227 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008228 continue;
8229 }
8230
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008232 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008233 *p++ = '\\';
8234 *p++ = 't';
8235 }
8236 else if (ch == '\n') {
8237 *p++ = '\\';
8238 *p++ = 'n';
8239 }
8240 else if (ch == '\r') {
8241 *p++ = '\\';
8242 *p++ = 'r';
8243 }
8244
8245 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008246 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008247 *p++ = '\\';
8248 *p++ = 'x';
8249 *p++ = hexdigits[(ch >> 4) & 0x000F];
8250 *p++ = hexdigits[ch & 0x000F];
8251 }
8252
Georg Brandl559e5d72008-06-11 18:37:52 +00008253 /* Copy ASCII characters as-is */
8254 else if (ch < 0x7F) {
8255 *p++ = ch;
8256 }
8257
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008259 else {
8260 Py_UCS4 ucs = ch;
8261
8262#ifndef Py_UNICODE_WIDE
8263 Py_UNICODE ch2 = 0;
8264 /* Get code point from surrogate pair */
8265 if (size > 0) {
8266 ch2 = *s;
8267 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008269 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008271 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008272 size--;
8273 }
8274 }
8275#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008276 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008277 (categories Z* and C* except ASCII space)
8278 */
8279 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8280 /* Map 8-bit characters to '\xhh' */
8281 if (ucs <= 0xff) {
8282 *p++ = '\\';
8283 *p++ = 'x';
8284 *p++ = hexdigits[(ch >> 4) & 0x000F];
8285 *p++ = hexdigits[ch & 0x000F];
8286 }
8287 /* Map 21-bit characters to '\U00xxxxxx' */
8288 else if (ucs >= 0x10000) {
8289 *p++ = '\\';
8290 *p++ = 'U';
8291 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8292 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8293 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8294 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8295 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8296 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8297 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8298 *p++ = hexdigits[ucs & 0x0000000F];
8299 }
8300 /* Map 16-bit characters to '\uxxxx' */
8301 else {
8302 *p++ = '\\';
8303 *p++ = 'u';
8304 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8305 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8306 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8307 *p++ = hexdigits[ucs & 0x000F];
8308 }
8309 }
8310 /* Copy characters as-is */
8311 else {
8312 *p++ = ch;
8313#ifndef Py_UNICODE_WIDE
8314 if (ucs >= 0x10000)
8315 *p++ = ch2;
8316#endif
8317 }
8318 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008319 }
8320 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008321 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008322
8323 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008324 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008325 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326}
8327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008328PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330\n\
8331Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008332such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333arguments start and end are interpreted as in slice notation.\n\
8334\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008335Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336
8337static PyObject *
8338unicode_rfind(PyUnicodeObject *self, PyObject *args)
8339{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008340 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008341 Py_ssize_t start;
8342 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008343 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344
Christian Heimes9cd17752007-11-18 19:35:23 +00008345 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347
Thomas Wouters477c8d52006-05-27 19:21:47 +00008348 result = stringlib_rfind_slice(
8349 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8350 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8351 start, end
8352 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353
8354 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008355
Christian Heimes217cfd12007-12-02 14:31:20 +00008356 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008357}
8358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008359PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008362Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363
8364static PyObject *
8365unicode_rindex(PyUnicodeObject *self, PyObject *args)
8366{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008367 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008368 Py_ssize_t start;
8369 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008370 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371
Christian Heimes9cd17752007-11-18 19:35:23 +00008372 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008373 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374
Thomas Wouters477c8d52006-05-27 19:21:47 +00008375 result = stringlib_rfind_slice(
8376 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8377 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8378 start, end
8379 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380
8381 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008382
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 if (result < 0) {
8384 PyErr_SetString(PyExc_ValueError, "substring not found");
8385 return NULL;
8386 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008387 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388}
8389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008390PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008393Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008394done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395
8396static PyObject *
8397unicode_rjust(PyUnicodeObject *self, PyObject *args)
8398{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008399 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008400 Py_UNICODE fillchar = ' ';
8401
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008402 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 return NULL;
8404
Tim Peters7a29bd52001-09-12 03:03:31 +00008405 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406 Py_INCREF(self);
8407 return (PyObject*) self;
8408 }
8409
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008410 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411}
8412
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 PyObject *sep,
8415 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416{
8417 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008418
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 s = PyUnicode_FromObject(s);
8420 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 if (sep != NULL) {
8423 sep = PyUnicode_FromObject(sep);
8424 if (sep == NULL) {
8425 Py_DECREF(s);
8426 return NULL;
8427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 }
8429
8430 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8431
8432 Py_DECREF(s);
8433 Py_XDECREF(sep);
8434 return result;
8435}
8436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008437PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439\n\
8440Return a list of the words in S, using sep as the\n\
8441delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008442splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008443whitespace string is a separator and empty strings are\n\
8444removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445
8446static PyObject*
8447unicode_split(PyUnicodeObject *self, PyObject *args)
8448{
8449 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451
Martin v. Löwis18e16552006-02-15 17:27:45 +00008452 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 return NULL;
8454
8455 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461}
8462
Thomas Wouters477c8d52006-05-27 19:21:47 +00008463PyObject *
8464PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8465{
8466 PyObject* str_obj;
8467 PyObject* sep_obj;
8468 PyObject* out;
8469
8470 str_obj = PyUnicode_FromObject(str_in);
8471 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008473 sep_obj = PyUnicode_FromObject(sep_in);
8474 if (!sep_obj) {
8475 Py_DECREF(str_obj);
8476 return NULL;
8477 }
8478
8479 out = stringlib_partition(
8480 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8481 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8482 );
8483
8484 Py_DECREF(sep_obj);
8485 Py_DECREF(str_obj);
8486
8487 return out;
8488}
8489
8490
8491PyObject *
8492PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8493{
8494 PyObject* str_obj;
8495 PyObject* sep_obj;
8496 PyObject* out;
8497
8498 str_obj = PyUnicode_FromObject(str_in);
8499 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008501 sep_obj = PyUnicode_FromObject(sep_in);
8502 if (!sep_obj) {
8503 Py_DECREF(str_obj);
8504 return NULL;
8505 }
8506
8507 out = stringlib_rpartition(
8508 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8509 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8510 );
8511
8512 Py_DECREF(sep_obj);
8513 Py_DECREF(str_obj);
8514
8515 return out;
8516}
8517
8518PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008520\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008521Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008522the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008523found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008524
8525static PyObject*
8526unicode_partition(PyUnicodeObject *self, PyObject *separator)
8527{
8528 return PyUnicode_Partition((PyObject *)self, separator);
8529}
8530
8531PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008532 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008533\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008534Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008535the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008536separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008537
8538static PyObject*
8539unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8540{
8541 return PyUnicode_RPartition((PyObject *)self, separator);
8542}
8543
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008544PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 PyObject *sep,
8546 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008547{
8548 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008549
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008550 s = PyUnicode_FromObject(s);
8551 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008552 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 if (sep != NULL) {
8554 sep = PyUnicode_FromObject(sep);
8555 if (sep == NULL) {
8556 Py_DECREF(s);
8557 return NULL;
8558 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008559 }
8560
8561 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8562
8563 Py_DECREF(s);
8564 Py_XDECREF(sep);
8565 return result;
8566}
8567
8568PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008570\n\
8571Return a list of the words in S, using sep as the\n\
8572delimiter string, starting at the end of the string and\n\
8573working to the front. If maxsplit is given, at most maxsplit\n\
8574splits are done. If sep is not specified, any whitespace string\n\
8575is a separator.");
8576
8577static PyObject*
8578unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8579{
8580 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008581 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008582
Martin v. Löwis18e16552006-02-15 17:27:45 +00008583 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008584 return NULL;
8585
8586 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008588 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008590 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008592}
8593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008594PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596\n\
8597Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008598Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008599is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600
8601static PyObject*
8602unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8603{
Guido van Rossum86662912000-04-11 15:38:46 +00008604 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605
Guido van Rossum86662912000-04-11 15:38:46 +00008606 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607 return NULL;
8608
Guido van Rossum86662912000-04-11 15:38:46 +00008609 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610}
8611
8612static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008613PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614{
Walter Dörwald346737f2007-05-31 10:44:43 +00008615 if (PyUnicode_CheckExact(self)) {
8616 Py_INCREF(self);
8617 return self;
8618 } else
8619 /* Subtype -- return genuine unicode string with the same value. */
8620 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8621 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622}
8623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008624PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626\n\
8627Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008628and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629
8630static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008631unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 return fixup(self, fixswapcase);
8634}
8635
Georg Brandlceee0772007-11-27 23:48:05 +00008636PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008638\n\
8639Return a translation table usable for str.translate().\n\
8640If there is only one argument, it must be a dictionary mapping Unicode\n\
8641ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008642Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008643If there are two arguments, they must be strings of equal length, and\n\
8644in the resulting dictionary, each character in x will be mapped to the\n\
8645character at the same position in y. If there is a third argument, it\n\
8646must be a string, whose characters will be mapped to None in the result.");
8647
8648static PyObject*
8649unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8650{
8651 PyObject *x, *y = NULL, *z = NULL;
8652 PyObject *new = NULL, *key, *value;
8653 Py_ssize_t i = 0;
8654 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008655
Georg Brandlceee0772007-11-27 23:48:05 +00008656 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8657 return NULL;
8658 new = PyDict_New();
8659 if (!new)
8660 return NULL;
8661 if (y != NULL) {
8662 /* x must be a string too, of equal length */
8663 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8664 if (!PyUnicode_Check(x)) {
8665 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8666 "be a string if there is a second argument");
8667 goto err;
8668 }
8669 if (PyUnicode_GET_SIZE(x) != ylen) {
8670 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8671 "arguments must have equal length");
8672 goto err;
8673 }
8674 /* create entries for translating chars in x to those in y */
8675 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008676 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8677 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008678 if (!key || !value)
8679 goto err;
8680 res = PyDict_SetItem(new, key, value);
8681 Py_DECREF(key);
8682 Py_DECREF(value);
8683 if (res < 0)
8684 goto err;
8685 }
8686 /* create entries for deleting chars in z */
8687 if (z != NULL) {
8688 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008689 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008690 if (!key)
8691 goto err;
8692 res = PyDict_SetItem(new, key, Py_None);
8693 Py_DECREF(key);
8694 if (res < 0)
8695 goto err;
8696 }
8697 }
8698 } else {
8699 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008700 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008701 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8702 "to maketrans it must be a dict");
8703 goto err;
8704 }
8705 /* copy entries into the new dict, converting string keys to int keys */
8706 while (PyDict_Next(x, &i, &key, &value)) {
8707 if (PyUnicode_Check(key)) {
8708 /* convert string keys to integer keys */
8709 PyObject *newkey;
8710 if (PyUnicode_GET_SIZE(key) != 1) {
8711 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8712 "table must be of length 1");
8713 goto err;
8714 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008715 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008716 if (!newkey)
8717 goto err;
8718 res = PyDict_SetItem(new, newkey, value);
8719 Py_DECREF(newkey);
8720 if (res < 0)
8721 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008722 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008723 /* just keep integer keys */
8724 if (PyDict_SetItem(new, key, value) < 0)
8725 goto err;
8726 } else {
8727 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8728 "be strings or integers");
8729 goto err;
8730 }
8731 }
8732 }
8733 return new;
8734 err:
8735 Py_DECREF(new);
8736 return NULL;
8737}
8738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008739PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741\n\
8742Return a copy of the string S, where all characters have been mapped\n\
8743through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008744Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008745Unmapped characters are left untouched. Characters mapped to None\n\
8746are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747
8748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008749unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750{
Georg Brandlceee0772007-11-27 23:48:05 +00008751 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752}
8753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008754PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008757Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758
8759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008760unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762 return fixup(self, fixupper);
8763}
8764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008765PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008768Pad a numeric string S with zeros on the left, to fill a field\n\
8769of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770
8771static PyObject *
8772unicode_zfill(PyUnicodeObject *self, PyObject *args)
8773{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008774 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 PyUnicodeObject *u;
8776
Martin v. Löwis18e16552006-02-15 17:27:45 +00008777 Py_ssize_t width;
8778 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 return NULL;
8780
8781 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008782 if (PyUnicode_CheckExact(self)) {
8783 Py_INCREF(self);
8784 return (PyObject*) self;
8785 }
8786 else
8787 return PyUnicode_FromUnicode(
8788 PyUnicode_AS_UNICODE(self),
8789 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 }
8792
8793 fill = width - self->length;
8794
8795 u = pad(self, fill, 0, '0');
8796
Walter Dörwald068325e2002-04-15 13:36:47 +00008797 if (u == NULL)
8798 return NULL;
8799
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 if (u->str[fill] == '+' || u->str[fill] == '-') {
8801 /* move sign to beginning of string */
8802 u->str[0] = u->str[fill];
8803 u->str[fill] = '0';
8804 }
8805
8806 return (PyObject*) u;
8807}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808
#if 0
/* Compiled out.  Returns the current value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008817PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008820Return True if S starts with the specified prefix, False otherwise.\n\
8821With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008822With optional end, stop comparing S at that position.\n\
8823prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824
8825static PyObject *
8826unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008829 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008831 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008832 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008833 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008835 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8837 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008838 if (PyTuple_Check(subobj)) {
8839 Py_ssize_t i;
8840 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8841 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008843 if (substring == NULL)
8844 return NULL;
8845 result = tailmatch(self, substring, start, end, -1);
8846 Py_DECREF(substring);
8847 if (result) {
8848 Py_RETURN_TRUE;
8849 }
8850 }
8851 /* nothing matched */
8852 Py_RETURN_FALSE;
8853 }
8854 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008857 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008859 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860}
8861
8862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008863PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008866Return True if S ends with the specified suffix, False otherwise.\n\
8867With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008868With optional end, stop comparing S at that position.\n\
8869suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870
8871static PyObject *
8872unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008875 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008877 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008878 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008879 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008881 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8883 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008884 if (PyTuple_Check(subobj)) {
8885 Py_ssize_t i;
8886 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8887 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008889 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008891 result = tailmatch(self, substring, start, end, +1);
8892 Py_DECREF(substring);
8893 if (result) {
8894 Py_RETURN_TRUE;
8895 }
8896 }
8897 Py_RETURN_FALSE;
8898 }
8899 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008903 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008905 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906}
8907
Eric Smith8c663262007-08-25 02:26:07 +00008908#include "stringlib/string_format.h"
8909
8910PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008912\n\
8913");
8914
Eric Smith4a7d76d2008-05-30 18:10:19 +00008915static PyObject *
8916unicode__format__(PyObject* self, PyObject* args)
8917{
8918 PyObject *format_spec;
8919
8920 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8921 return NULL;
8922
8923 return _PyUnicode_FormatAdvanced(self,
8924 PyUnicode_AS_UNICODE(format_spec),
8925 PyUnicode_GET_SIZE(format_spec));
8926}
8927
Eric Smith8c663262007-08-25 02:26:07 +00008928PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008930\n\
8931");
8932
8933static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008934unicode__sizeof__(PyUnicodeObject *v)
8935{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008936 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8937 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008938}
8939
8940PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008942
8943static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008944unicode_getnewargs(PyUnicodeObject *v)
8945{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008946 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008947}
8948
8949
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950static PyMethodDef unicode_methods[] = {
8951
8952 /* Order is according to common usage: often used methods should
8953 appear first, since lookup is done sequentially. */
8954
Benjamin Peterson308d6372009-09-18 21:42:35 +00008955 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008956 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8957 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008958 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008959 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8960 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8961 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8962 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8963 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8964 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8965 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008966 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008967 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8968 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8969 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008970 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008971 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8972 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8973 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008974 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008975 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008976 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008977 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008978 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8979 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8980 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8981 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8982 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8983 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8984 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8985 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8986 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8987 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8988 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8989 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8990 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8991 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008992 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008993 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008994 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008995 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008996 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00008997 {"maketrans", (PyCFunction) unicode_maketrans,
8998 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008999 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009000#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009001 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002#endif
9003
9004#if 0
9005 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009006 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007#endif
9008
Benjamin Peterson14339b62009-01-31 16:36:08 +00009009 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 {NULL, NULL}
9011};
9012
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009013static PyObject *
9014unicode_mod(PyObject *v, PyObject *w)
9015{
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 if (!PyUnicode_Check(v)) {
9017 Py_INCREF(Py_NotImplemented);
9018 return Py_NotImplemented;
9019 }
9020 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009021}
9022
9023static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009024 0, /*nb_add*/
9025 0, /*nb_subtract*/
9026 0, /*nb_multiply*/
9027 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009028};
9029
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009031 (lenfunc) unicode_length, /* sq_length */
9032 PyUnicode_Concat, /* sq_concat */
9033 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9034 (ssizeargfunc) unicode_getitem, /* sq_item */
9035 0, /* sq_slice */
9036 0, /* sq_ass_item */
9037 0, /* sq_ass_slice */
9038 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039};
9040
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009041static PyObject*
9042unicode_subscript(PyUnicodeObject* self, PyObject* item)
9043{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009044 if (PyIndex_Check(item)) {
9045 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009046 if (i == -1 && PyErr_Occurred())
9047 return NULL;
9048 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009049 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009050 return unicode_getitem(self, i);
9051 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009052 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009053 Py_UNICODE* source_buf;
9054 Py_UNICODE* result_buf;
9055 PyObject* result;
9056
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009057 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009058 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009059 return NULL;
9060 }
9061
9062 if (slicelength <= 0) {
9063 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009064 } else if (start == 0 && step == 1 && slicelength == self->length &&
9065 PyUnicode_CheckExact(self)) {
9066 Py_INCREF(self);
9067 return (PyObject *)self;
9068 } else if (step == 1) {
9069 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009070 } else {
9071 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009072 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9073 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009074
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 if (result_buf == NULL)
9076 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009077
9078 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9079 result_buf[i] = source_buf[cur];
9080 }
Tim Petersced69f82003-09-16 20:30:58 +00009081
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009082 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009083 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009084 return result;
9085 }
9086 } else {
9087 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9088 return NULL;
9089 }
9090}
9091
9092static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009093 (lenfunc)unicode_length, /* mp_length */
9094 (binaryfunc)unicode_subscript, /* mp_subscript */
9095 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009096};
9097
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099/* Helpers for PyUnicode_Format() */
9100
9101static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009102getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009104 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 (*p_argidx)++;
9107 if (arglen < 0)
9108 return args;
9109 else
9110 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 }
9112 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114 return NULL;
9115}
9116
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009117/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009119static PyObject *
9120formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009122 char *p;
9123 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009125
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 x = PyFloat_AsDouble(v);
9127 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009128 return NULL;
9129
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009132
Eric Smith0923d1d2009-04-16 20:16:10 +00009133 p = PyOS_double_to_string(x, type, prec,
9134 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009135 if (p == NULL)
9136 return NULL;
9137 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009138 PyMem_Free(p);
9139 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140}
9141
Tim Peters38fd5b62000-09-21 05:43:11 +00009142static PyObject*
9143formatlong(PyObject *val, int flags, int prec, int type)
9144{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009145 char *buf;
9146 int len;
9147 PyObject *str; /* temporary string object. */
9148 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009149
Benjamin Peterson14339b62009-01-31 16:36:08 +00009150 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9151 if (!str)
9152 return NULL;
9153 result = PyUnicode_FromStringAndSize(buf, len);
9154 Py_DECREF(str);
9155 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009156}
9157
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158static int
9159formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009160 size_t buflen,
9161 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009163 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009164 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 if (PyUnicode_GET_SIZE(v) == 1) {
9166 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9167 buf[1] = '\0';
9168 return 1;
9169 }
9170#ifndef Py_UNICODE_WIDE
9171 if (PyUnicode_GET_SIZE(v) == 2) {
9172 /* Decode a valid surrogate pair */
9173 int c0 = PyUnicode_AS_UNICODE(v)[0];
9174 int c1 = PyUnicode_AS_UNICODE(v)[1];
9175 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9176 0xDC00 <= c1 && c1 <= 0xDFFF) {
9177 buf[0] = c0;
9178 buf[1] = c1;
9179 buf[2] = '\0';
9180 return 2;
9181 }
9182 }
9183#endif
9184 goto onError;
9185 }
9186 else {
9187 /* Integer input truncated to a character */
9188 long x;
9189 x = PyLong_AsLong(v);
9190 if (x == -1 && PyErr_Occurred())
9191 goto onError;
9192
9193 if (x < 0 || x > 0x10ffff) {
9194 PyErr_SetString(PyExc_OverflowError,
9195 "%c arg not in range(0x110000)");
9196 return -1;
9197 }
9198
9199#ifndef Py_UNICODE_WIDE
9200 if (x > 0xffff) {
9201 x -= 0x10000;
9202 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9203 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9204 return 2;
9205 }
9206#endif
9207 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009208 buf[1] = '\0';
9209 return 1;
9210 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009211
Benjamin Peterson29060642009-01-31 22:14:21 +00009212 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009213 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009214 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009215 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216}
9217
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009218/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009219 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009220*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009221#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009222
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225{
9226 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009227 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 int args_owned = 0;
9229 PyUnicodeObject *result = NULL;
9230 PyObject *dict = NULL;
9231 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009232
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009234 PyErr_BadInternalCall();
9235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 }
9237 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009238 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240 fmt = PyUnicode_AS_UNICODE(uformat);
9241 fmtcnt = PyUnicode_GET_SIZE(uformat);
9242
9243 reslen = rescnt = fmtcnt + 100;
9244 result = _PyUnicode_New(reslen);
9245 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247 res = PyUnicode_AS_UNICODE(result);
9248
9249 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009250 arglen = PyTuple_Size(args);
9251 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252 }
9253 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 arglen = -1;
9255 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009257 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009258 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009259 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260
9261 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009262 if (*fmt != '%') {
9263 if (--rescnt < 0) {
9264 rescnt = fmtcnt + 100;
9265 reslen += rescnt;
9266 if (_PyUnicode_Resize(&result, reslen) < 0)
9267 goto onError;
9268 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9269 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009270 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009271 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009272 }
9273 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009274 /* Got a format specifier */
9275 int flags = 0;
9276 Py_ssize_t width = -1;
9277 int prec = -1;
9278 Py_UNICODE c = '\0';
9279 Py_UNICODE fill;
9280 int isnumok;
9281 PyObject *v = NULL;
9282 PyObject *temp = NULL;
9283 Py_UNICODE *pbuf;
9284 Py_UNICODE sign;
9285 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009286 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 fmt++;
9289 if (*fmt == '(') {
9290 Py_UNICODE *keystart;
9291 Py_ssize_t keylen;
9292 PyObject *key;
9293 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009294
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 if (dict == NULL) {
9296 PyErr_SetString(PyExc_TypeError,
9297 "format requires a mapping");
9298 goto onError;
9299 }
9300 ++fmt;
9301 --fmtcnt;
9302 keystart = fmt;
9303 /* Skip over balanced parentheses */
9304 while (pcount > 0 && --fmtcnt >= 0) {
9305 if (*fmt == ')')
9306 --pcount;
9307 else if (*fmt == '(')
9308 ++pcount;
9309 fmt++;
9310 }
9311 keylen = fmt - keystart - 1;
9312 if (fmtcnt < 0 || pcount > 0) {
9313 PyErr_SetString(PyExc_ValueError,
9314 "incomplete format key");
9315 goto onError;
9316 }
9317#if 0
9318 /* keys are converted to strings using UTF-8 and
9319 then looked up since Python uses strings to hold
9320 variables names etc. in its namespaces and we
9321 wouldn't want to break common idioms. */
9322 key = PyUnicode_EncodeUTF8(keystart,
9323 keylen,
9324 NULL);
9325#else
9326 key = PyUnicode_FromUnicode(keystart, keylen);
9327#endif
9328 if (key == NULL)
9329 goto onError;
9330 if (args_owned) {
9331 Py_DECREF(args);
9332 args_owned = 0;
9333 }
9334 args = PyObject_GetItem(dict, key);
9335 Py_DECREF(key);
9336 if (args == NULL) {
9337 goto onError;
9338 }
9339 args_owned = 1;
9340 arglen = -1;
9341 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009342 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 while (--fmtcnt >= 0) {
9344 switch (c = *fmt++) {
9345 case '-': flags |= F_LJUST; continue;
9346 case '+': flags |= F_SIGN; continue;
9347 case ' ': flags |= F_BLANK; continue;
9348 case '#': flags |= F_ALT; continue;
9349 case '0': flags |= F_ZERO; continue;
9350 }
9351 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009352 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009353 if (c == '*') {
9354 v = getnextarg(args, arglen, &argidx);
9355 if (v == NULL)
9356 goto onError;
9357 if (!PyLong_Check(v)) {
9358 PyErr_SetString(PyExc_TypeError,
9359 "* wants int");
9360 goto onError;
9361 }
9362 width = PyLong_AsLong(v);
9363 if (width == -1 && PyErr_Occurred())
9364 goto onError;
9365 if (width < 0) {
9366 flags |= F_LJUST;
9367 width = -width;
9368 }
9369 if (--fmtcnt >= 0)
9370 c = *fmt++;
9371 }
9372 else if (c >= '0' && c <= '9') {
9373 width = c - '0';
9374 while (--fmtcnt >= 0) {
9375 c = *fmt++;
9376 if (c < '0' || c > '9')
9377 break;
9378 if ((width*10) / 10 != width) {
9379 PyErr_SetString(PyExc_ValueError,
9380 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009381 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 }
9383 width = width*10 + (c - '0');
9384 }
9385 }
9386 if (c == '.') {
9387 prec = 0;
9388 if (--fmtcnt >= 0)
9389 c = *fmt++;
9390 if (c == '*') {
9391 v = getnextarg(args, arglen, &argidx);
9392 if (v == NULL)
9393 goto onError;
9394 if (!PyLong_Check(v)) {
9395 PyErr_SetString(PyExc_TypeError,
9396 "* wants int");
9397 goto onError;
9398 }
9399 prec = PyLong_AsLong(v);
9400 if (prec == -1 && PyErr_Occurred())
9401 goto onError;
9402 if (prec < 0)
9403 prec = 0;
9404 if (--fmtcnt >= 0)
9405 c = *fmt++;
9406 }
9407 else if (c >= '0' && c <= '9') {
9408 prec = c - '0';
9409 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009410 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 if (c < '0' || c > '9')
9412 break;
9413 if ((prec*10) / 10 != prec) {
9414 PyErr_SetString(PyExc_ValueError,
9415 "prec too big");
9416 goto onError;
9417 }
9418 prec = prec*10 + (c - '0');
9419 }
9420 }
9421 } /* prec */
9422 if (fmtcnt >= 0) {
9423 if (c == 'h' || c == 'l' || c == 'L') {
9424 if (--fmtcnt >= 0)
9425 c = *fmt++;
9426 }
9427 }
9428 if (fmtcnt < 0) {
9429 PyErr_SetString(PyExc_ValueError,
9430 "incomplete format");
9431 goto onError;
9432 }
9433 if (c != '%') {
9434 v = getnextarg(args, arglen, &argidx);
9435 if (v == NULL)
9436 goto onError;
9437 }
9438 sign = 0;
9439 fill = ' ';
9440 switch (c) {
9441
9442 case '%':
9443 pbuf = formatbuf;
9444 /* presume that buffer length is at least 1 */
9445 pbuf[0] = '%';
9446 len = 1;
9447 break;
9448
9449 case 's':
9450 case 'r':
9451 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009452 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009453 temp = v;
9454 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009455 }
9456 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 if (c == 's')
9458 temp = PyObject_Str(v);
9459 else if (c == 'r')
9460 temp = PyObject_Repr(v);
9461 else
9462 temp = PyObject_ASCII(v);
9463 if (temp == NULL)
9464 goto onError;
9465 if (PyUnicode_Check(temp))
9466 /* nothing to do */;
9467 else {
9468 Py_DECREF(temp);
9469 PyErr_SetString(PyExc_TypeError,
9470 "%s argument has non-string str()");
9471 goto onError;
9472 }
9473 }
9474 pbuf = PyUnicode_AS_UNICODE(temp);
9475 len = PyUnicode_GET_SIZE(temp);
9476 if (prec >= 0 && len > prec)
9477 len = prec;
9478 break;
9479
9480 case 'i':
9481 case 'd':
9482 case 'u':
9483 case 'o':
9484 case 'x':
9485 case 'X':
9486 if (c == 'i')
9487 c = 'd';
9488 isnumok = 0;
9489 if (PyNumber_Check(v)) {
9490 PyObject *iobj=NULL;
9491
9492 if (PyLong_Check(v)) {
9493 iobj = v;
9494 Py_INCREF(iobj);
9495 }
9496 else {
9497 iobj = PyNumber_Long(v);
9498 }
9499 if (iobj!=NULL) {
9500 if (PyLong_Check(iobj)) {
9501 isnumok = 1;
9502 temp = formatlong(iobj, flags, prec, c);
9503 Py_DECREF(iobj);
9504 if (!temp)
9505 goto onError;
9506 pbuf = PyUnicode_AS_UNICODE(temp);
9507 len = PyUnicode_GET_SIZE(temp);
9508 sign = 1;
9509 }
9510 else {
9511 Py_DECREF(iobj);
9512 }
9513 }
9514 }
9515 if (!isnumok) {
9516 PyErr_Format(PyExc_TypeError,
9517 "%%%c format: a number is required, "
9518 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9519 goto onError;
9520 }
9521 if (flags & F_ZERO)
9522 fill = '0';
9523 break;
9524
9525 case 'e':
9526 case 'E':
9527 case 'f':
9528 case 'F':
9529 case 'g':
9530 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009531 temp = formatfloat(v, flags, prec, c);
9532 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009534 pbuf = PyUnicode_AS_UNICODE(temp);
9535 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009536 sign = 1;
9537 if (flags & F_ZERO)
9538 fill = '0';
9539 break;
9540
9541 case 'c':
9542 pbuf = formatbuf;
9543 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9544 if (len < 0)
9545 goto onError;
9546 break;
9547
9548 default:
9549 PyErr_Format(PyExc_ValueError,
9550 "unsupported format character '%c' (0x%x) "
9551 "at index %zd",
9552 (31<=c && c<=126) ? (char)c : '?',
9553 (int)c,
9554 (Py_ssize_t)(fmt - 1 -
9555 PyUnicode_AS_UNICODE(uformat)));
9556 goto onError;
9557 }
9558 if (sign) {
9559 if (*pbuf == '-' || *pbuf == '+') {
9560 sign = *pbuf++;
9561 len--;
9562 }
9563 else if (flags & F_SIGN)
9564 sign = '+';
9565 else if (flags & F_BLANK)
9566 sign = ' ';
9567 else
9568 sign = 0;
9569 }
9570 if (width < len)
9571 width = len;
9572 if (rescnt - (sign != 0) < width) {
9573 reslen -= rescnt;
9574 rescnt = width + fmtcnt + 100;
9575 reslen += rescnt;
9576 if (reslen < 0) {
9577 Py_XDECREF(temp);
9578 PyErr_NoMemory();
9579 goto onError;
9580 }
9581 if (_PyUnicode_Resize(&result, reslen) < 0) {
9582 Py_XDECREF(temp);
9583 goto onError;
9584 }
9585 res = PyUnicode_AS_UNICODE(result)
9586 + reslen - rescnt;
9587 }
9588 if (sign) {
9589 if (fill != ' ')
9590 *res++ = sign;
9591 rescnt--;
9592 if (width > len)
9593 width--;
9594 }
9595 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9596 assert(pbuf[0] == '0');
9597 assert(pbuf[1] == c);
9598 if (fill != ' ') {
9599 *res++ = *pbuf++;
9600 *res++ = *pbuf++;
9601 }
9602 rescnt -= 2;
9603 width -= 2;
9604 if (width < 0)
9605 width = 0;
9606 len -= 2;
9607 }
9608 if (width > len && !(flags & F_LJUST)) {
9609 do {
9610 --rescnt;
9611 *res++ = fill;
9612 } while (--width > len);
9613 }
9614 if (fill == ' ') {
9615 if (sign)
9616 *res++ = sign;
9617 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9618 assert(pbuf[0] == '0');
9619 assert(pbuf[1] == c);
9620 *res++ = *pbuf++;
9621 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009622 }
9623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009624 Py_UNICODE_COPY(res, pbuf, len);
9625 res += len;
9626 rescnt -= len;
9627 while (--width >= len) {
9628 --rescnt;
9629 *res++ = ' ';
9630 }
9631 if (dict && (argidx < arglen) && c != '%') {
9632 PyErr_SetString(PyExc_TypeError,
9633 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009634 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 goto onError;
9636 }
9637 Py_XDECREF(temp);
9638 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 } /* until end */
9640 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 PyErr_SetString(PyExc_TypeError,
9642 "not all arguments converted during string formatting");
9643 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644 }
9645
Thomas Woutersa96affe2006-03-12 00:29:36 +00009646 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650 }
9651 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652 return (PyObject *)result;
9653
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 Py_XDECREF(result);
9656 Py_DECREF(uformat);
9657 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009658 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659 }
9660 return NULL;
9661}
9662
Jeremy Hylton938ace62002-07-17 16:30:39 +00009663static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009664unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9665
Tim Peters6d6c1a32001-08-02 04:15:00 +00009666static PyObject *
9667unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9668{
Benjamin Peterson29060642009-01-31 22:14:21 +00009669 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009670 static char *kwlist[] = {"object", "encoding", "errors", 0};
9671 char *encoding = NULL;
9672 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009673
Benjamin Peterson14339b62009-01-31 16:36:08 +00009674 if (type != &PyUnicode_Type)
9675 return unicode_subtype_new(type, args, kwds);
9676 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009677 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009678 return NULL;
9679 if (x == NULL)
9680 return (PyObject *)_PyUnicode_New(0);
9681 if (encoding == NULL && errors == NULL)
9682 return PyObject_Str(x);
9683 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009684 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009685}
9686
Guido van Rossume023fe02001-08-30 03:12:59 +00009687static PyObject *
9688unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9689{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009690 PyUnicodeObject *tmp, *pnew;
9691 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009692
Benjamin Peterson14339b62009-01-31 16:36:08 +00009693 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9694 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9695 if (tmp == NULL)
9696 return NULL;
9697 assert(PyUnicode_Check(tmp));
9698 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9699 if (pnew == NULL) {
9700 Py_DECREF(tmp);
9701 return NULL;
9702 }
9703 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9704 if (pnew->str == NULL) {
9705 _Py_ForgetReference((PyObject *)pnew);
9706 PyObject_Del(pnew);
9707 Py_DECREF(tmp);
9708 return PyErr_NoMemory();
9709 }
9710 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9711 pnew->length = n;
9712 pnew->hash = tmp->hash;
9713 Py_DECREF(tmp);
9714 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009715}
9716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009717PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009719\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009720Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009721encoding defaults to the current default string encoding.\n\
9722errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009723
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009724static PyObject *unicode_iter(PyObject *seq);
9725
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009727 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009728 "str", /* tp_name */
9729 sizeof(PyUnicodeObject), /* tp_size */
9730 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009732 (destructor)unicode_dealloc, /* tp_dealloc */
9733 0, /* tp_print */
9734 0, /* tp_getattr */
9735 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009736 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009737 unicode_repr, /* tp_repr */
9738 &unicode_as_number, /* tp_as_number */
9739 &unicode_as_sequence, /* tp_as_sequence */
9740 &unicode_as_mapping, /* tp_as_mapping */
9741 (hashfunc) unicode_hash, /* tp_hash*/
9742 0, /* tp_call*/
9743 (reprfunc) unicode_str, /* tp_str */
9744 PyObject_GenericGetAttr, /* tp_getattro */
9745 0, /* tp_setattro */
9746 0, /* tp_as_buffer */
9747 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009748 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009749 unicode_doc, /* tp_doc */
9750 0, /* tp_traverse */
9751 0, /* tp_clear */
9752 PyUnicode_RichCompare, /* tp_richcompare */
9753 0, /* tp_weaklistoffset */
9754 unicode_iter, /* tp_iter */
9755 0, /* tp_iternext */
9756 unicode_methods, /* tp_methods */
9757 0, /* tp_members */
9758 0, /* tp_getset */
9759 &PyBaseObject_Type, /* tp_base */
9760 0, /* tp_dict */
9761 0, /* tp_descr_get */
9762 0, /* tp_descr_set */
9763 0, /* tp_dictoffset */
9764 0, /* tp_init */
9765 0, /* tp_alloc */
9766 unicode_new, /* tp_new */
9767 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768};
9769
9770/* Initialize the Unicode implementation */
9771
Thomas Wouters78890102000-07-22 19:25:51 +00009772void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009774 int i;
9775
Thomas Wouters477c8d52006-05-27 19:21:47 +00009776 /* XXX - move this array to unicodectype.c ? */
9777 Py_UNICODE linebreak[] = {
9778 0x000A, /* LINE FEED */
9779 0x000D, /* CARRIAGE RETURN */
9780 0x001C, /* FILE SEPARATOR */
9781 0x001D, /* GROUP SEPARATOR */
9782 0x001E, /* RECORD SEPARATOR */
9783 0x0085, /* NEXT LINE */
9784 0x2028, /* LINE SEPARATOR */
9785 0x2029, /* PARAGRAPH SEPARATOR */
9786 };
9787
Fred Drakee4315f52000-05-09 19:53:39 +00009788 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009789 free_list = NULL;
9790 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009792 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009793 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009794
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009795 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009796 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009797 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009798 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009799
9800 /* initialize the linebreak bloom filter */
9801 bloom_linebreak = make_bloom_mask(
9802 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9803 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009804
9805 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806}
9807
9808/* Finalize the Unicode implementation */
9809
Christian Heimesa156e092008-02-16 07:38:31 +00009810int
9811PyUnicode_ClearFreeList(void)
9812{
9813 int freelist_size = numfree;
9814 PyUnicodeObject *u;
9815
9816 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009817 PyUnicodeObject *v = u;
9818 u = *(PyUnicodeObject **)u;
9819 if (v->str)
9820 PyObject_DEL(v->str);
9821 Py_XDECREF(v->defenc);
9822 PyObject_Del(v);
9823 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009824 }
9825 free_list = NULL;
9826 assert(numfree == 0);
9827 return freelist_size;
9828}
9829
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830void
Thomas Wouters78890102000-07-22 19:25:51 +00009831_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009833 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009835 Py_XDECREF(unicode_empty);
9836 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009837
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009838 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009839 if (unicode_latin1[i]) {
9840 Py_DECREF(unicode_latin1[i]);
9841 unicode_latin1[i] = NULL;
9842 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009843 }
Christian Heimesa156e092008-02-16 07:38:31 +00009844 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009846
Walter Dörwald16807132007-05-25 13:52:07 +00009847void
9848PyUnicode_InternInPlace(PyObject **p)
9849{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009850 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9851 PyObject *t;
9852 if (s == NULL || !PyUnicode_Check(s))
9853 Py_FatalError(
9854 "PyUnicode_InternInPlace: unicode strings only please!");
9855 /* If it's a subclass, we don't really know what putting
9856 it in the interned dict might do. */
9857 if (!PyUnicode_CheckExact(s))
9858 return;
9859 if (PyUnicode_CHECK_INTERNED(s))
9860 return;
9861 if (interned == NULL) {
9862 interned = PyDict_New();
9863 if (interned == NULL) {
9864 PyErr_Clear(); /* Don't leave an exception */
9865 return;
9866 }
9867 }
9868 /* It might be that the GetItem call fails even
9869 though the key is present in the dictionary,
9870 namely when this happens during a stack overflow. */
9871 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009872 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009873 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009874
Benjamin Peterson29060642009-01-31 22:14:21 +00009875 if (t) {
9876 Py_INCREF(t);
9877 Py_DECREF(*p);
9878 *p = t;
9879 return;
9880 }
Walter Dörwald16807132007-05-25 13:52:07 +00009881
Benjamin Peterson14339b62009-01-31 16:36:08 +00009882 PyThreadState_GET()->recursion_critical = 1;
9883 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9884 PyErr_Clear();
9885 PyThreadState_GET()->recursion_critical = 0;
9886 return;
9887 }
9888 PyThreadState_GET()->recursion_critical = 0;
9889 /* The two references in interned are not counted by refcnt.
9890 The deallocator will take care of this */
9891 Py_REFCNT(s) -= 2;
9892 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009893}
9894
9895void
9896PyUnicode_InternImmortal(PyObject **p)
9897{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009898 PyUnicode_InternInPlace(p);
9899 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9900 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9901 Py_INCREF(*p);
9902 }
Walter Dörwald16807132007-05-25 13:52:07 +00009903}
9904
9905PyObject *
9906PyUnicode_InternFromString(const char *cp)
9907{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009908 PyObject *s = PyUnicode_FromString(cp);
9909 if (s == NULL)
9910 return NULL;
9911 PyUnicode_InternInPlace(&s);
9912 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009913}
9914
9915void _Py_ReleaseInternedUnicodeStrings(void)
9916{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009917 PyObject *keys;
9918 PyUnicodeObject *s;
9919 Py_ssize_t i, n;
9920 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009921
Benjamin Peterson14339b62009-01-31 16:36:08 +00009922 if (interned == NULL || !PyDict_Check(interned))
9923 return;
9924 keys = PyDict_Keys(interned);
9925 if (keys == NULL || !PyList_Check(keys)) {
9926 PyErr_Clear();
9927 return;
9928 }
Walter Dörwald16807132007-05-25 13:52:07 +00009929
Benjamin Peterson14339b62009-01-31 16:36:08 +00009930 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9931 detector, interned unicode strings are not forcibly deallocated;
9932 rather, we give them their stolen references back, and then clear
9933 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009934
Benjamin Peterson14339b62009-01-31 16:36:08 +00009935 n = PyList_GET_SIZE(keys);
9936 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009938 for (i = 0; i < n; i++) {
9939 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9940 switch (s->state) {
9941 case SSTATE_NOT_INTERNED:
9942 /* XXX Shouldn't happen */
9943 break;
9944 case SSTATE_INTERNED_IMMORTAL:
9945 Py_REFCNT(s) += 1;
9946 immortal_size += s->length;
9947 break;
9948 case SSTATE_INTERNED_MORTAL:
9949 Py_REFCNT(s) += 2;
9950 mortal_size += s->length;
9951 break;
9952 default:
9953 Py_FatalError("Inconsistent interned string state.");
9954 }
9955 s->state = SSTATE_NOT_INTERNED;
9956 }
9957 fprintf(stderr, "total size of all interned strings: "
9958 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9959 "mortal/immortal\n", mortal_size, immortal_size);
9960 Py_DECREF(keys);
9961 PyDict_Clear(interned);
9962 Py_DECREF(interned);
9963 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009964}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009965
9966
9967/********************* Unicode Iterator **************************/
9968
9969typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009970 PyObject_HEAD
9971 Py_ssize_t it_index;
9972 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009973} unicodeiterobject;
9974
9975static void
9976unicodeiter_dealloc(unicodeiterobject *it)
9977{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009978 _PyObject_GC_UNTRACK(it);
9979 Py_XDECREF(it->it_seq);
9980 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009981}
9982
9983static int
9984unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9985{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009986 Py_VISIT(it->it_seq);
9987 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009988}
9989
9990static PyObject *
9991unicodeiter_next(unicodeiterobject *it)
9992{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009993 PyUnicodeObject *seq;
9994 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009995
Benjamin Peterson14339b62009-01-31 16:36:08 +00009996 assert(it != NULL);
9997 seq = it->it_seq;
9998 if (seq == NULL)
9999 return NULL;
10000 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010001
Benjamin Peterson14339b62009-01-31 16:36:08 +000010002 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10003 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010004 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010005 if (item != NULL)
10006 ++it->it_index;
10007 return item;
10008 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010009
Benjamin Peterson14339b62009-01-31 16:36:08 +000010010 Py_DECREF(seq);
10011 it->it_seq = NULL;
10012 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010013}
10014
10015static PyObject *
10016unicodeiter_len(unicodeiterobject *it)
10017{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010018 Py_ssize_t len = 0;
10019 if (it->it_seq)
10020 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10021 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010022}
10023
10024PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10025
10026static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010027 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010028 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010029 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010030};
10031
10032PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010033 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10034 "str_iterator", /* tp_name */
10035 sizeof(unicodeiterobject), /* tp_basicsize */
10036 0, /* tp_itemsize */
10037 /* methods */
10038 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10039 0, /* tp_print */
10040 0, /* tp_getattr */
10041 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010042 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010043 0, /* tp_repr */
10044 0, /* tp_as_number */
10045 0, /* tp_as_sequence */
10046 0, /* tp_as_mapping */
10047 0, /* tp_hash */
10048 0, /* tp_call */
10049 0, /* tp_str */
10050 PyObject_GenericGetAttr, /* tp_getattro */
10051 0, /* tp_setattro */
10052 0, /* tp_as_buffer */
10053 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10054 0, /* tp_doc */
10055 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10056 0, /* tp_clear */
10057 0, /* tp_richcompare */
10058 0, /* tp_weaklistoffset */
10059 PyObject_SelfIter, /* tp_iter */
10060 (iternextfunc)unicodeiter_next, /* tp_iternext */
10061 unicodeiter_methods, /* tp_methods */
10062 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010063};
10064
10065static PyObject *
10066unicode_iter(PyObject *seq)
10067{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010068 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010069
Benjamin Peterson14339b62009-01-31 16:36:08 +000010070 if (!PyUnicode_Check(seq)) {
10071 PyErr_BadInternalCall();
10072 return NULL;
10073 }
10074 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10075 if (it == NULL)
10076 return NULL;
10077 it->it_index = 0;
10078 Py_INCREF(seq);
10079 it->it_seq = (PyUnicodeObject *)seq;
10080 _PyObject_GC_TRACK(it);
10081 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010082}
10083
Martin v. Löwis5b222132007-06-10 09:51:05 +000010084size_t
10085Py_UNICODE_strlen(const Py_UNICODE *u)
10086{
10087 int res = 0;
10088 while(*u++)
10089 res++;
10090 return res;
10091}
10092
10093Py_UNICODE*
10094Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10095{
10096 Py_UNICODE *u = s1;
10097 while ((*u++ = *s2++));
10098 return s1;
10099}
10100
10101Py_UNICODE*
10102Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10103{
10104 Py_UNICODE *u = s1;
10105 while ((*u++ = *s2++))
10106 if (n-- == 0)
10107 break;
10108 return s1;
10109}
10110
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010111Py_UNICODE*
10112Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10113{
10114 Py_UNICODE *u1 = s1;
10115 u1 += Py_UNICODE_strlen(u1);
10116 Py_UNICODE_strcpy(u1, s2);
10117 return s1;
10118}
10119
Martin v. Löwis5b222132007-06-10 09:51:05 +000010120int
10121Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10122{
10123 while (*s1 && *s2 && *s1 == *s2)
10124 s1++, s2++;
10125 if (*s1 && *s2)
10126 return (*s1 < *s2) ? -1 : +1;
10127 if (*s1)
10128 return 1;
10129 if (*s2)
10130 return -1;
10131 return 0;
10132}
10133
Victor Stinneref8d95c2010-08-16 22:03:11 +000010134int
10135Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10136{
10137 register Py_UNICODE u1, u2;
10138 for (; n != 0; n--) {
10139 u1 = *s1;
10140 u2 = *s2;
10141 if (u1 != u2)
10142 return (u1 < u2) ? -1 : +1;
10143 if (u1 == '\0')
10144 return 0;
10145 s1++;
10146 s2++;
10147 }
10148 return 0;
10149}
10150
Martin v. Löwis5b222132007-06-10 09:51:05 +000010151Py_UNICODE*
10152Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10153{
10154 const Py_UNICODE *p;
10155 for (p = s; *p; p++)
10156 if (*p == c)
10157 return (Py_UNICODE*)p;
10158 return NULL;
10159}
10160
Victor Stinner331ea922010-08-10 16:37:20 +000010161Py_UNICODE*
10162Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10163{
10164 const Py_UNICODE *p;
10165 p = s + Py_UNICODE_strlen(s);
10166 while (p != s) {
10167 p--;
10168 if (*p == c)
10169 return (Py_UNICODE*)p;
10170 }
10171 return NULL;
10172}
10173
Victor Stinner71133ff2010-09-01 23:43:53 +000010174Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010175PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010176{
10177 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10178 Py_UNICODE *copy;
10179 Py_ssize_t size;
10180
10181 /* Ensure we won't overflow the size. */
10182 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10183 PyErr_NoMemory();
10184 return NULL;
10185 }
10186 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10187 size *= sizeof(Py_UNICODE);
10188 copy = PyMem_Malloc(size);
10189 if (copy == NULL) {
10190 PyErr_NoMemory();
10191 return NULL;
10192 }
10193 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10194 return copy;
10195}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010196
Georg Brandl66c221e2010-10-14 07:04:07 +000010197/* A _string module, to export formatter_parser and formatter_field_name_split
10198 to the string.Formatter class implemented in Python. */
10199
10200static PyMethodDef _string_methods[] = {
10201 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10202 METH_O, PyDoc_STR("split the argument as a field name")},
10203 {"formatter_parser", (PyCFunction) formatter_parser,
10204 METH_O, PyDoc_STR("parse the argument as a format string")},
10205 {NULL, NULL}
10206};
10207
10208static struct PyModuleDef _string_module = {
10209 PyModuleDef_HEAD_INIT,
10210 "_string",
10211 PyDoc_STR("string helper module"),
10212 0,
10213 _string_methods,
10214 NULL,
10215 NULL,
10216 NULL,
10217 NULL
10218};
10219
10220PyMODINIT_FUNC
10221PyInit__string(void)
10222{
10223 return PyModule_Create(&_string_module);
10224}
10225
10226
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010227#ifdef __cplusplus
10228}
10229#endif