blob: c010b1b246cff91c46c7ac7fadfd0a62ab2477c4 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
/* Upper bound on the number of Unicode objects parked on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "stay alive" optimization.

   A Unicode object that goes onto the free list keeps its character
   buffer allocated as long as its length is below this limit; longer
   buffers are released right away.  This saves malloc() calls for small
   Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
101 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the 7-bit character c is treated as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     *   0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
     *   0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149 PyObject **errorHandler,const char *encoding, const char *reason,
150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
Victor Stinner31be90b2010-04-22 19:38:16 +0000153static void raise_encode_exception(PyObject **exceptionObject,
154 const char *encoding,
155 const Py_UNICODE *unicode, Py_ssize_t size,
156 Py_ssize_t startpos, Py_ssize_t endpos,
157 const char *reason);
158
/* Same for linebreaks: entry c is 1 when the 7-bit character c is a line
   break, 0 otherwise.  The table is only ever read (see BLOOM_LINEBREAK),
   so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0A LINE FEED, 0x0B LINE TABULATION,
     *   0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
202/* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrouf068f942010-01-13 14:19:12 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
Antoine Pitrouf068f942010-01-13 14:19:12 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
Benjamin Peterson29060642009-01-31 22:14:21 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252}
253
/* True if chr is a member of set (setlen characters); the bloom mask is
   tested first so that unicode_member() only runs for possible hits.
   The expansion is parenthesized as a whole so it composes correctly with
   surrounding operators such as `!`.  Note that chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000278 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000309 Ux0000 terminated; some code (e.g. new_identifier)
310 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000313 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314
315*/
316
317static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319{
320 register PyUnicodeObject *unicode;
321
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 if (length == 0 && unicode_empty != NULL) {
324 Py_INCREF(unicode_empty);
325 return unicode_empty;
326 }
327
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000328 /* Ensure we won't overflow the size. */
329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330 return (PyUnicodeObject *)PyErr_NoMemory();
331 }
332
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000334 if (free_list) {
335 unicode = free_list;
336 free_list = *(PyUnicodeObject **)unicode;
337 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000338 if (unicode->str) {
339 /* Keep-Alive optimization: we only upsize the buffer,
340 never downsize it. */
341 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000342 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000343 PyObject_DEL(unicode->str);
344 unicode->str = NULL;
345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000346 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000350 }
351 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 if (unicode == NULL)
357 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 }
361
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 PyErr_NoMemory();
364 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000365 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000366 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000367 * the caller fails before initializing str -- unicode_resize()
368 * reads str[0], and the Keep-Alive optimization can keep memory
369 * allocated for str alive across a call to unicode_dealloc(unicode).
370 * We don't want unicode_resize to read uninitialized memory in
371 * that case.
372 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000373 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000377 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000378 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380
Benjamin Peterson29060642009-01-31 22:14:21 +0000381 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000382 /* XXX UNREF/NEWREF interface should be more symmetrical */
383 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000385 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389static
Guido van Rossum9475a232001-10-05 20:51:39 +0000390void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391{
Walter Dörwald16807132007-05-25 13:52:07 +0000392 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_NOT_INTERNED:
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_MORTAL:
397 /* revive dead object temporarily for DelItem */
398 Py_REFCNT(unicode) = 3;
399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400 Py_FatalError(
401 "deletion of interned string failed");
402 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 case SSTATE_INTERNED_IMMORTAL:
405 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406
Benjamin Peterson29060642009-01-31 22:14:21 +0000407 default:
408 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000409 }
410
Guido van Rossum604ddf82001-12-06 20:03:56 +0000411 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000413 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415 PyObject_DEL(unicode->str);
416 unicode->str = NULL;
417 unicode->length = 0;
418 }
419 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000420 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 }
422 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000423 *(PyUnicodeObject **)unicode = free_list;
424 free_list = unicode;
425 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426 }
427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyObject_DEL(unicode->str);
429 Py_XDECREF(unicode->defenc);
430 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432}
433
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436{
437 register PyUnicodeObject *v;
438
439 /* Argument checks */
440 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 PyErr_BadInternalCall();
442 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
449
450 /* Resizing unicode_empty and single character objects is not
451 possible since these are being shared. We simply return a fresh
452 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000453 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000454 (v == unicode_empty || v->length == 1)) {
455 PyUnicodeObject *w = _PyUnicode_New(length);
456 if (w == NULL)
457 return -1;
458 Py_UNICODE_COPY(w->str, v->str,
459 length < v->length ? length : v->length);
460 Py_DECREF(*unicode);
461 *unicode = w;
462 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 }
464
465 /* Note that we don't have to modify *unicode for unshared Unicode
466 objects, since we can modify them in-place. */
467 return unicode_resize(v, length);
468}
469
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477{
478 PyUnicodeObject *unicode;
479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects. */
482 if (u != NULL) {
483
Benjamin Peterson29060642009-01-31 22:14:21 +0000484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000489
490 /* Single character Unicode objects in the Latin-1 range are
491 shared when using this constructor */
492 if (size == 1 && *u < 256) {
493 unicode = unicode_latin1[*u];
494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = *u;
499 unicode_latin1[*u] = unicode;
500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 /* Copy the Unicode data into the new object */
511 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000512 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 return (PyObject *)unicode;
515}
516
Walter Dörwaldd2034312007-05-18 16:29:38 +0000517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518{
519 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000520
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 if (size < 0) {
522 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 return NULL;
525 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000526
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000528 some optimizations which share commonly used objects.
529 Also, this means the input must be UTF-8, so fall back to the
530 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (u != NULL) {
532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533 /* Optimization for empty strings */
534 if (size == 0 && unicode_empty != NULL) {
535 Py_INCREF(unicode_empty);
536 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000537 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000538
539 /* Single characters are shared when using this constructor.
540 Restrict to ASCII, since the input must be UTF-8. */
541 if (size == 1 && Py_CHARMASK(*u) < 128) {
542 unicode = unicode_latin1[Py_CHARMASK(*u)];
543 if (!unicode) {
544 unicode = _PyUnicode_New(1);
545 if (!unicode)
546 return NULL;
547 unicode->str[0] = Py_CHARMASK(*u);
548 unicode_latin1[Py_CHARMASK(*u)] = unicode;
549 }
550 Py_INCREF(unicode);
551 return (PyObject *)unicode;
552 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000553
554 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 }
556
Walter Dörwald55507312007-05-18 13:12:10 +0000557 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 if (!unicode)
559 return NULL;
560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 return (PyObject *)unicode;
562}
563
Walter Dörwaldd2034312007-05-18 16:29:38 +0000564PyObject *PyUnicode_FromString(const char *u)
565{
566 size_t size = strlen(u);
567 if (size > PY_SSIZE_T_MAX) {
568 PyErr_SetString(PyExc_OverflowError, "input too long");
569 return NULL;
570 }
571
572 return PyUnicode_FromStringAndSize(u, size);
573}
574
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575#ifdef HAVE_WCHAR_H
576
Mark Dickinson081dfee2009-03-18 14:47:41 +0000577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584 to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
588{
589 PyUnicodeObject *unicode;
590 register Py_ssize_t i;
591 Py_ssize_t alloc;
592 const wchar_t *orig_w;
593
594 if (w == NULL) {
595 if (size == 0)
596 return PyUnicode_FromStringAndSize(NULL, 0);
597 PyErr_BadInternalCall();
598 return NULL;
599 }
600
601 if (size == -1) {
602 size = wcslen(w);
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000643 if (size == 0)
644 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_BadInternalCall();
646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 }
648
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == -1) {
650 size = wcslen(w);
651 }
652
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 unicode = _PyUnicode_New(size);
654 if (!unicode)
655 return NULL;
656
657 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000660#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000662 register Py_UNICODE *u;
663 register Py_ssize_t i;
664 u = PyUnicode_AS_UNICODE(unicode);
665 for (i = size; i > 0; i--)
666 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668#endif
669
670 return (PyObject *)unicode;
671}
672
Mark Dickinson081dfee2009-03-18 14:47:41 +0000673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
Walter Dörwald346737f2007-05-31 10:44:43 +0000677static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000680{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000681 *fmt++ = '%';
682 if (width) {
683 if (zeropad)
684 *fmt++ = '0';
685 fmt += sprintf(fmt, "%d", width);
686 }
687 if (precision)
688 fmt += sprintf(fmt, ".%d", precision);
689 if (longflag)
690 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000691 else if (longlongflag) {
692 /* longlongflag should only ever be nonzero on machines with
693 HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695 char *f = PY_FORMAT_LONG_LONG;
696 while (*f)
697 *fmt++ = *f++;
698#else
699 /* we shouldn't ever get here */
700 assert(0);
701 *fmt++ = 'l';
702#endif
703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000704 else if (size_tflag) {
705 char *f = PY_FORMAT_SIZE_T;
706 while (*f)
707 *fmt++ = *f++;
708 }
709 *fmt++ = c;
710 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000711}
712
/* Append the NUL-terminated C string `string` to the output, one char per
   output element.  The expansion uses two variables that must be in scope
   at the point of use: `copy` (a const char * scratch cursor) and `s` (the
   output cursor, advanced past the appended characters).
   Wrapped in do { } while (0) so that "appendstring(x);" is exactly one
   statement and is safe in unbraced if/else bodies. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy;) \
            *s++ = *copy++; \
    } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000728 va_list count;
729 Py_ssize_t callcount = 0;
730 PyObject **callresults = NULL;
731 PyObject **callresult = NULL;
732 Py_ssize_t n = 0;
733 int width = 0;
734 int precision = 0;
735 int zeropad;
736 const char* f;
737 Py_UNICODE *s;
738 PyObject *string;
739 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000740 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000741 /* use abuffer instead of buffer, if we need more space
742 * (which can happen if there's a format specifier with width). */
743 char *abuffer = NULL;
744 char *realbuffer;
745 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000749 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000767 }
768 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000769 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000770 if (callcount) {
771 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
772 if (!callresults) {
773 PyErr_NoMemory();
774 return NULL;
775 }
776 callresult = callresults;
777 }
778 /* step 3: figure out how large a buffer we need */
779 for (f = format; *f; f++) {
780 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000781#ifdef HAVE_LONG_LONG
782 int longlongflag = 0;
783#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000784 const char* p = f;
785 width = 0;
786 while (ISDIGIT((unsigned)*f))
787 width = (width*10) + *f++ - '0';
788 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
789 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000790
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
792 * they don't affect the amount of space we reserve.
793 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000794 if (*f == 'l') {
795 if (f[1] == 'd' || f[1] == 'u') {
796 ++f;
797 }
798#ifdef HAVE_LONG_LONG
799 else if (f[1] == 'l' &&
800 (f[2] == 'd' || f[2] == 'u')) {
801 longlongflag = 1;
802 f += 2;
803 }
804#endif
805 }
806 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000807 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000808 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000809
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 switch (*f) {
811 case 'c':
812 (void)va_arg(count, int);
813 /* fall through... */
814 case '%':
815 n++;
816 break;
817 case 'd': case 'u': case 'i': case 'x':
818 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000819#ifdef HAVE_LONG_LONG
820 if (longlongflag) {
821 if (width < MAX_LONG_LONG_CHARS)
822 width = MAX_LONG_LONG_CHARS;
823 }
824 else
825#endif
826 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
827 including sign. Decimal takes the most space. This
828 isn't enough for octal. If a width is specified we
829 need more (which we allocate later). */
830 if (width < MAX_LONG_CHARS)
831 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000832 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000833 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000834 if (abuffersize < width)
835 abuffersize = width;
836 break;
837 case 's':
838 {
839 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000840 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000841 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
842 if (!str)
843 goto fail;
844 n += PyUnicode_GET_SIZE(str);
845 /* Remember the str and switch to the next slot */
846 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 break;
848 }
849 case 'U':
850 {
851 PyObject *obj = va_arg(count, PyObject *);
852 assert(obj && PyUnicode_Check(obj));
853 n += PyUnicode_GET_SIZE(obj);
854 break;
855 }
856 case 'V':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 const char *str = va_arg(count, const char *);
860 assert(obj || str);
861 assert(!obj || PyUnicode_Check(obj));
862 if (obj)
863 n += PyUnicode_GET_SIZE(obj);
864 else
865 n += strlen(str);
866 break;
867 }
868 case 'S':
869 {
870 PyObject *obj = va_arg(count, PyObject *);
871 PyObject *str;
872 assert(obj);
873 str = PyObject_Str(obj);
874 if (!str)
875 goto fail;
876 n += PyUnicode_GET_SIZE(str);
877 /* Remember the str and switch to the next slot */
878 *callresult++ = str;
879 break;
880 }
881 case 'R':
882 {
883 PyObject *obj = va_arg(count, PyObject *);
884 PyObject *repr;
885 assert(obj);
886 repr = PyObject_Repr(obj);
887 if (!repr)
888 goto fail;
889 n += PyUnicode_GET_SIZE(repr);
890 /* Remember the repr and switch to the next slot */
891 *callresult++ = repr;
892 break;
893 }
894 case 'A':
895 {
896 PyObject *obj = va_arg(count, PyObject *);
897 PyObject *ascii;
898 assert(obj);
899 ascii = PyObject_ASCII(obj);
900 if (!ascii)
901 goto fail;
902 n += PyUnicode_GET_SIZE(ascii);
903 /* Remember the repr and switch to the next slot */
904 *callresult++ = ascii;
905 break;
906 }
907 case 'p':
908 (void) va_arg(count, int);
909 /* maximum 64-bit pointer representation:
910 * 0xffffffffffffffff
911 * so 19 characters is enough.
912 * XXX I count 18 -- what's the extra for?
913 */
914 n += 19;
915 break;
916 default:
917 /* if we stumble upon an unknown
918 formatting code, copy the rest of
919 the format string to the output
920 string. (we cannot just skip the
921 code, since there's no way to know
922 what's in the argument list) */
923 n += strlen(p);
924 goto expand;
925 }
926 } else
927 n++;
928 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000929 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000930 if (abuffersize > ITEM_BUFFER_LEN) {
931 /* add 1 for sprintf's trailing null byte */
932 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000933 if (!abuffer) {
934 PyErr_NoMemory();
935 goto fail;
936 }
937 realbuffer = abuffer;
938 }
939 else
940 realbuffer = buffer;
941 /* step 4: fill the buffer */
942 /* Since we've analyzed how much space we need for the worst case,
943 we don't have to resize the string.
944 There can be no errors beyond this point. */
945 string = PyUnicode_FromUnicode(NULL, n);
946 if (!string)
947 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000948
Benjamin Peterson14339b62009-01-31 16:36:08 +0000949 s = PyUnicode_AS_UNICODE(string);
950 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000951
Benjamin Peterson14339b62009-01-31 16:36:08 +0000952 for (f = format; *f; f++) {
953 if (*f == '%') {
954 const char* p = f++;
955 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000956 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000957 int size_tflag = 0;
958 zeropad = (*f == '0');
959 /* parse the width.precision part */
960 width = 0;
961 while (ISDIGIT((unsigned)*f))
962 width = (width*10) + *f++ - '0';
963 precision = 0;
964 if (*f == '.') {
965 f++;
966 while (ISDIGIT((unsigned)*f))
967 precision = (precision*10) + *f++ - '0';
968 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000969 /* Handle %ld, %lu, %lld and %llu. */
970 if (*f == 'l') {
971 if (f[1] == 'd' || f[1] == 'u') {
972 longflag = 1;
973 ++f;
974 }
975#ifdef HAVE_LONG_LONG
976 else if (f[1] == 'l' &&
977 (f[2] == 'd' || f[2] == 'u')) {
978 longlongflag = 1;
979 f += 2;
980 }
981#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 }
983 /* handle the size_t flag. */
984 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
985 size_tflag = 1;
986 ++f;
987 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000988
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 switch (*f) {
990 case 'c':
991 *s++ = va_arg(vargs, int);
992 break;
993 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000994 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
995 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 if (longflag)
997 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000998#ifdef HAVE_LONG_LONG
999 else if (longlongflag)
1000 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1001#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 else if (size_tflag)
1003 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1004 else
1005 sprintf(realbuffer, fmt, va_arg(vargs, int));
1006 appendstring(realbuffer);
1007 break;
1008 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001009 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1010 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001011 if (longflag)
1012 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001013#ifdef HAVE_LONG_LONG
1014 else if (longlongflag)
1015 sprintf(realbuffer, fmt, va_arg(vargs,
1016 unsigned PY_LONG_LONG));
1017#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 else if (size_tflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1020 else
1021 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1022 appendstring(realbuffer);
1023 break;
1024 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001025 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001026 sprintf(realbuffer, fmt, va_arg(vargs, int));
1027 appendstring(realbuffer);
1028 break;
1029 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001030 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001031 sprintf(realbuffer, fmt, va_arg(vargs, int));
1032 appendstring(realbuffer);
1033 break;
1034 case 's':
1035 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001036 /* unused, since we already have the result */
1037 (void) va_arg(vargs, char *);
1038 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1039 PyUnicode_GET_SIZE(*callresult));
1040 s += PyUnicode_GET_SIZE(*callresult);
1041 /* We're done with the unicode()/repr() => forget it */
1042 Py_DECREF(*callresult);
1043 /* switch to next unicode()/repr() result */
1044 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
1046 }
1047 case 'U':
1048 {
1049 PyObject *obj = va_arg(vargs, PyObject *);
1050 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1051 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1052 s += size;
1053 break;
1054 }
1055 case 'V':
1056 {
1057 PyObject *obj = va_arg(vargs, PyObject *);
1058 const char *str = va_arg(vargs, const char *);
1059 if (obj) {
1060 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1061 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1062 s += size;
1063 } else {
1064 appendstring(str);
1065 }
1066 break;
1067 }
1068 case 'S':
1069 case 'R':
1070 {
1071 Py_UNICODE *ucopy;
1072 Py_ssize_t usize;
1073 Py_ssize_t upos;
1074 /* unused, since we already have the result */
1075 (void) va_arg(vargs, PyObject *);
1076 ucopy = PyUnicode_AS_UNICODE(*callresult);
1077 usize = PyUnicode_GET_SIZE(*callresult);
1078 for (upos = 0; upos<usize;)
1079 *s++ = ucopy[upos++];
1080 /* We're done with the unicode()/repr() => forget it */
1081 Py_DECREF(*callresult);
1082 /* switch to next unicode()/repr() result */
1083 ++callresult;
1084 break;
1085 }
1086 case 'p':
1087 sprintf(buffer, "%p", va_arg(vargs, void*));
1088 /* %p is ill-defined: ensure leading 0x. */
1089 if (buffer[1] == 'X')
1090 buffer[1] = 'x';
1091 else if (buffer[1] != 'x') {
1092 memmove(buffer+2, buffer, strlen(buffer)+1);
1093 buffer[0] = '0';
1094 buffer[1] = 'x';
1095 }
1096 appendstring(buffer);
1097 break;
1098 case '%':
1099 *s++ = '%';
1100 break;
1101 default:
1102 appendstring(p);
1103 goto end;
1104 }
Victor Stinner1205f272010-09-11 00:54:47 +00001105 }
1106 else if (128 <= (unsigned char)*f) {
1107 PyErr_Format(PyExc_ValueError,
1108 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1109 "string, got a non-ascii byte: 0x%02x",
1110 (unsigned char)*f);
1111 goto fail;
1112 }
1113 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001114 *s++ = *f;
1115 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 if (callresults)
1119 PyObject_Free(callresults);
1120 if (abuffer)
1121 PyObject_Free(abuffer);
1122 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1123 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 if (callresults) {
1126 PyObject **callresult2 = callresults;
1127 while (callresult2 < callresult) {
1128 Py_DECREF(*callresult2);
1129 ++callresult2;
1130 }
1131 PyObject_Free(callresults);
1132 }
1133 if (abuffer)
1134 PyObject_Free(abuffer);
1135 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001136}
1137
1138#undef appendstring
1139
1140PyObject *
1141PyUnicode_FromFormat(const char *format, ...)
1142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 PyObject* ret;
1144 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001145
1146#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001147 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001150#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 ret = PyUnicode_FromFormatV(format, vargs);
1152 va_end(vargs);
1153 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001154}
1155
Martin v. Löwis18e16552006-02-15 17:27:45 +00001156Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001157 wchar_t *w,
1158 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159{
1160 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001161 PyErr_BadInternalCall();
1162 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001164
1165 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001167 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001168
Daniel Stutzbach8515eae2010-08-24 21:57:33 +00001169#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 memcpy(w, unicode->str, size * sizeof(wchar_t));
1171#else
1172 {
Benjamin Peterson29060642009-01-31 22:14:21 +00001173 register Py_UNICODE *u;
1174 register Py_ssize_t i;
1175 u = PyUnicode_AS_UNICODE(unicode);
1176 for (i = size; i > 0; i--)
1177 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 }
1179#endif
1180
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001181 if (size > PyUnicode_GET_SIZE(unicode))
1182 return PyUnicode_GET_SIZE(unicode);
1183 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185}
1186
1187#endif
1188
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001189PyObject *PyUnicode_FromOrdinal(int ordinal)
1190{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001191 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001192
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001193 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001194 PyErr_SetString(PyExc_ValueError,
1195 "chr() arg not in range(0x110000)");
1196 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001197 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001198
1199#ifndef Py_UNICODE_WIDE
1200 if (ordinal > 0xffff) {
1201 ordinal -= 0x10000;
1202 s[0] = 0xD800 | (ordinal >> 10);
1203 s[1] = 0xDC00 | (ordinal & 0x3FF);
1204 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001205 }
1206#endif
1207
Hye-Shik Chang40574832004-04-06 07:24:51 +00001208 s[0] = (Py_UNICODE)ordinal;
1209 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001210}
1211
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212PyObject *PyUnicode_FromObject(register PyObject *obj)
1213{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001214 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001215 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001216 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001217 Py_INCREF(obj);
1218 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001219 }
1220 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001221 /* For a Unicode subtype that's not a Unicode object,
1222 return a true Unicode object with the same data. */
1223 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1224 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001225 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001226 PyErr_Format(PyExc_TypeError,
1227 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001228 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001229 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001230}
1231
1232PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001233 const char *encoding,
1234 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001235{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001236 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001237 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001238
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001240 PyErr_BadInternalCall();
1241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001243
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001244 /* Decoding bytes objects is the most common case and should be fast */
1245 if (PyBytes_Check(obj)) {
1246 if (PyBytes_GET_SIZE(obj) == 0) {
1247 Py_INCREF(unicode_empty);
1248 v = (PyObject *) unicode_empty;
1249 }
1250 else {
1251 v = PyUnicode_Decode(
1252 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1253 encoding, errors);
1254 }
1255 return v;
1256 }
1257
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001258 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001259 PyErr_SetString(PyExc_TypeError,
1260 "decoding str is not supported");
1261 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001262 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001263
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001264 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1265 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1266 PyErr_Format(PyExc_TypeError,
1267 "coercing to str: need bytes, bytearray "
1268 "or buffer-like object, %.80s found",
1269 Py_TYPE(obj)->tp_name);
1270 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001271 }
Tim Petersced69f82003-09-16 20:30:58 +00001272
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001273 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001275 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 }
Tim Petersced69f82003-09-16 20:30:58 +00001277 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001278 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001279
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001280 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001281 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282}
1283
/* Write a normalized copy of `encoding` into `lower`: upper-case letters
   are lowered and '_' becomes '-', so that e.g. "UTF_8" is recognised as
   "utf-8".  `lower_len` is the capacity of `lower` including the
   terminating NUL.

   Returns 1 on success (`lower` is NUL-terminated), or 0 if `encoding`
   is longer than lower_len-1 characters (`lower` is then left without a
   terminator). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot, reserved for the terminating NUL */
    char *limit = lower + lower_len - 1;

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (ISUPPER(*src))
            ch = TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1316
1317PyObject *PyUnicode_Decode(const char *s,
1318 Py_ssize_t size,
1319 const char *encoding,
1320 const char *errors)
1321{
1322 PyObject *buffer = NULL, *unicode;
1323 Py_buffer info;
1324 char lower[11]; /* Enough for any encoding shortcut */
1325
1326 if (encoding == NULL)
1327 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001328
1329 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001330 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1331 if (strcmp(lower, "utf-8") == 0)
1332 return PyUnicode_DecodeUTF8(s, size, errors);
1333 else if ((strcmp(lower, "latin-1") == 0) ||
1334 (strcmp(lower, "iso-8859-1") == 0))
1335 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001336#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001337 else if (strcmp(lower, "mbcs") == 0)
1338 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001339#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001340 else if (strcmp(lower, "ascii") == 0)
1341 return PyUnicode_DecodeASCII(s, size, errors);
1342 else if (strcmp(lower, "utf-16") == 0)
1343 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1344 else if (strcmp(lower, "utf-32") == 0)
1345 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001347
1348 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001349 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001350 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001351 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001352 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353 if (buffer == NULL)
1354 goto onError;
1355 unicode = PyCodec_Decode(buffer, encoding, errors);
1356 if (unicode == NULL)
1357 goto onError;
1358 if (!PyUnicode_Check(unicode)) {
1359 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001360 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001361 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 Py_DECREF(unicode);
1363 goto onError;
1364 }
1365 Py_DECREF(buffer);
1366 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001367
Benjamin Peterson29060642009-01-31 22:14:21 +00001368 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369 Py_XDECREF(buffer);
1370 return NULL;
1371}
1372
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001373PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1374 const char *encoding,
1375 const char *errors)
1376{
1377 PyObject *v;
1378
1379 if (!PyUnicode_Check(unicode)) {
1380 PyErr_BadArgument();
1381 goto onError;
1382 }
1383
1384 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001385 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001386
1387 /* Decode via the codec registry */
1388 v = PyCodec_Decode(unicode, encoding, errors);
1389 if (v == NULL)
1390 goto onError;
1391 return v;
1392
Benjamin Peterson29060642009-01-31 22:14:21 +00001393 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001394 return NULL;
1395}
1396
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001397PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1398 const char *encoding,
1399 const char *errors)
1400{
1401 PyObject *v;
1402
1403 if (!PyUnicode_Check(unicode)) {
1404 PyErr_BadArgument();
1405 goto onError;
1406 }
1407
1408 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001409 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001410
1411 /* Decode via the codec registry */
1412 v = PyCodec_Decode(unicode, encoding, errors);
1413 if (v == NULL)
1414 goto onError;
1415 if (!PyUnicode_Check(v)) {
1416 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001417 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001418 Py_TYPE(v)->tp_name);
1419 Py_DECREF(v);
1420 goto onError;
1421 }
1422 return v;
1423
Benjamin Peterson29060642009-01-31 22:14:21 +00001424 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001425 return NULL;
1426}
1427
Guido van Rossumd57fd912000-03-10 22:53:23 +00001428PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 Py_ssize_t size,
1430 const char *encoding,
1431 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432{
1433 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001434
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 unicode = PyUnicode_FromUnicode(s, size);
1436 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001437 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1439 Py_DECREF(unicode);
1440 return v;
1441}
1442
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001443PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1444 const char *encoding,
1445 const char *errors)
1446{
1447 PyObject *v;
1448
1449 if (!PyUnicode_Check(unicode)) {
1450 PyErr_BadArgument();
1451 goto onError;
1452 }
1453
1454 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001455 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001456
1457 /* Encode via the codec registry */
1458 v = PyCodec_Encode(unicode, encoding, errors);
1459 if (v == NULL)
1460 goto onError;
1461 return v;
1462
Benjamin Peterson29060642009-01-31 22:14:21 +00001463 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001464 return NULL;
1465}
1466
Victor Stinnerae6265f2010-05-15 16:27:27 +00001467PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1468{
Victor Stinner313a1202010-06-11 23:56:51 +00001469 if (Py_FileSystemDefaultEncoding) {
1470#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1471 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1472 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1473 PyUnicode_GET_SIZE(unicode),
1474 NULL);
1475#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001476 return PyUnicode_AsEncodedString(unicode,
1477 Py_FileSystemDefaultEncoding,
1478 "surrogateescape");
Victor Stinner313a1202010-06-11 23:56:51 +00001479 } else
Victor Stinnerae6265f2010-05-15 16:27:27 +00001480 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Victor Stinner3119ed72010-08-18 22:26:50 +00001481 PyUnicode_GET_SIZE(unicode),
1482 "surrogateescape");
Victor Stinnerae6265f2010-05-15 16:27:27 +00001483}
1484
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1486 const char *encoding,
1487 const char *errors)
1488{
1489 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001490 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001491
Guido van Rossumd57fd912000-03-10 22:53:23 +00001492 if (!PyUnicode_Check(unicode)) {
1493 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495 }
Fred Drakee4315f52000-05-09 19:53:39 +00001496
Tim Petersced69f82003-09-16 20:30:58 +00001497 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001498 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001499
1500 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001501 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1502 if (strcmp(lower, "utf-8") == 0)
1503 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1504 PyUnicode_GET_SIZE(unicode),
1505 errors);
1506 else if ((strcmp(lower, "latin-1") == 0) ||
1507 (strcmp(lower, "iso-8859-1") == 0))
1508 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1509 PyUnicode_GET_SIZE(unicode),
1510 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001511#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001512 else if (strcmp(lower, "mbcs") == 0)
1513 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1514 PyUnicode_GET_SIZE(unicode),
1515 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001516#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001517 else if (strcmp(lower, "ascii") == 0)
1518 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1519 PyUnicode_GET_SIZE(unicode),
1520 errors);
1521 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001522 /* During bootstrap, we may need to find the encodings
1523 package, to load the file system encoding, and require the
1524 file system encoding in order to load the encodings
1525 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001526
Victor Stinner59e62db2010-05-15 13:14:32 +00001527 Break out of this dependency by assuming that the path to
1528 the encodings module is ASCII-only. XXX could try wcstombs
1529 instead, if the file system encoding is the locale's
1530 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001531 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001532 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1533 !PyThreadState_GET()->interp->codecs_initialized)
1534 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1535 PyUnicode_GET_SIZE(unicode),
1536 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001537
1538 /* Encode via the codec registry */
1539 v = PyCodec_Encode(unicode, encoding, errors);
1540 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001541 return NULL;
1542
1543 /* The normal path */
1544 if (PyBytes_Check(v))
1545 return v;
1546
1547 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001548 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001549 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001550 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001551
1552 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1553 "encoder %s returned bytearray instead of bytes",
1554 encoding);
1555 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001556 Py_DECREF(v);
1557 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001558 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001559
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001560 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1561 Py_DECREF(v);
1562 return b;
1563 }
1564
1565 PyErr_Format(PyExc_TypeError,
1566 "encoder did not return a bytes object (type=%.400s)",
1567 Py_TYPE(v)->tp_name);
1568 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001569 return NULL;
1570}
1571
1572PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1573 const char *encoding,
1574 const char *errors)
1575{
1576 PyObject *v;
1577
1578 if (!PyUnicode_Check(unicode)) {
1579 PyErr_BadArgument();
1580 goto onError;
1581 }
1582
1583 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001585
1586 /* Encode via the codec registry */
1587 v = PyCodec_Encode(unicode, encoding, errors);
1588 if (v == NULL)
1589 goto onError;
1590 if (!PyUnicode_Check(v)) {
1591 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001592 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001593 Py_TYPE(v)->tp_name);
1594 Py_DECREF(v);
1595 goto onError;
1596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001598
Benjamin Peterson29060642009-01-31 22:14:21 +00001599 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600 return NULL;
1601}
1602
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001603PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001604 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001605{
1606 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001607 if (v)
1608 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001609 if (errors != NULL)
1610 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001611 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001612 PyUnicode_GET_SIZE(unicode),
1613 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001614 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001615 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001616 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001617 return v;
1618}
1619
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001620PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001621PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001622 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001623 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1624}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001625
Christian Heimes5894ba72007-11-04 11:43:14 +00001626PyObject*
1627PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1628{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001629 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1630 can be undefined. If it is case, decode using UTF-8. The following assumes
1631 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1632 bootstrapping process where the codecs aren't ready yet.
1633 */
1634 if (Py_FileSystemDefaultEncoding) {
1635#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001636 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001637 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001638 }
1639#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001640 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001641 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001642 }
1643#endif
1644 return PyUnicode_Decode(s, size,
1645 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001646 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001647 }
1648 else {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001649 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001650 }
1651}
1652
Martin v. Löwis011e8422009-05-05 04:43:17 +00001653
1654int
1655PyUnicode_FSConverter(PyObject* arg, void* addr)
1656{
1657 PyObject *output = NULL;
1658 Py_ssize_t size;
1659 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001660 if (arg == NULL) {
1661 Py_DECREF(*(PyObject**)addr);
1662 return 1;
1663 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001664 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001665 output = arg;
1666 Py_INCREF(output);
1667 }
1668 else {
1669 arg = PyUnicode_FromObject(arg);
1670 if (!arg)
1671 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001672 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001673 Py_DECREF(arg);
1674 if (!output)
1675 return 0;
1676 if (!PyBytes_Check(output)) {
1677 Py_DECREF(output);
1678 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1679 return 0;
1680 }
1681 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001682 size = PyBytes_GET_SIZE(output);
1683 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001684 if (size != strlen(data)) {
1685 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1686 Py_DECREF(output);
1687 return 0;
1688 }
1689 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001690 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001691}
1692
1693
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001694int
1695PyUnicode_FSDecoder(PyObject* arg, void* addr)
1696{
1697 PyObject *output = NULL;
1698 Py_ssize_t size;
1699 void *data;
1700 if (arg == NULL) {
1701 Py_DECREF(*(PyObject**)addr);
1702 return 1;
1703 }
1704 if (PyUnicode_Check(arg)) {
1705 output = arg;
1706 Py_INCREF(output);
1707 }
1708 else {
1709 arg = PyBytes_FromObject(arg);
1710 if (!arg)
1711 return 0;
1712 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1713 PyBytes_GET_SIZE(arg));
1714 Py_DECREF(arg);
1715 if (!output)
1716 return 0;
1717 if (!PyUnicode_Check(output)) {
1718 Py_DECREF(output);
1719 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1720 return 0;
1721 }
1722 }
1723 size = PyUnicode_GET_SIZE(output);
1724 data = PyUnicode_AS_UNICODE(output);
1725 if (size != Py_UNICODE_strlen(data)) {
1726 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1727 Py_DECREF(output);
1728 return 0;
1729 }
1730 *(PyObject**)addr = output;
1731 return Py_CLEANUP_SUPPORTED;
1732}
1733
1734
Martin v. Löwis5b222132007-06-10 09:51:05 +00001735char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001736_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001737{
Christian Heimesf3863112007-11-22 07:46:41 +00001738 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001739 if (!PyUnicode_Check(unicode)) {
1740 PyErr_BadArgument();
1741 return NULL;
1742 }
Christian Heimesf3863112007-11-22 07:46:41 +00001743 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1744 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001745 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001746 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001747 *psize = PyBytes_GET_SIZE(bytes);
1748 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001749}
1750
1751char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001752_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001753{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001754 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001755}
1756
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1758{
1759 if (!PyUnicode_Check(unicode)) {
1760 PyErr_BadArgument();
1761 goto onError;
1762 }
1763 return PyUnicode_AS_UNICODE(unicode);
1764
Benjamin Peterson29060642009-01-31 22:14:21 +00001765 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 return NULL;
1767}
1768
Martin v. Löwis18e16552006-02-15 17:27:45 +00001769Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770{
1771 if (!PyUnicode_Check(unicode)) {
1772 PyErr_BadArgument();
1773 goto onError;
1774 }
1775 return PyUnicode_GET_SIZE(unicode);
1776
Benjamin Peterson29060642009-01-31 22:14:21 +00001777 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 return -1;
1779}
1780
/* Name of the default encoding; this is always the constant "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *name = "utf-8";

    return name;
}
1785
Victor Stinner554f3f02010-06-16 23:33:54 +00001786/* create or adjust a UnicodeDecodeError */
1787static void
1788make_decode_exception(PyObject **exceptionObject,
1789 const char *encoding,
1790 const char *input, Py_ssize_t length,
1791 Py_ssize_t startpos, Py_ssize_t endpos,
1792 const char *reason)
1793{
1794 if (*exceptionObject == NULL) {
1795 *exceptionObject = PyUnicodeDecodeError_Create(
1796 encoding, input, length, startpos, endpos, reason);
1797 }
1798 else {
1799 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1800 goto onError;
1801 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1802 goto onError;
1803 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1804 goto onError;
1805 }
1806 return;
1807
1808onError:
1809 Py_DECREF(*exceptionObject);
1810 *exceptionObject = NULL;
1811}
1812
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001813/* error handling callback helper:
1814 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001815 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001816 and adjust various state variables.
1817 return 0 on success, -1 on error
1818*/
1819
1820static
1821int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001822 const char *encoding, const char *reason,
1823 const char **input, const char **inend, Py_ssize_t *startinpos,
1824 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1825 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001827 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001828
1829 PyObject *restuple = NULL;
1830 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001831 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001832 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001833 Py_ssize_t requiredsize;
1834 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001836 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001837 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001838 int res = -1;
1839
1840 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001841 *errorHandler = PyCodec_LookupError(errors);
1842 if (*errorHandler == NULL)
1843 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 }
1845
Victor Stinner554f3f02010-06-16 23:33:54 +00001846 make_decode_exception(exceptionObject,
1847 encoding,
1848 *input, *inend - *input,
1849 *startinpos, *endinpos,
1850 reason);
1851 if (*exceptionObject == NULL)
1852 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001853
1854 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1855 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001856 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001857 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001858 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001859 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001860 }
1861 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001862 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001863
1864 /* Copy back the bytes variables, which might have been modified by the
1865 callback */
1866 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1867 if (!inputobj)
1868 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001869 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001870 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001871 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001872 *input = PyBytes_AS_STRING(inputobj);
1873 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001874 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001875 /* we can DECREF safely, as the exception has another reference,
1876 so the object won't go away. */
1877 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001878
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001879 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001880 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001881 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001882 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1883 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001884 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001885
1886 /* need more space? (at least enough for what we
1887 have+the replacement+the rest of the string (starting
1888 at the new input position), so we won't have to check space
1889 when there are no errors in the rest of the string) */
1890 repptr = PyUnicode_AS_UNICODE(repunicode);
1891 repsize = PyUnicode_GET_SIZE(repunicode);
1892 requiredsize = *outpos + repsize + insize-newpos;
1893 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001894 if (requiredsize<2*outsize)
1895 requiredsize = 2*outsize;
1896 if (_PyUnicode_Resize(output, requiredsize) < 0)
1897 goto onError;
1898 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001899 }
1900 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001901 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 Py_UNICODE_COPY(*outptr, repptr, repsize);
1903 *outptr += repsize;
1904 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001905
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001906 /* we made it! */
1907 res = 0;
1908
Benjamin Peterson29060642009-01-31 22:14:21 +00001909 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001910 Py_XDECREF(restuple);
1911 return res;
1912}
1913
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */
1917
/* Three small helper macros describing the base-64 alphabet.
   Note that each of them may evaluate its argument more than once. */

/* Non-zero when c belongs to the base-64 alphabet. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* 6-bit value of c; only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: a byte met outside a shift sequence that decodes as
 * itself.  Decoding is permissive; the only ASCII byte that does not
 * decode to itself is the '+' that opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
1948
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked before it is used as a table index (NUL is never
 * direct).  The flag arguments are parenthesized so that compound
 * expressions can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001994
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001995PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001996 Py_ssize_t size,
1997 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001998{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00001999 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2000}
2001
Antoine Pitrou244651a2009-05-04 18:56:13 +00002002/* The decoder. The only state we preserve is our read position,
2003 * i.e. how many characters we have consumed. So if we end in the
2004 * middle of a shift sequence we have to back off the read position
2005 * and the output to the beginning of the sequence, otherwise we lose
2006 * all the shift state (seen bits, number of bits seen, high
2007 * surrogate). */
2008
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002009PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002010 Py_ssize_t size,
2011 const char *errors,
2012 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002013{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002014 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002015 Py_ssize_t startinpos;
2016 Py_ssize_t endinpos;
2017 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002018 const char *e;
2019 PyUnicodeObject *unicode;
2020 Py_UNICODE *p;
2021 const char *errmsg = "";
2022 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002023 Py_UNICODE *shiftOutStart;
2024 unsigned int base64bits = 0;
2025 unsigned long base64buffer = 0;
2026 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002027 PyObject *errorHandler = NULL;
2028 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002029
2030 unicode = _PyUnicode_New(size);
2031 if (!unicode)
2032 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002033 if (size == 0) {
2034 if (consumed)
2035 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002036 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002037 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002038
2039 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002040 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002041 e = s + size;
2042
2043 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002046 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002047
Antoine Pitrou244651a2009-05-04 18:56:13 +00002048 if (inShift) { /* in a base-64 section */
2049 if (IS_BASE64(ch)) { /* consume a base-64 character */
2050 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2051 base64bits += 6;
2052 s++;
2053 if (base64bits >= 16) {
2054 /* we have enough bits for a UTF-16 value */
2055 Py_UNICODE outCh = (Py_UNICODE)
2056 (base64buffer >> (base64bits-16));
2057 base64bits -= 16;
2058 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2059 if (surrogate) {
2060 /* expecting a second surrogate */
2061 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2062#ifdef Py_UNICODE_WIDE
2063 *p++ = (((surrogate & 0x3FF)<<10)
2064 | (outCh & 0x3FF)) + 0x10000;
2065#else
2066 *p++ = surrogate;
2067 *p++ = outCh;
2068#endif
2069 surrogate = 0;
2070 }
2071 else {
2072 surrogate = 0;
2073 errmsg = "second surrogate missing";
2074 goto utf7Error;
2075 }
2076 }
2077 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2078 /* first surrogate */
2079 surrogate = outCh;
2080 }
2081 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2082 errmsg = "unexpected second surrogate";
2083 goto utf7Error;
2084 }
2085 else {
2086 *p++ = outCh;
2087 }
2088 }
2089 }
2090 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002091 inShift = 0;
2092 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002093 if (surrogate) {
2094 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002095 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002096 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002097 if (base64bits > 0) { /* left-over bits */
2098 if (base64bits >= 6) {
2099 /* We've seen at least one base-64 character */
2100 errmsg = "partial character in shift sequence";
2101 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002102 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002103 else {
2104 /* Some bits remain; they should be zero */
2105 if (base64buffer != 0) {
2106 errmsg = "non-zero padding bits in shift sequence";
2107 goto utf7Error;
2108 }
2109 }
2110 }
2111 if (ch != '-') {
2112 /* '-' is absorbed; other terminating
2113 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002114 *p++ = ch;
2115 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002116 }
2117 }
2118 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002120 s++; /* consume '+' */
2121 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002122 s++;
2123 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002124 }
2125 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002126 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002127 shiftOutStart = p;
2128 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002129 }
2130 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002131 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002132 *p++ = ch;
2133 s++;
2134 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002135 else {
2136 startinpos = s-starts;
2137 s++;
2138 errmsg = "unexpected special character";
2139 goto utf7Error;
2140 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002141 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002142utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002143 outpos = p-PyUnicode_AS_UNICODE(unicode);
2144 endinpos = s-starts;
2145 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002146 errors, &errorHandler,
2147 "utf7", errmsg,
2148 &starts, &e, &startinpos, &endinpos, &exc, &s,
2149 &unicode, &outpos, &p))
2150 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002151 }
2152
Antoine Pitrou244651a2009-05-04 18:56:13 +00002153 /* end of string */
2154
2155 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2156 /* if we're in an inconsistent state, that's an error */
2157 if (surrogate ||
2158 (base64bits >= 6) ||
2159 (base64bits > 0 && base64buffer != 0)) {
2160 outpos = p-PyUnicode_AS_UNICODE(unicode);
2161 endinpos = size;
2162 if (unicode_decode_call_errorhandler(
2163 errors, &errorHandler,
2164 "utf7", "unterminated shift sequence",
2165 &starts, &e, &startinpos, &endinpos, &exc, &s,
2166 &unicode, &outpos, &p))
2167 goto onError;
2168 if (s < e)
2169 goto restart;
2170 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002171 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002172
2173 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002174 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002175 if (inShift) {
2176 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002177 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002178 }
2179 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002180 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002181 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002182 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002183
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002184 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002185 goto onError;
2186
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 Py_XDECREF(errorHandler);
2188 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002189 return (PyObject *)unicode;
2190
Benjamin Peterson29060642009-01-31 22:14:21 +00002191 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002192 Py_XDECREF(errorHandler);
2193 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002194 Py_DECREF(unicode);
2195 return NULL;
2196}
2197
2198
2199PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002200 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002201 int base64SetO,
2202 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002203 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002204{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002205 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002206 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002207 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002208 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002209 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002210 unsigned int base64bits = 0;
2211 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002212 char * out;
2213 char * start;
2214
2215 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002216 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002217
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002218 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002219 return PyErr_NoMemory();
2220
Antoine Pitrou244651a2009-05-04 18:56:13 +00002221 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002222 if (v == NULL)
2223 return NULL;
2224
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002225 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226 for (;i < size; ++i) {
2227 Py_UNICODE ch = s[i];
2228
Antoine Pitrou244651a2009-05-04 18:56:13 +00002229 if (inShift) {
2230 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2231 /* shifting out */
2232 if (base64bits) { /* output remaining bits */
2233 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2234 base64buffer = 0;
2235 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002236 }
2237 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002238 /* Characters not in the BASE64 set implicitly unshift the sequence
2239 so no '-' is required, except if the character is itself a '-' */
2240 if (IS_BASE64(ch) || ch == '-') {
2241 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002242 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002243 *out++ = (char) ch;
2244 }
2245 else {
2246 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002247 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002248 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002249 else { /* not in a shift sequence */
2250 if (ch == '+') {
2251 *out++ = '+';
2252 *out++ = '-';
2253 }
2254 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2255 *out++ = (char) ch;
2256 }
2257 else {
2258 *out++ = '+';
2259 inShift = 1;
2260 goto encode_char;
2261 }
2262 }
2263 continue;
2264encode_char:
2265#ifdef Py_UNICODE_WIDE
2266 if (ch >= 0x10000) {
2267 /* code first surrogate */
2268 base64bits += 16;
2269 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2270 while (base64bits >= 6) {
2271 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2272 base64bits -= 6;
2273 }
2274 /* prepare second surrogate */
2275 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2276 }
2277#endif
2278 base64bits += 16;
2279 base64buffer = (base64buffer << 16) | ch;
2280 while (base64bits >= 6) {
2281 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2282 base64bits -= 6;
2283 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002284 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002285 if (base64bits)
2286 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2287 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002288 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002289 if (_PyBytes_Resize(&v, out - start) < 0)
2290 return NULL;
2291 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002292}
2293
Antoine Pitrou244651a2009-05-04 18:56:13 +00002294#undef IS_BASE64
2295#undef FROM_BASE64
2296#undef TO_BASE64
2297#undef DECODE_DIRECT
2298#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002299
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300/* --- UTF-8 Codec -------------------------------------------------------- */
2301
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero means the byte is not a legal lead byte: continuation bytes
   (80-BF), the overlong leads C0-C1, and F5-FF.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2323
Guido van Rossumd57fd912000-03-10 22:53:23 +00002324PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002325 Py_ssize_t size,
2326 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327{
Walter Dörwald69652032004-09-07 20:24:22 +00002328 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2329}
2330
Antoine Pitrouab868312009-01-10 15:40:25 +00002331/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2332#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2333
2334/* Mask to quickly check whether a C 'long' contains a
2335 non-ASCII, UTF8-encoded char. */
2336#if (SIZEOF_LONG == 8)
2337# define ASCII_CHAR_MASK 0x8080808080808080L
2338#elif (SIZEOF_LONG == 4)
2339# define ASCII_CHAR_MASK 0x80808080L
2340#else
2341# error C 'long' size should be either 4 or 8!
2342#endif
2343
Walter Dörwald69652032004-09-07 20:24:22 +00002344PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002345 Py_ssize_t size,
2346 const char *errors,
2347 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002348{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002349 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002351 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002352 Py_ssize_t startinpos;
2353 Py_ssize_t endinpos;
2354 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002355 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 PyUnicodeObject *unicode;
2357 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002358 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002359 PyObject *errorHandler = NULL;
2360 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361
2362 /* Note: size will always be longer than the resulting Unicode
2363 character count */
2364 unicode = _PyUnicode_New(size);
2365 if (!unicode)
2366 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002367 if (size == 0) {
2368 if (consumed)
2369 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002370 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372
2373 /* Unpack UTF-8 encoded data */
2374 p = unicode->str;
2375 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002376 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002377
2378 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002379 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002380
2381 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002382 /* Fast path for runs of ASCII characters. Given that common UTF-8
2383 input will consist of an overwhelming majority of ASCII
2384 characters, we try to optimize for this case by checking
2385 as many characters as a C 'long' can contain.
2386 First, check if we can do an aligned read, as most CPUs have
2387 a penalty for unaligned reads.
2388 */
2389 if (!((size_t) s & LONG_PTR_MASK)) {
2390 /* Help register allocation */
2391 register const char *_s = s;
2392 register Py_UNICODE *_p = p;
2393 while (_s < aligned_end) {
2394 /* Read a whole long at a time (either 4 or 8 bytes),
2395 and do a fast unrolled copy if it only contains ASCII
2396 characters. */
2397 unsigned long data = *(unsigned long *) _s;
2398 if (data & ASCII_CHAR_MASK)
2399 break;
2400 _p[0] = (unsigned char) _s[0];
2401 _p[1] = (unsigned char) _s[1];
2402 _p[2] = (unsigned char) _s[2];
2403 _p[3] = (unsigned char) _s[3];
2404#if (SIZEOF_LONG == 8)
2405 _p[4] = (unsigned char) _s[4];
2406 _p[5] = (unsigned char) _s[5];
2407 _p[6] = (unsigned char) _s[6];
2408 _p[7] = (unsigned char) _s[7];
2409#endif
2410 _s += SIZEOF_LONG;
2411 _p += SIZEOF_LONG;
2412 }
2413 s = _s;
2414 p = _p;
2415 if (s == e)
2416 break;
2417 ch = (unsigned char)*s;
2418 }
2419 }
2420
2421 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002422 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 s++;
2424 continue;
2425 }
2426
2427 n = utf8_code_length[ch];
2428
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002429 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002430 if (consumed)
2431 break;
2432 else {
2433 errmsg = "unexpected end of data";
2434 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002435 endinpos = startinpos+1;
2436 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2437 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002438 goto utf8Error;
2439 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441
2442 switch (n) {
2443
2444 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002445 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002446 startinpos = s-starts;
2447 endinpos = startinpos+1;
2448 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002449
2450 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002451 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002452 startinpos = s-starts;
2453 endinpos = startinpos+1;
2454 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002455
2456 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002457 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002458 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002459 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002460 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002461 goto utf8Error;
2462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002464 assert ((ch > 0x007F) && (ch <= 0x07FF));
2465 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 break;
2467
2468 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002469 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2470 will result in surrogates in range d800-dfff. Surrogates are
2471 not valid UTF-8 so they are rejected.
2472 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2473 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002474 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002475 (s[2] & 0xc0) != 0x80 ||
2476 ((unsigned char)s[0] == 0xE0 &&
2477 (unsigned char)s[1] < 0xA0) ||
2478 ((unsigned char)s[0] == 0xED &&
2479 (unsigned char)s[1] > 0x9F)) {
2480 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002481 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002482 endinpos = startinpos + 1;
2483
2484 /* if s[1] first two bits are 1 and 0, then the invalid
2485 continuation byte is s[2], so increment endinpos by 1,
2486 if not, s[1] is invalid and endinpos doesn't need to
2487 be incremented. */
2488 if ((s[1] & 0xC0) == 0x80)
2489 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002490 goto utf8Error;
2491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002493 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2494 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002495 break;
2496
2497 case 4:
2498 if ((s[1] & 0xc0) != 0x80 ||
2499 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002500 (s[3] & 0xc0) != 0x80 ||
2501 ((unsigned char)s[0] == 0xF0 &&
2502 (unsigned char)s[1] < 0x90) ||
2503 ((unsigned char)s[0] == 0xF4 &&
2504 (unsigned char)s[1] > 0x8F)) {
2505 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002506 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002507 endinpos = startinpos + 1;
2508 if ((s[1] & 0xC0) == 0x80) {
2509 endinpos++;
2510 if ((s[2] & 0xC0) == 0x80)
2511 endinpos++;
2512 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002513 goto utf8Error;
2514 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002515 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002516 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2517 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2518
Fredrik Lundh8f455852001-06-27 18:59:43 +00002519#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002520 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002521#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002522 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002523
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002524 /* translate from 10000..10FFFF to 0..FFFF */
2525 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002526
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002527 /* high surrogate = top 10 bits added to D800 */
2528 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002529
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002530 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002531 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002532#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002533 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 }
2535 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002536 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002537
Benjamin Peterson29060642009-01-31 22:14:21 +00002538 utf8Error:
2539 outpos = p-PyUnicode_AS_UNICODE(unicode);
2540 if (unicode_decode_call_errorhandler(
2541 errors, &errorHandler,
2542 "utf8", errmsg,
2543 &starts, &e, &startinpos, &endinpos, &exc, &s,
2544 &unicode, &outpos, &p))
2545 goto onError;
2546 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 }
Walter Dörwald69652032004-09-07 20:24:22 +00002548 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002549 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550
2551 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002552 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 goto onError;
2554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 Py_XDECREF(errorHandler);
2556 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 return (PyObject *)unicode;
2558
Benjamin Peterson29060642009-01-31 22:14:21 +00002559 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 Py_XDECREF(errorHandler);
2561 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 Py_DECREF(unicode);
2563 return NULL;
2564}
2565
Antoine Pitrouab868312009-01-10 15:40:25 +00002566#undef ASCII_CHAR_MASK
2567
2568
Tim Peters602f7402002-04-27 18:03:26 +00002569/* Allocation strategy: if the string is short, convert into a stack buffer
2570 and allocate exactly as much space needed at the end. Else allocate the
2571 maximum possible needed (4 result bytes per Unicode character), and return
2572 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002573*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002574PyObject *
2575PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002576 Py_ssize_t size,
2577 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578{
Tim Peters602f7402002-04-27 18:03:26 +00002579#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002580
Guido van Rossum98297ee2007-11-06 21:34:58 +00002581 Py_ssize_t i; /* index into s of next input byte */
2582 PyObject *result; /* result string object */
2583 char *p; /* next free byte in output buffer */
2584 Py_ssize_t nallocated; /* number of result bytes allocated */
2585 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002586 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002587 PyObject *errorHandler = NULL;
2588 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002589
Tim Peters602f7402002-04-27 18:03:26 +00002590 assert(s != NULL);
2591 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592
Tim Peters602f7402002-04-27 18:03:26 +00002593 if (size <= MAX_SHORT_UNICHARS) {
2594 /* Write into the stack buffer; nallocated can't overflow.
2595 * At the end, we'll allocate exactly as much heap space as it
2596 * turns out we need.
2597 */
2598 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002599 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002600 p = stackbuf;
2601 }
2602 else {
2603 /* Overallocate on the heap, and give the excess back at the end. */
2604 nallocated = size * 4;
2605 if (nallocated / 4 != size) /* overflow! */
2606 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002607 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002608 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002609 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002610 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002611 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002612
Tim Peters602f7402002-04-27 18:03:26 +00002613 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002614 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002615
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002616 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002617 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002619
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002621 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002622 *p++ = (char)(0xc0 | (ch >> 6));
2623 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002624 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002625#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002626 /* Special case: check for high and low surrogate */
2627 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2628 Py_UCS4 ch2 = s[i];
2629 /* Combine the two surrogates to form a UCS4 value */
2630 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2631 i++;
2632
2633 /* Encode UCS4 Unicode ordinals */
2634 *p++ = (char)(0xf0 | (ch >> 18));
2635 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002636 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2637 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002638 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002639#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002640 Py_ssize_t newpos;
2641 PyObject *rep;
2642 Py_ssize_t repsize, k;
2643 rep = unicode_encode_call_errorhandler
2644 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2645 s, size, &exc, i-1, i, &newpos);
2646 if (!rep)
2647 goto error;
2648
2649 if (PyBytes_Check(rep))
2650 repsize = PyBytes_GET_SIZE(rep);
2651 else
2652 repsize = PyUnicode_GET_SIZE(rep);
2653
2654 if (repsize > 4) {
2655 Py_ssize_t offset;
2656
2657 if (result == NULL)
2658 offset = p - stackbuf;
2659 else
2660 offset = p - PyBytes_AS_STRING(result);
2661
2662 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2663 /* integer overflow */
2664 PyErr_NoMemory();
2665 goto error;
2666 }
2667 nallocated += repsize - 4;
2668 if (result != NULL) {
2669 if (_PyBytes_Resize(&result, nallocated) < 0)
2670 goto error;
2671 } else {
2672 result = PyBytes_FromStringAndSize(NULL, nallocated);
2673 if (result == NULL)
2674 goto error;
2675 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2676 }
2677 p = PyBytes_AS_STRING(result) + offset;
2678 }
2679
2680 if (PyBytes_Check(rep)) {
2681 char *prep = PyBytes_AS_STRING(rep);
2682 for(k = repsize; k > 0; k--)
2683 *p++ = *prep++;
2684 } else /* rep is unicode */ {
2685 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2686 Py_UNICODE c;
2687
2688 for(k=0; k<repsize; k++) {
2689 c = prep[k];
2690 if (0x80 <= c) {
2691 raise_encode_exception(&exc, "utf-8", s, size,
2692 i-1, i, "surrogates not allowed");
2693 goto error;
2694 }
2695 *p++ = (char)prep[k];
2696 }
2697 }
2698 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002699#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002700 }
Victor Stinner445a6232010-04-22 20:01:57 +00002701#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002702 } else if (ch < 0x10000) {
2703 *p++ = (char)(0xe0 | (ch >> 12));
2704 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2705 *p++ = (char)(0x80 | (ch & 0x3f));
2706 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002707 /* Encode UCS4 Unicode ordinals */
2708 *p++ = (char)(0xf0 | (ch >> 18));
2709 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2710 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2711 *p++ = (char)(0x80 | (ch & 0x3f));
2712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002714
Guido van Rossum98297ee2007-11-06 21:34:58 +00002715 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002716 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002717 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002718 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002719 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002720 }
2721 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002722 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002723 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002724 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002725 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002726 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002727 Py_XDECREF(errorHandler);
2728 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002729 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002730 error:
2731 Py_XDECREF(errorHandler);
2732 Py_XDECREF(exc);
2733 Py_XDECREF(result);
2734 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002735
Tim Peters602f7402002-04-27 18:03:26 +00002736#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737}
2738
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2740{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 if (!PyUnicode_Check(unicode)) {
2742 PyErr_BadArgument();
2743 return NULL;
2744 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002745 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002746 PyUnicode_GET_SIZE(unicode),
2747 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748}
2749
Walter Dörwald41980ca2007-08-16 21:55:45 +00002750/* --- UTF-32 Codec ------------------------------------------------------- */
2751
2752PyObject *
2753PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002754 Py_ssize_t size,
2755 const char *errors,
2756 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002757{
2758 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2759}
2760
2761PyObject *
2762PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 Py_ssize_t size,
2764 const char *errors,
2765 int *byteorder,
2766 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002767{
2768 const char *starts = s;
2769 Py_ssize_t startinpos;
2770 Py_ssize_t endinpos;
2771 Py_ssize_t outpos;
2772 PyUnicodeObject *unicode;
2773 Py_UNICODE *p;
2774#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002775 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002776 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002777#else
2778 const int pairs = 0;
2779#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002780 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002781 int bo = 0; /* assume native ordering by default */
2782 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002783 /* Offsets from q for retrieving bytes in the right order. */
2784#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2785 int iorder[] = {0, 1, 2, 3};
2786#else
2787 int iorder[] = {3, 2, 1, 0};
2788#endif
2789 PyObject *errorHandler = NULL;
2790 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002791
Walter Dörwald41980ca2007-08-16 21:55:45 +00002792 q = (unsigned char *)s;
2793 e = q + size;
2794
2795 if (byteorder)
2796 bo = *byteorder;
2797
2798 /* Check for BOM marks (U+FEFF) in the input and adjust current
2799 byte order setting accordingly. In native mode, the leading BOM
2800 mark is skipped, in all other modes, it is copied to the output
2801 stream as-is (giving a ZWNBSP character). */
2802 if (bo == 0) {
2803 if (size >= 4) {
2804 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002806#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002807 if (bom == 0x0000FEFF) {
2808 q += 4;
2809 bo = -1;
2810 }
2811 else if (bom == 0xFFFE0000) {
2812 q += 4;
2813 bo = 1;
2814 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002815#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002816 if (bom == 0x0000FEFF) {
2817 q += 4;
2818 bo = 1;
2819 }
2820 else if (bom == 0xFFFE0000) {
2821 q += 4;
2822 bo = -1;
2823 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002824#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002825 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002826 }
2827
2828 if (bo == -1) {
2829 /* force LE */
2830 iorder[0] = 0;
2831 iorder[1] = 1;
2832 iorder[2] = 2;
2833 iorder[3] = 3;
2834 }
2835 else if (bo == 1) {
2836 /* force BE */
2837 iorder[0] = 3;
2838 iorder[1] = 2;
2839 iorder[2] = 1;
2840 iorder[3] = 0;
2841 }
2842
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002843 /* On narrow builds we split characters outside the BMP into two
2844 codepoints => count how much extra space we need. */
2845#ifndef Py_UNICODE_WIDE
2846 for (qq = q; qq < e; qq += 4)
2847 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2848 pairs++;
2849#endif
2850
2851 /* This might be one to much, because of a BOM */
2852 unicode = _PyUnicode_New((size+3)/4+pairs);
2853 if (!unicode)
2854 return NULL;
2855 if (size == 0)
2856 return (PyObject *)unicode;
2857
2858 /* Unpack UTF-32 encoded data */
2859 p = unicode->str;
2860
Walter Dörwald41980ca2007-08-16 21:55:45 +00002861 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002862 Py_UCS4 ch;
2863 /* remaining bytes at the end? (size should be divisible by 4) */
2864 if (e-q<4) {
2865 if (consumed)
2866 break;
2867 errmsg = "truncated data";
2868 startinpos = ((const char *)q)-starts;
2869 endinpos = ((const char *)e)-starts;
2870 goto utf32Error;
2871 /* The remaining input chars are ignored if the callback
2872 chooses to skip the input */
2873 }
2874 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2875 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002876
Benjamin Peterson29060642009-01-31 22:14:21 +00002877 if (ch >= 0x110000)
2878 {
2879 errmsg = "codepoint not in range(0x110000)";
2880 startinpos = ((const char *)q)-starts;
2881 endinpos = startinpos+4;
2882 goto utf32Error;
2883 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002884#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 if (ch >= 0x10000)
2886 {
2887 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2888 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2889 }
2890 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002891#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002892 *p++ = ch;
2893 q += 4;
2894 continue;
2895 utf32Error:
2896 outpos = p-PyUnicode_AS_UNICODE(unicode);
2897 if (unicode_decode_call_errorhandler(
2898 errors, &errorHandler,
2899 "utf32", errmsg,
2900 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2901 &unicode, &outpos, &p))
2902 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002903 }
2904
2905 if (byteorder)
2906 *byteorder = bo;
2907
2908 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002909 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002910
2911 /* Adjust length */
2912 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2913 goto onError;
2914
2915 Py_XDECREF(errorHandler);
2916 Py_XDECREF(exc);
2917 return (PyObject *)unicode;
2918
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002920 Py_DECREF(unicode);
2921 Py_XDECREF(errorHandler);
2922 Py_XDECREF(exc);
2923 return NULL;
2924}
2925
2926PyObject *
2927PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 Py_ssize_t size,
2929 const char *errors,
2930 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002931{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002932 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002933 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002934 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002935#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002936 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002937#else
2938 const int pairs = 0;
2939#endif
2940 /* Offsets from p for storing byte pairs in the right order. */
2941#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2942 int iorder[] = {0, 1, 2, 3};
2943#else
2944 int iorder[] = {3, 2, 1, 0};
2945#endif
2946
Benjamin Peterson29060642009-01-31 22:14:21 +00002947#define STORECHAR(CH) \
2948 do { \
2949 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2950 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2951 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2952 p[iorder[0]] = (CH) & 0xff; \
2953 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002954 } while(0)
2955
2956 /* In narrow builds we can output surrogate pairs as one codepoint,
2957 so we need less space. */
2958#ifndef Py_UNICODE_WIDE
2959 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00002960 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2961 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2962 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002963#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002964 nsize = (size - pairs + (byteorder == 0));
2965 bytesize = nsize * 4;
2966 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002968 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002969 if (v == NULL)
2970 return NULL;
2971
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002972 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002973 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00002975 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00002976 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002977
2978 if (byteorder == -1) {
2979 /* force LE */
2980 iorder[0] = 0;
2981 iorder[1] = 1;
2982 iorder[2] = 2;
2983 iorder[3] = 3;
2984 }
2985 else if (byteorder == 1) {
2986 /* force BE */
2987 iorder[0] = 3;
2988 iorder[1] = 2;
2989 iorder[2] = 1;
2990 iorder[3] = 0;
2991 }
2992
2993 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002994 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002995#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2997 Py_UCS4 ch2 = *s;
2998 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2999 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3000 s++;
3001 size--;
3002 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003003 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003004#endif
3005 STORECHAR(ch);
3006 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003007
3008 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003009 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003010#undef STORECHAR
3011}
3012
3013PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3014{
3015 if (!PyUnicode_Check(unicode)) {
3016 PyErr_BadArgument();
3017 return NULL;
3018 }
3019 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003020 PyUnicode_GET_SIZE(unicode),
3021 NULL,
3022 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003023}
3024
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025/* --- UTF-16 Codec ------------------------------------------------------- */
3026
Tim Peters772747b2001-08-09 22:21:55 +00003027PyObject *
3028PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 Py_ssize_t size,
3030 const char *errors,
3031 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032{
Walter Dörwald69652032004-09-07 20:24:22 +00003033 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3034}
3035
Antoine Pitrouab868312009-01-10 15:40:25 +00003036/* Two masks for fast checking of whether a C 'long' may contain
3037 UTF16-encoded surrogate characters. This is an efficient heuristic,
3038 assuming that non-surrogate characters with a code point >= 0x8000 are
3039 rare in most input.
3040 FAST_CHAR_MASK is used when the input is in native byte ordering,
3041 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003042*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003043#if (SIZEOF_LONG == 8)
3044# define FAST_CHAR_MASK 0x8000800080008000L
3045# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3046#elif (SIZEOF_LONG == 4)
3047# define FAST_CHAR_MASK 0x80008000L
3048# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3049#else
3050# error C 'long' size should be either 4 or 8!
3051#endif
3052
Walter Dörwald69652032004-09-07 20:24:22 +00003053PyObject *
3054PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 Py_ssize_t size,
3056 const char *errors,
3057 int *byteorder,
3058 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003059{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003060 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003061 Py_ssize_t startinpos;
3062 Py_ssize_t endinpos;
3063 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 PyUnicodeObject *unicode;
3065 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003066 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003067 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003068 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003069 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003070 /* Offsets from q for retrieving byte pairs in the right order. */
3071#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3072 int ihi = 1, ilo = 0;
3073#else
3074 int ihi = 0, ilo = 1;
3075#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 PyObject *errorHandler = NULL;
3077 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078
3079 /* Note: size will always be longer than the resulting Unicode
3080 character count */
3081 unicode = _PyUnicode_New(size);
3082 if (!unicode)
3083 return NULL;
3084 if (size == 0)
3085 return (PyObject *)unicode;
3086
3087 /* Unpack UTF-16 encoded data */
3088 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003089 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003090 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091
3092 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003093 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003095 /* Check for BOM marks (U+FEFF) in the input and adjust current
3096 byte order setting accordingly. In native mode, the leading BOM
3097 mark is skipped, in all other modes, it is copied to the output
3098 stream as-is (giving a ZWNBSP character). */
3099 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003100 if (size >= 2) {
3101 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003102#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 if (bom == 0xFEFF) {
3104 q += 2;
3105 bo = -1;
3106 }
3107 else if (bom == 0xFFFE) {
3108 q += 2;
3109 bo = 1;
3110 }
Tim Petersced69f82003-09-16 20:30:58 +00003111#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 if (bom == 0xFEFF) {
3113 q += 2;
3114 bo = 1;
3115 }
3116 else if (bom == 0xFFFE) {
3117 q += 2;
3118 bo = -1;
3119 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003120#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003121 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123
Tim Peters772747b2001-08-09 22:21:55 +00003124 if (bo == -1) {
3125 /* force LE */
3126 ihi = 1;
3127 ilo = 0;
3128 }
3129 else if (bo == 1) {
3130 /* force BE */
3131 ihi = 0;
3132 ilo = 1;
3133 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003134#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3135 native_ordering = ilo < ihi;
3136#else
3137 native_ordering = ilo > ihi;
3138#endif
Tim Peters772747b2001-08-09 22:21:55 +00003139
Antoine Pitrouab868312009-01-10 15:40:25 +00003140 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003141 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003142 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003143 /* First check for possible aligned read of a C 'long'. Unaligned
3144 reads are more expensive, better to defer to another iteration. */
3145 if (!((size_t) q & LONG_PTR_MASK)) {
3146 /* Fast path for runs of non-surrogate chars. */
3147 register const unsigned char *_q = q;
3148 Py_UNICODE *_p = p;
3149 if (native_ordering) {
3150 /* Native ordering is simple: as long as the input cannot
3151 possibly contain a surrogate char, do an unrolled copy
3152 of several 16-bit code points to the target object.
3153 The non-surrogate check is done on several input bytes
3154 at a time (as many as a C 'long' can contain). */
3155 while (_q < aligned_end) {
3156 unsigned long data = * (unsigned long *) _q;
3157 if (data & FAST_CHAR_MASK)
3158 break;
3159 _p[0] = ((unsigned short *) _q)[0];
3160 _p[1] = ((unsigned short *) _q)[1];
3161#if (SIZEOF_LONG == 8)
3162 _p[2] = ((unsigned short *) _q)[2];
3163 _p[3] = ((unsigned short *) _q)[3];
3164#endif
3165 _q += SIZEOF_LONG;
3166 _p += SIZEOF_LONG / 2;
3167 }
3168 }
3169 else {
3170 /* Byteswapped ordering is similar, but we must decompose
3171 the copy bytewise, and take care of zero'ing out the
3172 upper bytes if the target object is in 32-bit units
3173 (that is, in UCS-4 builds). */
3174 while (_q < aligned_end) {
3175 unsigned long data = * (unsigned long *) _q;
3176 if (data & SWAPPED_FAST_CHAR_MASK)
3177 break;
3178 /* Zero upper bytes in UCS-4 builds */
3179#if (Py_UNICODE_SIZE > 2)
3180 _p[0] = 0;
3181 _p[1] = 0;
3182#if (SIZEOF_LONG == 8)
3183 _p[2] = 0;
3184 _p[3] = 0;
3185#endif
3186#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003187 /* Issue #4916; UCS-4 builds on big endian machines must
3188 fill the two last bytes of each 4-byte unit. */
3189#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3190# define OFF 2
3191#else
3192# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003193#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003194 ((unsigned char *) _p)[OFF + 1] = _q[0];
3195 ((unsigned char *) _p)[OFF + 0] = _q[1];
3196 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3197 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3198#if (SIZEOF_LONG == 8)
3199 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3200 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3201 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3202 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3203#endif
3204#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003205 _q += SIZEOF_LONG;
3206 _p += SIZEOF_LONG / 2;
3207 }
3208 }
3209 p = _p;
3210 q = _q;
3211 if (q >= e)
3212 break;
3213 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003214 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003215
Benjamin Peterson14339b62009-01-31 16:36:08 +00003216 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003217
3218 if (ch < 0xD800 || ch > 0xDFFF) {
3219 *p++ = ch;
3220 continue;
3221 }
3222
3223 /* UTF-16 code pair: */
3224 if (q > e) {
3225 errmsg = "unexpected end of data";
3226 startinpos = (((const char *)q) - 2) - starts;
3227 endinpos = ((const char *)e) + 1 - starts;
3228 goto utf16Error;
3229 }
3230 if (0xD800 <= ch && ch <= 0xDBFF) {
3231 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3232 q += 2;
3233 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003234#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 *p++ = ch;
3236 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003237#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003239#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 continue;
3241 }
3242 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003243 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 startinpos = (((const char *)q)-4)-starts;
3245 endinpos = startinpos+2;
3246 goto utf16Error;
3247 }
3248
Benjamin Peterson14339b62009-01-31 16:36:08 +00003249 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 errmsg = "illegal encoding";
3251 startinpos = (((const char *)q)-2)-starts;
3252 endinpos = startinpos+2;
3253 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003254
Benjamin Peterson29060642009-01-31 22:14:21 +00003255 utf16Error:
3256 outpos = p - PyUnicode_AS_UNICODE(unicode);
3257 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003258 errors,
3259 &errorHandler,
3260 "utf16", errmsg,
3261 &starts,
3262 (const char **)&e,
3263 &startinpos,
3264 &endinpos,
3265 &exc,
3266 (const char **)&q,
3267 &unicode,
3268 &outpos,
3269 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003272 /* remaining byte at the end? (size should be even) */
3273 if (e == q) {
3274 if (!consumed) {
3275 errmsg = "truncated data";
3276 startinpos = ((const char *)q) - starts;
3277 endinpos = ((const char *)e) + 1 - starts;
3278 outpos = p - PyUnicode_AS_UNICODE(unicode);
3279 if (unicode_decode_call_errorhandler(
3280 errors,
3281 &errorHandler,
3282 "utf16", errmsg,
3283 &starts,
3284 (const char **)&e,
3285 &startinpos,
3286 &endinpos,
3287 &exc,
3288 (const char **)&q,
3289 &unicode,
3290 &outpos,
3291 &p))
3292 goto onError;
3293 /* The remaining input chars are ignored if the callback
3294 chooses to skip the input */
3295 }
3296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297
3298 if (byteorder)
3299 *byteorder = bo;
3300
Walter Dörwald69652032004-09-07 20:24:22 +00003301 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003302 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003303
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003305 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 goto onError;
3307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 Py_XDECREF(errorHandler);
3309 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 return (PyObject *)unicode;
3311
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003314 Py_XDECREF(errorHandler);
3315 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 return NULL;
3317}
3318
Antoine Pitrouab868312009-01-10 15:40:25 +00003319#undef FAST_CHAR_MASK
3320#undef SWAPPED_FAST_CHAR_MASK
3321
Tim Peters772747b2001-08-09 22:21:55 +00003322PyObject *
3323PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003324 Py_ssize_t size,
3325 const char *errors,
3326 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003328 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003329 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003330 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003331#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003332 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003333#else
3334 const int pairs = 0;
3335#endif
Tim Peters772747b2001-08-09 22:21:55 +00003336 /* Offsets from p for storing byte pairs in the right order. */
3337#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3338 int ihi = 1, ilo = 0;
3339#else
3340 int ihi = 0, ilo = 1;
3341#endif
3342
Benjamin Peterson29060642009-01-31 22:14:21 +00003343#define STORECHAR(CH) \
3344 do { \
3345 p[ihi] = ((CH) >> 8) & 0xff; \
3346 p[ilo] = (CH) & 0xff; \
3347 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003348 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003350#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003351 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003352 if (s[i] >= 0x10000)
3353 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003354#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003355 /* 2 * (size + pairs + (byteorder == 0)) */
3356 if (size > PY_SSIZE_T_MAX ||
3357 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003358 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003359 nsize = size + pairs + (byteorder == 0);
3360 bytesize = nsize * 2;
3361 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003362 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003363 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 if (v == NULL)
3365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003367 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003370 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003371 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003372
3373 if (byteorder == -1) {
3374 /* force LE */
3375 ihi = 1;
3376 ilo = 0;
3377 }
3378 else if (byteorder == 1) {
3379 /* force BE */
3380 ihi = 0;
3381 ilo = 1;
3382 }
3383
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003384 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003385 Py_UNICODE ch = *s++;
3386 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003387#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003388 if (ch >= 0x10000) {
3389 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3390 ch = 0xD800 | ((ch-0x10000) >> 10);
3391 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003392#endif
Tim Peters772747b2001-08-09 22:21:55 +00003393 STORECHAR(ch);
3394 if (ch2)
3395 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003396 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003397
3398 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003399 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003400#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401}
3402
3403PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3404{
3405 if (!PyUnicode_Check(unicode)) {
3406 PyErr_BadArgument();
3407 return NULL;
3408 }
3409 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 PyUnicode_GET_SIZE(unicode),
3411 NULL,
3412 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413}
3414
3415/* --- Unicode Escape Codec ----------------------------------------------- */
3416
Fredrik Lundh06d12682001-01-24 07:59:11 +00003417static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003418
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003420 Py_ssize_t size,
3421 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003424 Py_ssize_t startinpos;
3425 Py_ssize_t endinpos;
3426 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003431 char* message;
3432 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433 PyObject *errorHandler = NULL;
3434 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003435
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436 /* Escaped strings will always be longer than the resulting
3437 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 length after conversion to the true value.
3439 (but if the error callback returns a long replacement string
3440 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 v = _PyUnicode_New(size);
3442 if (v == NULL)
3443 goto onError;
3444 if (size == 0)
3445 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003447 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003449
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 while (s < end) {
3451 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003452 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454
3455 /* Non-escape characters are interpreted as Unicode ordinals */
3456 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003457 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 continue;
3459 }
3460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 /* \ - Escapes */
3463 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003464 c = *s++;
3465 if (s > end)
3466 c = '\0'; /* Invalid after \ */
3467 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468
Benjamin Peterson29060642009-01-31 22:14:21 +00003469 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 case '\n': break;
3471 case '\\': *p++ = '\\'; break;
3472 case '\'': *p++ = '\''; break;
3473 case '\"': *p++ = '\"'; break;
3474 case 'b': *p++ = '\b'; break;
3475 case 'f': *p++ = '\014'; break; /* FF */
3476 case 't': *p++ = '\t'; break;
3477 case 'n': *p++ = '\n'; break;
3478 case 'r': *p++ = '\r'; break;
3479 case 'v': *p++ = '\013'; break; /* VT */
3480 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3481
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 case '0': case '1': case '2': case '3':
3484 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003485 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003486 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003487 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003488 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003489 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003491 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 break;
3493
Benjamin Peterson29060642009-01-31 22:14:21 +00003494 /* hex escapes */
3495 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003497 digits = 2;
3498 message = "truncated \\xXX escape";
3499 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500
Benjamin Peterson29060642009-01-31 22:14:21 +00003501 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003503 digits = 4;
3504 message = "truncated \\uXXXX escape";
3505 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506
Benjamin Peterson29060642009-01-31 22:14:21 +00003507 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003508 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003509 digits = 8;
3510 message = "truncated \\UXXXXXXXX escape";
3511 hexescape:
3512 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513 outpos = p-PyUnicode_AS_UNICODE(v);
3514 if (s+digits>end) {
3515 endinpos = size;
3516 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003517 errors, &errorHandler,
3518 "unicodeescape", "end of string in escape sequence",
3519 &starts, &end, &startinpos, &endinpos, &exc, &s,
3520 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 goto onError;
3522 goto nextByte;
3523 }
3524 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003525 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003526 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 endinpos = (s+i+1)-starts;
3528 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003529 errors, &errorHandler,
3530 "unicodeescape", message,
3531 &starts, &end, &startinpos, &endinpos, &exc, &s,
3532 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003533 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003535 }
3536 chr = (chr<<4) & ~0xF;
3537 if (c >= '0' && c <= '9')
3538 chr += c - '0';
3539 else if (c >= 'a' && c <= 'f')
3540 chr += 10 + c - 'a';
3541 else
3542 chr += 10 + c - 'A';
3543 }
3544 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003545 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 /* _decoding_error will have already written into the
3547 target buffer. */
3548 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003549 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003550 /* when we get here, chr is a 32-bit unicode character */
3551 if (chr <= 0xffff)
3552 /* UCS-2 character */
3553 *p++ = (Py_UNICODE) chr;
3554 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003555 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003556 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003557#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003558 *p++ = chr;
3559#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003560 chr -= 0x10000L;
3561 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003562 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003563#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003564 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 endinpos = s-starts;
3566 outpos = p-PyUnicode_AS_UNICODE(v);
3567 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003568 errors, &errorHandler,
3569 "unicodeescape", "illegal Unicode character",
3570 &starts, &end, &startinpos, &endinpos, &exc, &s,
3571 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003572 goto onError;
3573 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003574 break;
3575
Benjamin Peterson29060642009-01-31 22:14:21 +00003576 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003577 case 'N':
3578 message = "malformed \\N character escape";
3579 if (ucnhash_CAPI == NULL) {
3580 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003581 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003582 if (ucnhash_CAPI == NULL)
3583 goto ucnhashError;
3584 }
3585 if (*s == '{') {
3586 const char *start = s+1;
3587 /* look for the closing brace */
3588 while (*s != '}' && s < end)
3589 s++;
3590 if (s > start && s < end && *s == '}') {
3591 /* found a name. look it up in the unicode database */
3592 message = "unknown Unicode character name";
3593 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003594 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003595 goto store;
3596 }
3597 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003598 endinpos = s-starts;
3599 outpos = p-PyUnicode_AS_UNICODE(v);
3600 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003601 errors, &errorHandler,
3602 "unicodeescape", message,
3603 &starts, &end, &startinpos, &endinpos, &exc, &s,
3604 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003605 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003606 break;
3607
3608 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003609 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610 message = "\\ at end of string";
3611 s--;
3612 endinpos = s-starts;
3613 outpos = p-PyUnicode_AS_UNICODE(v);
3614 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003615 errors, &errorHandler,
3616 "unicodeescape", message,
3617 &starts, &end, &startinpos, &endinpos, &exc, &s,
3618 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003619 goto onError;
3620 }
3621 else {
3622 *p++ = '\\';
3623 *p++ = (unsigned char)s[-1];
3624 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003625 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003630 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003632 Py_XDECREF(errorHandler);
3633 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003635
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003637 PyErr_SetString(
3638 PyExc_UnicodeError,
3639 "\\N escapes not supported (can't load unicodedata module)"
3640 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003641 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642 Py_XDECREF(errorHandler);
3643 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003644 return NULL;
3645
Benjamin Peterson29060642009-01-31 22:14:21 +00003646 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 Py_XDECREF(errorHandler);
3649 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 return NULL;
3651}
3652
3653/* Return a Unicode-Escape string version of the Unicode object.
3654
3655 If quotes is true, the string is enclosed in u"" or u'' quotes as
3656 appropriate.
3657
3658*/
3659
Thomas Wouters477c8d52006-05-27 19:21:47 +00003660Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003661 Py_ssize_t size,
3662 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003663{
3664 /* like wcschr, but doesn't stop at NULL characters */
3665
3666 while (size-- > 0) {
3667 if (*s == ch)
3668 return s;
3669 s++;
3670 }
3671
3672 return NULL;
3673}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003674
Walter Dörwald79e913e2007-05-12 11:08:06 +00003675static const char *hexdigits = "0123456789abcdef";
3676
3677PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003680 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003683#ifdef Py_UNICODE_WIDE
3684 const Py_ssize_t expandsize = 10;
3685#else
3686 const Py_ssize_t expandsize = 6;
3687#endif
3688
Thomas Wouters89f507f2006-12-13 04:49:30 +00003689 /* XXX(nnorwitz): rather than over-allocating, it would be
3690 better to choose a different scheme. Perhaps scan the
3691 first N-chars of the string and allocate based on that size.
3692 */
3693 /* Initial allocation is based on the longest-possible unichr
3694 escape.
3695
3696 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3697 unichr, so in this case it's the longest unichr escape. In
3698 narrow (UTF-16) builds this is five chars per source unichr
3699 since there are two unichrs in the surrogate pair, so in narrow
3700 (UTF-16) builds it's not the longest unichr escape.
3701
3702 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3703 so in the narrow (UTF-16) build case it's the longest unichr
3704 escape.
3705 */
3706
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003707 if (size == 0)
3708 return PyBytes_FromStringAndSize(NULL, 0);
3709
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003710 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003711 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003712
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003713 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 2
3715 + expandsize*size
3716 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 if (repr == NULL)
3718 return NULL;
3719
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003720 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722 while (size-- > 0) {
3723 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003724
Walter Dörwald79e913e2007-05-12 11:08:06 +00003725 /* Escape backslashes */
3726 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727 *p++ = '\\';
3728 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003729 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003730 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003731
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003732#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003733 /* Map 21-bit characters to '\U00xxxxxx' */
3734 else if (ch >= 0x10000) {
3735 *p++ = '\\';
3736 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003737 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3738 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3739 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3740 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3741 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3742 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3743 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3744 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003746 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003747#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003748 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3749 else if (ch >= 0xD800 && ch < 0xDC00) {
3750 Py_UNICODE ch2;
3751 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003752
Benjamin Peterson29060642009-01-31 22:14:21 +00003753 ch2 = *s++;
3754 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003755 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3757 *p++ = '\\';
3758 *p++ = 'U';
3759 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3760 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3761 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3762 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3763 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3764 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3765 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3766 *p++ = hexdigits[ucs & 0x0000000F];
3767 continue;
3768 }
3769 /* Fall through: isolated surrogates are copied as-is */
3770 s--;
3771 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003772 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003773#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003774
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003776 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 *p++ = '\\';
3778 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003779 *p++ = hexdigits[(ch >> 12) & 0x000F];
3780 *p++ = hexdigits[(ch >> 8) & 0x000F];
3781 *p++ = hexdigits[(ch >> 4) & 0x000F];
3782 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003784
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003785 /* Map special whitespace to '\t', \n', '\r' */
3786 else if (ch == '\t') {
3787 *p++ = '\\';
3788 *p++ = 't';
3789 }
3790 else if (ch == '\n') {
3791 *p++ = '\\';
3792 *p++ = 'n';
3793 }
3794 else if (ch == '\r') {
3795 *p++ = '\\';
3796 *p++ = 'r';
3797 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003798
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003799 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003800 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003802 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003803 *p++ = hexdigits[(ch >> 4) & 0x000F];
3804 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003805 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003806
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807 /* Copy everything else as-is */
3808 else
3809 *p++ = (char) ch;
3810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003812 assert(p - PyBytes_AS_STRING(repr) > 0);
3813 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3814 return NULL;
3815 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816}
3817
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003818PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003820 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 if (!PyUnicode_Check(unicode)) {
3822 PyErr_BadArgument();
3823 return NULL;
3824 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003825 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3826 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003827 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828}
3829
3830/* --- Raw Unicode Escape Codec ------------------------------------------- */
3831
3832PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 Py_ssize_t size,
3834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003836 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003837 Py_ssize_t startinpos;
3838 Py_ssize_t endinpos;
3839 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842 const char *end;
3843 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003844 PyObject *errorHandler = NULL;
3845 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003846
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 /* Escaped strings will always be longer than the resulting
3848 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003849 length after conversion to the true value. (But decoding error
3850 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851 v = _PyUnicode_New(size);
3852 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003853 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003856 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857 end = s + size;
3858 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 unsigned char c;
3860 Py_UCS4 x;
3861 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003862 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863
Benjamin Peterson29060642009-01-31 22:14:21 +00003864 /* Non-escape characters are interpreted as Unicode ordinals */
3865 if (*s != '\\') {
3866 *p++ = (unsigned char)*s++;
3867 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003868 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 startinpos = s-starts;
3870
3871 /* \u-escapes are only interpreted iff the number of leading
3872 backslashes if odd */
3873 bs = s;
3874 for (;s < end;) {
3875 if (*s != '\\')
3876 break;
3877 *p++ = (unsigned char)*s++;
3878 }
3879 if (((s - bs) & 1) == 0 ||
3880 s >= end ||
3881 (*s != 'u' && *s != 'U')) {
3882 continue;
3883 }
3884 p--;
3885 count = *s=='u' ? 4 : 8;
3886 s++;
3887
3888 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3889 outpos = p-PyUnicode_AS_UNICODE(v);
3890 for (x = 0, i = 0; i < count; ++i, ++s) {
3891 c = (unsigned char)*s;
3892 if (!ISXDIGIT(c)) {
3893 endinpos = s-starts;
3894 if (unicode_decode_call_errorhandler(
3895 errors, &errorHandler,
3896 "rawunicodeescape", "truncated \\uXXXX",
3897 &starts, &end, &startinpos, &endinpos, &exc, &s,
3898 &v, &outpos, &p))
3899 goto onError;
3900 goto nextByte;
3901 }
3902 x = (x<<4) & ~0xF;
3903 if (c >= '0' && c <= '9')
3904 x += c - '0';
3905 else if (c >= 'a' && c <= 'f')
3906 x += 10 + c - 'a';
3907 else
3908 x += 10 + c - 'A';
3909 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003910 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 /* UCS-2 character */
3912 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003913 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003914 /* UCS-4 character. Either store directly, or as
3915 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003916#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003918#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 x -= 0x10000L;
3920 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3921 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003922#endif
3923 } else {
3924 endinpos = s-starts;
3925 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003926 if (unicode_decode_call_errorhandler(
3927 errors, &errorHandler,
3928 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 &starts, &end, &startinpos, &endinpos, &exc, &s,
3930 &v, &outpos, &p))
3931 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003932 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003933 nextByte:
3934 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003935 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003936 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003937 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 Py_XDECREF(errorHandler);
3939 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003941
Benjamin Peterson29060642009-01-31 22:14:21 +00003942 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003944 Py_XDECREF(errorHandler);
3945 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946 return NULL;
3947}
3948
3949PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003952 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 char *p;
3954 char *q;
3955
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003956#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003957 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003958#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003959 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003960#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003961
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003962 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003963 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00003964
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003965 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966 if (repr == NULL)
3967 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003968 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003969 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003971 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 while (size-- > 0) {
3973 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003974#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 /* Map 32-bit characters to '\Uxxxxxxxx' */
3976 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003977 *p++ = '\\';
3978 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00003979 *p++ = hexdigits[(ch >> 28) & 0xf];
3980 *p++ = hexdigits[(ch >> 24) & 0xf];
3981 *p++ = hexdigits[(ch >> 20) & 0xf];
3982 *p++ = hexdigits[(ch >> 16) & 0xf];
3983 *p++ = hexdigits[(ch >> 12) & 0xf];
3984 *p++ = hexdigits[(ch >> 8) & 0xf];
3985 *p++ = hexdigits[(ch >> 4) & 0xf];
3986 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003987 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003988 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00003989#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3991 if (ch >= 0xD800 && ch < 0xDC00) {
3992 Py_UNICODE ch2;
3993 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003994
Benjamin Peterson29060642009-01-31 22:14:21 +00003995 ch2 = *s++;
3996 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003997 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3999 *p++ = '\\';
4000 *p++ = 'U';
4001 *p++ = hexdigits[(ucs >> 28) & 0xf];
4002 *p++ = hexdigits[(ucs >> 24) & 0xf];
4003 *p++ = hexdigits[(ucs >> 20) & 0xf];
4004 *p++ = hexdigits[(ucs >> 16) & 0xf];
4005 *p++ = hexdigits[(ucs >> 12) & 0xf];
4006 *p++ = hexdigits[(ucs >> 8) & 0xf];
4007 *p++ = hexdigits[(ucs >> 4) & 0xf];
4008 *p++ = hexdigits[ucs & 0xf];
4009 continue;
4010 }
4011 /* Fall through: isolated surrogates are copied as-is */
4012 s--;
4013 size++;
4014 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004015#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004016 /* Map 16-bit characters to '\uxxxx' */
4017 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 *p++ = '\\';
4019 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004020 *p++ = hexdigits[(ch >> 12) & 0xf];
4021 *p++ = hexdigits[(ch >> 8) & 0xf];
4022 *p++ = hexdigits[(ch >> 4) & 0xf];
4023 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 /* Copy everything else as-is */
4026 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 *p++ = (char) ch;
4028 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004029 size = p - q;
4030
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004031 assert(size > 0);
4032 if (_PyBytes_Resize(&repr, size) < 0)
4033 return NULL;
4034 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035}
4036
4037PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4038{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004039 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004041 PyErr_BadArgument();
4042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004044 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4045 PyUnicode_GET_SIZE(unicode));
4046
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004047 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048}
4049
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004050/* --- Unicode Internal Codec ------------------------------------------- */
4051
4052PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 Py_ssize_t size,
4054 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004055{
4056 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004057 Py_ssize_t startinpos;
4058 Py_ssize_t endinpos;
4059 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004060 PyUnicodeObject *v;
4061 Py_UNICODE *p;
4062 const char *end;
4063 const char *reason;
4064 PyObject *errorHandler = NULL;
4065 PyObject *exc = NULL;
4066
Neal Norwitzd43069c2006-01-08 01:12:10 +00004067#ifdef Py_UNICODE_WIDE
4068 Py_UNICODE unimax = PyUnicode_GetMax();
4069#endif
4070
Thomas Wouters89f507f2006-12-13 04:49:30 +00004071 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004072 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4073 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004074 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004075 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004076 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004077 p = PyUnicode_AS_UNICODE(v);
4078 end = s + size;
4079
4080 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004081 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004082 /* We have to sanity check the raw data, otherwise doom looms for
4083 some malformed UCS-4 data. */
4084 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004085#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004086 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004087#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004088 end-s < Py_UNICODE_SIZE
4089 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004090 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004091 startinpos = s - starts;
4092 if (end-s < Py_UNICODE_SIZE) {
4093 endinpos = end-starts;
4094 reason = "truncated input";
4095 }
4096 else {
4097 endinpos = s - starts + Py_UNICODE_SIZE;
4098 reason = "illegal code point (> 0x10FFFF)";
4099 }
4100 outpos = p - PyUnicode_AS_UNICODE(v);
4101 if (unicode_decode_call_errorhandler(
4102 errors, &errorHandler,
4103 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004104 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004105 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004106 goto onError;
4107 }
4108 }
4109 else {
4110 p++;
4111 s += Py_UNICODE_SIZE;
4112 }
4113 }
4114
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004115 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004116 goto onError;
4117 Py_XDECREF(errorHandler);
4118 Py_XDECREF(exc);
4119 return (PyObject *)v;
4120
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004122 Py_XDECREF(v);
4123 Py_XDECREF(errorHandler);
4124 Py_XDECREF(exc);
4125 return NULL;
4126}
4127
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128/* --- Latin-1 Codec ------------------------------------------------------ */
4129
4130PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 Py_ssize_t size,
4132 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133{
4134 PyUnicodeObject *v;
4135 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004136 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004137
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004139 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 Py_UNICODE r = *(unsigned char*)s;
4141 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004142 }
4143
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 v = _PyUnicode_New(size);
4145 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004150 e = s + size;
4151 /* Unrolling the copy makes it much faster by reducing the looping
4152 overhead. This is similar to what many memcpy() implementations do. */
4153 unrolled_end = e - 4;
4154 while (s < unrolled_end) {
4155 p[0] = (unsigned char) s[0];
4156 p[1] = (unsigned char) s[1];
4157 p[2] = (unsigned char) s[2];
4158 p[3] = (unsigned char) s[3];
4159 s += 4;
4160 p += 4;
4161 }
4162 while (s < e)
4163 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004165
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 Py_XDECREF(v);
4168 return NULL;
4169}
4170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171/* create or adjust a UnicodeEncodeError */
4172static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 const char *encoding,
4174 const Py_UNICODE *unicode, Py_ssize_t size,
4175 Py_ssize_t startpos, Py_ssize_t endpos,
4176 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 *exceptionObject = PyUnicodeEncodeError_Create(
4180 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181 }
4182 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4184 goto onError;
4185 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4186 goto onError;
4187 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4188 goto onError;
4189 return;
4190 onError:
4191 Py_DECREF(*exceptionObject);
4192 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 }
4194}
4195
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196/* raises a UnicodeEncodeError */
4197static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 const char *encoding,
4199 const Py_UNICODE *unicode, Py_ssize_t size,
4200 Py_ssize_t startpos, Py_ssize_t endpos,
4201 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202{
4203 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207}
4208
4209/* error handling callback helper:
4210 build arguments, call the callback and check the arguments,
4211 put the result into newpos and return the replacement string, which
4212 has to be freed by the caller */
4213static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004214 PyObject **errorHandler,
4215 const char *encoding, const char *reason,
4216 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4217 Py_ssize_t startpos, Py_ssize_t endpos,
4218 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004220 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221
4222 PyObject *restuple;
4223 PyObject *resunicode;
4224
4225 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229 }
4230
4231 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004232 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235
4236 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004241 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004242 Py_DECREF(restuple);
4243 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004245 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 &resunicode, newpos)) {
4247 Py_DECREF(restuple);
4248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004250 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4251 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4252 Py_DECREF(restuple);
4253 return NULL;
4254 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004255 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004257 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4259 Py_DECREF(restuple);
4260 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004261 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 Py_INCREF(resunicode);
4263 Py_DECREF(restuple);
4264 return resunicode;
4265}
4266
4267static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 Py_ssize_t size,
4269 const char *errors,
4270 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271{
4272 /* output object */
4273 PyObject *res;
4274 /* pointers to the beginning and end+1 of input */
4275 const Py_UNICODE *startp = p;
4276 const Py_UNICODE *endp = p + size;
4277 /* pointer to the beginning of the unencodable characters */
4278 /* const Py_UNICODE *badp = NULL; */
4279 /* pointer into the output */
4280 char *str;
4281 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004282 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004283 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4284 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 PyObject *errorHandler = NULL;
4286 PyObject *exc = NULL;
4287 /* the following variable is used for caching string comparisons
4288 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4289 int known_errorHandler = -1;
4290
4291 /* allocate enough for a simple encoding without
4292 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004293 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004294 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004295 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004297 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004298 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004299 ressize = size;
4300
4301 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004302 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 /* can we encode this? */
4305 if (c<limit) {
4306 /* no overflow check, because we know that the space is enough */
4307 *str++ = (char)c;
4308 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004309 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004310 else {
4311 Py_ssize_t unicodepos = p-startp;
4312 Py_ssize_t requiredsize;
4313 PyObject *repunicode;
4314 Py_ssize_t repsize;
4315 Py_ssize_t newpos;
4316 Py_ssize_t respos;
4317 Py_UNICODE *uni2;
4318 /* startpos for collecting unencodable chars */
4319 const Py_UNICODE *collstart = p;
4320 const Py_UNICODE *collend = p;
4321 /* find all unecodable characters */
4322 while ((collend < endp) && ((*collend)>=limit))
4323 ++collend;
4324 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4325 if (known_errorHandler==-1) {
4326 if ((errors==NULL) || (!strcmp(errors, "strict")))
4327 known_errorHandler = 1;
4328 else if (!strcmp(errors, "replace"))
4329 known_errorHandler = 2;
4330 else if (!strcmp(errors, "ignore"))
4331 known_errorHandler = 3;
4332 else if (!strcmp(errors, "xmlcharrefreplace"))
4333 known_errorHandler = 4;
4334 else
4335 known_errorHandler = 0;
4336 }
4337 switch (known_errorHandler) {
4338 case 1: /* strict */
4339 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4340 goto onError;
4341 case 2: /* replace */
4342 while (collstart++<collend)
4343 *str++ = '?'; /* fall through */
4344 case 3: /* ignore */
4345 p = collend;
4346 break;
4347 case 4: /* xmlcharrefreplace */
4348 respos = str - PyBytes_AS_STRING(res);
4349 /* determine replacement size (temporarily (mis)uses p) */
4350 for (p = collstart, repsize = 0; p < collend; ++p) {
4351 if (*p<10)
4352 repsize += 2+1+1;
4353 else if (*p<100)
4354 repsize += 2+2+1;
4355 else if (*p<1000)
4356 repsize += 2+3+1;
4357 else if (*p<10000)
4358 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004359#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 else
4361 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004362#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 else if (*p<100000)
4364 repsize += 2+5+1;
4365 else if (*p<1000000)
4366 repsize += 2+6+1;
4367 else
4368 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004369#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004370 }
4371 requiredsize = respos+repsize+(endp-collend);
4372 if (requiredsize > ressize) {
4373 if (requiredsize<2*ressize)
4374 requiredsize = 2*ressize;
4375 if (_PyBytes_Resize(&res, requiredsize))
4376 goto onError;
4377 str = PyBytes_AS_STRING(res) + respos;
4378 ressize = requiredsize;
4379 }
4380 /* generate replacement (temporarily (mis)uses p) */
4381 for (p = collstart; p < collend; ++p) {
4382 str += sprintf(str, "&#%d;", (int)*p);
4383 }
4384 p = collend;
4385 break;
4386 default:
4387 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4388 encoding, reason, startp, size, &exc,
4389 collstart-startp, collend-startp, &newpos);
4390 if (repunicode == NULL)
4391 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004392 if (PyBytes_Check(repunicode)) {
4393 /* Directly copy bytes result to output. */
4394 repsize = PyBytes_Size(repunicode);
4395 if (repsize > 1) {
4396 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004397 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004398 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4399 Py_DECREF(repunicode);
4400 goto onError;
4401 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004402 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004403 ressize += repsize-1;
4404 }
4405 memcpy(str, PyBytes_AsString(repunicode), repsize);
4406 str += repsize;
4407 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004408 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004409 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004410 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 /* need more space? (at least enough for what we
4412 have+the replacement+the rest of the string, so
4413 we won't have to check space for encodable characters) */
4414 respos = str - PyBytes_AS_STRING(res);
4415 repsize = PyUnicode_GET_SIZE(repunicode);
4416 requiredsize = respos+repsize+(endp-collend);
4417 if (requiredsize > ressize) {
4418 if (requiredsize<2*ressize)
4419 requiredsize = 2*ressize;
4420 if (_PyBytes_Resize(&res, requiredsize)) {
4421 Py_DECREF(repunicode);
4422 goto onError;
4423 }
4424 str = PyBytes_AS_STRING(res) + respos;
4425 ressize = requiredsize;
4426 }
4427 /* check if there is anything unencodable in the replacement
4428 and copy it to the output */
4429 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4430 c = *uni2;
4431 if (c >= limit) {
4432 raise_encode_exception(&exc, encoding, startp, size,
4433 unicodepos, unicodepos+1, reason);
4434 Py_DECREF(repunicode);
4435 goto onError;
4436 }
4437 *str = (char)c;
4438 }
4439 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004440 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004441 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004442 }
4443 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004444 /* Resize if we allocated to much */
4445 size = str - PyBytes_AS_STRING(res);
4446 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004447 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004448 if (_PyBytes_Resize(&res, size) < 0)
4449 goto onError;
4450 }
4451
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004452 Py_XDECREF(errorHandler);
4453 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004454 return res;
4455
4456 onError:
4457 Py_XDECREF(res);
4458 Py_XDECREF(errorHandler);
4459 Py_XDECREF(exc);
4460 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461}
4462
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 Py_ssize_t size,
4465 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468}
4469
4470PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4471{
4472 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 PyErr_BadArgument();
4474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 }
4476 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 PyUnicode_GET_SIZE(unicode),
4478 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479}
4480
4481/* --- 7-bit ASCII Codec -------------------------------------------------- */
4482
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004484 Py_ssize_t size,
4485 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 PyUnicodeObject *v;
4489 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004490 Py_ssize_t startinpos;
4491 Py_ssize_t endinpos;
4492 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493 const char *e;
4494 PyObject *errorHandler = NULL;
4495 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004496
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004498 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 Py_UNICODE r = *(unsigned char*)s;
4500 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004501 }
Tim Petersced69f82003-09-16 20:30:58 +00004502
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 v = _PyUnicode_New(size);
4504 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004505 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 e = s + size;
4510 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 register unsigned char c = (unsigned char)*s;
4512 if (c < 128) {
4513 *p++ = c;
4514 ++s;
4515 }
4516 else {
4517 startinpos = s-starts;
4518 endinpos = startinpos + 1;
4519 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4520 if (unicode_decode_call_errorhandler(
4521 errors, &errorHandler,
4522 "ascii", "ordinal not in range(128)",
4523 &starts, &e, &startinpos, &endinpos, &exc, &s,
4524 &v, &outpos, &p))
4525 goto onError;
4526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004528 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4530 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 Py_XDECREF(errorHandler);
4532 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004534
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 Py_XDECREF(errorHandler);
4538 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 return NULL;
4540}
4541
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 Py_ssize_t size,
4544 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547}
4548
4549PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4550{
4551 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 PyErr_BadArgument();
4553 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554 }
4555 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 PyUnicode_GET_SIZE(unicode),
4557 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558}
4559
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004560#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004561
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004562/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004563
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004564#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004565#define NEED_RETRY
4566#endif
4567
4568/* XXX This code is limited to "true" double-byte encodings, as
4569 a) it assumes an incomplete character consists of a single byte, and
4570 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004571 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004572
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must be accepted by IsDBCSLeadByte().  It is then reported as
   a lead byte when the character preceding it (located with CharPrev())
   either does not exist (CharPrev returned the same position), does not
   itself begin with a lead byte, or is exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4583
4584/*
4585 * Decode MBCS string into unicode object. If 'final' is set, converts
4586 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4587 */
4588static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 const char *s, /* MBCS string */
4590 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004591 int final,
4592 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004593{
4594 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004595 Py_ssize_t n;
4596 DWORD usize;
4597 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004598
4599 assert(size >= 0);
4600
Victor Stinner554f3f02010-06-16 23:33:54 +00004601 /* check and handle 'errors' arg */
4602 if (errors==NULL || strcmp(errors, "strict")==0)
4603 flags = MB_ERR_INVALID_CHARS;
4604 else if (strcmp(errors, "ignore")==0)
4605 flags = 0;
4606 else {
4607 PyErr_Format(PyExc_ValueError,
4608 "mbcs encoding does not support errors='%s'",
4609 errors);
4610 return -1;
4611 }
4612
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004613 /* Skip trailing lead-byte unless 'final' is set */
4614 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004616
4617 /* First get the size of the result */
4618 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004619 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4620 if (usize==0)
4621 goto mbcs_decode_error;
4622 } else
4623 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004624
4625 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004626 /* Create unicode object */
4627 *v = _PyUnicode_New(usize);
4628 if (*v == NULL)
4629 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004630 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004631 }
4632 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 /* Extend unicode object */
4634 n = PyUnicode_GET_SIZE(*v);
4635 if (_PyUnicode_Resize(v, n + usize) < 0)
4636 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004637 }
4638
4639 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004640 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004642 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4643 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004645 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004646 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004647
4648mbcs_decode_error:
4649 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4650 we raise a UnicodeDecodeError - else it is a 'generic'
4651 windows error
4652 */
4653 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4654 /* Ideally, we should get reason from FormatMessage - this
4655 is the Windows 2000 English version of the message
4656 */
4657 PyObject *exc = NULL;
4658 const char *reason = "No mapping for the Unicode character exists "
4659 "in the target multi-byte code page.";
4660 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4661 if (exc != NULL) {
4662 PyCodec_StrictErrors(exc);
4663 Py_DECREF(exc);
4664 }
4665 } else {
4666 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4667 }
4668 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004669}
4670
4671PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 Py_ssize_t size,
4673 const char *errors,
4674 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004675{
4676 PyUnicodeObject *v = NULL;
4677 int done;
4678
4679 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004681
4682#ifdef NEED_RETRY
4683 retry:
4684 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004685 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004686 else
4687#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004688 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004689
4690 if (done < 0) {
4691 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004693 }
4694
4695 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004697
4698#ifdef NEED_RETRY
4699 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 s += done;
4701 size -= done;
4702 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004703 }
4704#endif
4705
4706 return (PyObject *)v;
4707}
4708
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004709PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004710 Py_ssize_t size,
4711 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004712{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004713 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4714}
4715
4716/*
4717 * Convert unicode into string object (MBCS).
4718 * Returns 0 if succeed, -1 otherwise.
4719 */
4720static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004722 int size, /* size of unicode */
4723 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004724{
Victor Stinner554f3f02010-06-16 23:33:54 +00004725 BOOL usedDefaultChar = FALSE;
4726 BOOL *pusedDefaultChar;
4727 int mbcssize;
4728 Py_ssize_t n;
4729 PyObject *exc = NULL;
4730 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004731
4732 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004733
Victor Stinner554f3f02010-06-16 23:33:54 +00004734 /* check and handle 'errors' arg */
4735 if (errors==NULL || strcmp(errors, "strict")==0) {
4736 flags = WC_NO_BEST_FIT_CHARS;
4737 pusedDefaultChar = &usedDefaultChar;
4738 } else if (strcmp(errors, "replace")==0) {
4739 flags = 0;
4740 pusedDefaultChar = NULL;
4741 } else {
4742 PyErr_Format(PyExc_ValueError,
4743 "mbcs encoding does not support errors='%s'",
4744 errors);
4745 return -1;
4746 }
4747
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004748 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004749 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004750 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4751 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 if (mbcssize == 0) {
4753 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4754 return -1;
4755 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004756 /* If we used a default char, then we failed! */
4757 if (pusedDefaultChar && *pusedDefaultChar)
4758 goto mbcs_encode_error;
4759 } else {
4760 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004761 }
4762
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004763 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 /* Create string object */
4765 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4766 if (*repr == NULL)
4767 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004768 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004769 }
4770 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 /* Extend string object */
4772 n = PyBytes_Size(*repr);
4773 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4774 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004775 }
4776
4777 /* Do the conversion */
4778 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004780 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4781 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4783 return -1;
4784 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004785 if (pusedDefaultChar && *pusedDefaultChar)
4786 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004787 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004788 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004789
4790mbcs_encode_error:
4791 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4792 Py_XDECREF(exc);
4793 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004794}
4795
4796PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 Py_ssize_t size,
4798 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004799{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004800 PyObject *repr = NULL;
4801 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004802
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004803#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004805 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004806 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004807 else
4808#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004809 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004810
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004811 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 Py_XDECREF(repr);
4813 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004814 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004815
4816#ifdef NEED_RETRY
4817 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 p += INT_MAX;
4819 size -= INT_MAX;
4820 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004821 }
4822#endif
4823
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004824 return repr;
4825}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004826
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004827PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4828{
4829 if (!PyUnicode_Check(unicode)) {
4830 PyErr_BadArgument();
4831 return NULL;
4832 }
4833 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 PyUnicode_GET_SIZE(unicode),
4835 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004836}
4837
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004838#undef NEED_RETRY
4839
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004840#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004841
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842/* --- Character Mapping Codec -------------------------------------------- */
4843
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 Py_ssize_t size,
4846 PyObject *mapping,
4847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004850 Py_ssize_t startinpos;
4851 Py_ssize_t endinpos;
4852 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 PyUnicodeObject *v;
4855 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004856 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 PyObject *errorHandler = NULL;
4858 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004859 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004860 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004861
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 /* Default to Latin-1 */
4863 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
4866 v = _PyUnicode_New(size);
4867 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004873 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004874 mapstring = PyUnicode_AS_UNICODE(mapping);
4875 maplen = PyUnicode_GET_SIZE(mapping);
4876 while (s < e) {
4877 unsigned char ch = *s;
4878 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 if (ch < maplen)
4881 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882
Benjamin Peterson29060642009-01-31 22:14:21 +00004883 if (x == 0xfffe) {
4884 /* undefined mapping */
4885 outpos = p-PyUnicode_AS_UNICODE(v);
4886 startinpos = s-starts;
4887 endinpos = startinpos+1;
4888 if (unicode_decode_call_errorhandler(
4889 errors, &errorHandler,
4890 "charmap", "character maps to <undefined>",
4891 &starts, &e, &startinpos, &endinpos, &exc, &s,
4892 &v, &outpos, &p)) {
4893 goto onError;
4894 }
4895 continue;
4896 }
4897 *p++ = x;
4898 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004899 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004900 }
4901 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 while (s < e) {
4903 unsigned char ch = *s;
4904 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004905
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4907 w = PyLong_FromLong((long)ch);
4908 if (w == NULL)
4909 goto onError;
4910 x = PyObject_GetItem(mapping, w);
4911 Py_DECREF(w);
4912 if (x == NULL) {
4913 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4914 /* No mapping found means: mapping is undefined. */
4915 PyErr_Clear();
4916 x = Py_None;
4917 Py_INCREF(x);
4918 } else
4919 goto onError;
4920 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004921
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 /* Apply mapping */
4923 if (PyLong_Check(x)) {
4924 long value = PyLong_AS_LONG(x);
4925 if (value < 0 || value > 65535) {
4926 PyErr_SetString(PyExc_TypeError,
4927 "character mapping must be in range(65536)");
4928 Py_DECREF(x);
4929 goto onError;
4930 }
4931 *p++ = (Py_UNICODE)value;
4932 }
4933 else if (x == Py_None) {
4934 /* undefined mapping */
4935 outpos = p-PyUnicode_AS_UNICODE(v);
4936 startinpos = s-starts;
4937 endinpos = startinpos+1;
4938 if (unicode_decode_call_errorhandler(
4939 errors, &errorHandler,
4940 "charmap", "character maps to <undefined>",
4941 &starts, &e, &startinpos, &endinpos, &exc, &s,
4942 &v, &outpos, &p)) {
4943 Py_DECREF(x);
4944 goto onError;
4945 }
4946 Py_DECREF(x);
4947 continue;
4948 }
4949 else if (PyUnicode_Check(x)) {
4950 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004951
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 if (targetsize == 1)
4953 /* 1-1 mapping */
4954 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004955
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 else if (targetsize > 1) {
4957 /* 1-n mapping */
4958 if (targetsize > extrachars) {
4959 /* resize first */
4960 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4961 Py_ssize_t needed = (targetsize - extrachars) + \
4962 (targetsize << 2);
4963 extrachars += needed;
4964 /* XXX overflow detection missing */
4965 if (_PyUnicode_Resize(&v,
4966 PyUnicode_GET_SIZE(v) + needed) < 0) {
4967 Py_DECREF(x);
4968 goto onError;
4969 }
4970 p = PyUnicode_AS_UNICODE(v) + oldpos;
4971 }
4972 Py_UNICODE_COPY(p,
4973 PyUnicode_AS_UNICODE(x),
4974 targetsize);
4975 p += targetsize;
4976 extrachars -= targetsize;
4977 }
4978 /* 1-0 mapping: skip the character */
4979 }
4980 else {
4981 /* wrong return value */
4982 PyErr_SetString(PyExc_TypeError,
4983 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00004984 Py_DECREF(x);
4985 goto onError;
4986 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 Py_DECREF(x);
4988 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 }
4991 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4993 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 Py_XDECREF(errorHandler);
4995 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004997
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004999 Py_XDECREF(errorHandler);
5000 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001 Py_XDECREF(v);
5002 return NULL;
5003}
5004
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005005/* Charmap encoding: the lookup table */
5006
5007struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 PyObject_HEAD
5009 unsigned char level1[32];
5010 int count2, count3;
5011 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005012};
5013
5014static PyObject*
5015encoding_map_size(PyObject *obj, PyObject* args)
5016{
5017 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005018 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005020}
5021
5022static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005023 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 PyDoc_STR("Return the size (in bytes) of this object") },
5025 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005026};
5027
5028static void
5029encoding_map_dealloc(PyObject* o)
5030{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005031 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005032}
5033
5034static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005035 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 "EncodingMap", /*tp_name*/
5037 sizeof(struct encoding_map), /*tp_basicsize*/
5038 0, /*tp_itemsize*/
5039 /* methods */
5040 encoding_map_dealloc, /*tp_dealloc*/
5041 0, /*tp_print*/
5042 0, /*tp_getattr*/
5043 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005044 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 0, /*tp_repr*/
5046 0, /*tp_as_number*/
5047 0, /*tp_as_sequence*/
5048 0, /*tp_as_mapping*/
5049 0, /*tp_hash*/
5050 0, /*tp_call*/
5051 0, /*tp_str*/
5052 0, /*tp_getattro*/
5053 0, /*tp_setattro*/
5054 0, /*tp_as_buffer*/
5055 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5056 0, /*tp_doc*/
5057 0, /*tp_traverse*/
5058 0, /*tp_clear*/
5059 0, /*tp_richcompare*/
5060 0, /*tp_weaklistoffset*/
5061 0, /*tp_iter*/
5062 0, /*tp_iternext*/
5063 encoding_map_methods, /*tp_methods*/
5064 0, /*tp_members*/
5065 0, /*tp_getset*/
5066 0, /*tp_base*/
5067 0, /*tp_dict*/
5068 0, /*tp_descr_get*/
5069 0, /*tp_descr_set*/
5070 0, /*tp_dictoffset*/
5071 0, /*tp_init*/
5072 0, /*tp_alloc*/
5073 0, /*tp_new*/
5074 0, /*tp_free*/
5075 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005076};
5077
5078PyObject*
5079PyUnicode_BuildEncodingMap(PyObject* string)
5080{
5081 Py_UNICODE *decode;
5082 PyObject *result;
5083 struct encoding_map *mresult;
5084 int i;
5085 int need_dict = 0;
5086 unsigned char level1[32];
5087 unsigned char level2[512];
5088 unsigned char *mlevel1, *mlevel2, *mlevel3;
5089 int count2 = 0, count3 = 0;
5090
5091 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5092 PyErr_BadArgument();
5093 return NULL;
5094 }
5095 decode = PyUnicode_AS_UNICODE(string);
5096 memset(level1, 0xFF, sizeof level1);
5097 memset(level2, 0xFF, sizeof level2);
5098
5099 /* If there isn't a one-to-one mapping of NULL to \0,
5100 or if there are non-BMP characters, we need to use
5101 a mapping dictionary. */
5102 if (decode[0] != 0)
5103 need_dict = 1;
5104 for (i = 1; i < 256; i++) {
5105 int l1, l2;
5106 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005107#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005108 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005109#endif
5110 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005111 need_dict = 1;
5112 break;
5113 }
5114 if (decode[i] == 0xFFFE)
5115 /* unmapped character */
5116 continue;
5117 l1 = decode[i] >> 11;
5118 l2 = decode[i] >> 7;
5119 if (level1[l1] == 0xFF)
5120 level1[l1] = count2++;
5121 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005122 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005123 }
5124
5125 if (count2 >= 0xFF || count3 >= 0xFF)
5126 need_dict = 1;
5127
5128 if (need_dict) {
5129 PyObject *result = PyDict_New();
5130 PyObject *key, *value;
5131 if (!result)
5132 return NULL;
5133 for (i = 0; i < 256; i++) {
5134 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005135 key = PyLong_FromLong(decode[i]);
5136 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005137 if (!key || !value)
5138 goto failed1;
5139 if (PyDict_SetItem(result, key, value) == -1)
5140 goto failed1;
5141 Py_DECREF(key);
5142 Py_DECREF(value);
5143 }
5144 return result;
5145 failed1:
5146 Py_XDECREF(key);
5147 Py_XDECREF(value);
5148 Py_DECREF(result);
5149 return NULL;
5150 }
5151
5152 /* Create a three-level trie */
5153 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5154 16*count2 + 128*count3 - 1);
5155 if (!result)
5156 return PyErr_NoMemory();
5157 PyObject_Init(result, &EncodingMapType);
5158 mresult = (struct encoding_map*)result;
5159 mresult->count2 = count2;
5160 mresult->count3 = count3;
5161 mlevel1 = mresult->level1;
5162 mlevel2 = mresult->level23;
5163 mlevel3 = mresult->level23 + 16*count2;
5164 memcpy(mlevel1, level1, 32);
5165 memset(mlevel2, 0xFF, 16*count2);
5166 memset(mlevel3, 0, 128*count3);
5167 count3 = 0;
5168 for (i = 1; i < 256; i++) {
5169 int o1, o2, o3, i2, i3;
5170 if (decode[i] == 0xFFFE)
5171 /* unmapped character */
5172 continue;
5173 o1 = decode[i]>>11;
5174 o2 = (decode[i]>>7) & 0xF;
5175 i2 = 16*mlevel1[o1] + o2;
5176 if (mlevel2[i2] == 0xFF)
5177 mlevel2[i2] = count3++;
5178 o3 = decode[i] & 0x7F;
5179 i3 = 128*mlevel2[i2] + o3;
5180 mlevel3[i3] = i;
5181 }
5182 return result;
5183}
5184
5185static int
5186encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5187{
5188 struct encoding_map *map = (struct encoding_map*)mapping;
5189 int l1 = c>>11;
5190 int l2 = (c>>7) & 0xF;
5191 int l3 = c & 0x7F;
5192 int i;
5193
5194#ifdef Py_UNICODE_WIDE
5195 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005197 }
5198#endif
5199 if (c == 0)
5200 return 0;
5201 /* level 1*/
5202 i = map->level1[l1];
5203 if (i == 0xFF) {
5204 return -1;
5205 }
5206 /* level 2*/
5207 i = map->level23[16*i+l2];
5208 if (i == 0xFF) {
5209 return -1;
5210 }
5211 /* level 3 */
5212 i = map->level23[16*map->count2 + 128*i + l3];
5213 if (i == 0) {
5214 return -1;
5215 }
5216 return i;
5217}
5218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005219/* Lookup the character ch in the mapping. If the character
5220 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005221 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223{
Christian Heimes217cfd12007-12-02 14:31:20 +00005224 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 PyObject *x;
5226
5227 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 x = PyObject_GetItem(mapping, w);
5230 Py_DECREF(w);
5231 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5233 /* No mapping found means: mapping is undefined. */
5234 PyErr_Clear();
5235 x = Py_None;
5236 Py_INCREF(x);
5237 return x;
5238 } else
5239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005241 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005243 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 long value = PyLong_AS_LONG(x);
5245 if (value < 0 || value > 255) {
5246 PyErr_SetString(PyExc_TypeError,
5247 "character mapping must be in range(256)");
5248 Py_DECREF(x);
5249 return NULL;
5250 }
5251 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005253 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 /* wrong return value */
5257 PyErr_Format(PyExc_TypeError,
5258 "character mapping must return integer, bytes or None, not %.400s",
5259 x->ob_type->tp_name);
5260 Py_DECREF(x);
5261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 }
5263}
5264
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005265static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005266charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005267{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005268 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5269 /* exponentially overallocate to minimize reallocations */
5270 if (requiredsize < 2*outsize)
5271 requiredsize = 2*outsize;
5272 if (_PyBytes_Resize(outobj, requiredsize))
5273 return -1;
5274 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005275}
5276
/* Outcome of encoding one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005280/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005281 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005282 space is available. Return a new reference to the object that
5283 was put in the output buffer, or Py_None, if the mapping was undefined
5284 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005285 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005286static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005287charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005289{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005290 PyObject *rep;
5291 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005292 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005293
Christian Heimes90aa7642007-12-19 02:45:37 +00005294 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005295 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005297 if (res == -1)
5298 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 if (outsize<requiredsize)
5300 if (charmapencode_resize(outobj, outpos, requiredsize))
5301 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005302 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 outstart[(*outpos)++] = (char)res;
5304 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005305 }
5306
5307 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005308 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005310 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 Py_DECREF(rep);
5312 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005313 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 if (PyLong_Check(rep)) {
5315 Py_ssize_t requiredsize = *outpos+1;
5316 if (outsize<requiredsize)
5317 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5318 Py_DECREF(rep);
5319 return enc_EXCEPTION;
5320 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005321 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005323 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 else {
5325 const char *repchars = PyBytes_AS_STRING(rep);
5326 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5327 Py_ssize_t requiredsize = *outpos+repsize;
5328 if (outsize<requiredsize)
5329 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5330 Py_DECREF(rep);
5331 return enc_EXCEPTION;
5332 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005333 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 memcpy(outstart + *outpos, repchars, repsize);
5335 *outpos += repsize;
5336 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005337 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005338 Py_DECREF(rep);
5339 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340}
5341
5342/* handle an error in PyUnicode_EncodeCharmap
5343 Return 0 on success, -1 on error */
5344static
5345int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005346 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005348 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005349 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350{
5351 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t repsize;
5353 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005354 Py_UNICODE *uni2;
5355 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005356 Py_ssize_t collstartpos = *inpos;
5357 Py_ssize_t collendpos = *inpos+1;
5358 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 char *encoding = "charmap";
5360 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005361 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 /* find all unencodable characters */
5364 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005365 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005366 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 int res = encoding_map_lookup(p[collendpos], mapping);
5368 if (res != -1)
5369 break;
5370 ++collendpos;
5371 continue;
5372 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005373
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 rep = charmapencode_lookup(p[collendpos], mapping);
5375 if (rep==NULL)
5376 return -1;
5377 else if (rep!=Py_None) {
5378 Py_DECREF(rep);
5379 break;
5380 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005381 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005383 }
5384 /* cache callback name lookup
5385 * (if not done yet, i.e. it's the first error) */
5386 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 if ((errors==NULL) || (!strcmp(errors, "strict")))
5388 *known_errorHandler = 1;
5389 else if (!strcmp(errors, "replace"))
5390 *known_errorHandler = 2;
5391 else if (!strcmp(errors, "ignore"))
5392 *known_errorHandler = 3;
5393 else if (!strcmp(errors, "xmlcharrefreplace"))
5394 *known_errorHandler = 4;
5395 else
5396 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005397 }
5398 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005399 case 1: /* strict */
5400 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5401 return -1;
5402 case 2: /* replace */
5403 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 x = charmapencode_output('?', mapping, res, respos);
5405 if (x==enc_EXCEPTION) {
5406 return -1;
5407 }
5408 else if (x==enc_FAILED) {
5409 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5410 return -1;
5411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005412 }
5413 /* fall through */
5414 case 3: /* ignore */
5415 *inpos = collendpos;
5416 break;
5417 case 4: /* xmlcharrefreplace */
5418 /* generate replacement (temporarily (mis)uses p) */
5419 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 char buffer[2+29+1+1];
5421 char *cp;
5422 sprintf(buffer, "&#%d;", (int)p[collpos]);
5423 for (cp = buffer; *cp; ++cp) {
5424 x = charmapencode_output(*cp, mapping, res, respos);
5425 if (x==enc_EXCEPTION)
5426 return -1;
5427 else if (x==enc_FAILED) {
5428 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5429 return -1;
5430 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005431 }
5432 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005433 *inpos = collendpos;
5434 break;
5435 default:
5436 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 encoding, reason, p, size, exceptionObject,
5438 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005439 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005441 if (PyBytes_Check(repunicode)) {
5442 /* Directly copy bytes result to output. */
5443 Py_ssize_t outsize = PyBytes_Size(*res);
5444 Py_ssize_t requiredsize;
5445 repsize = PyBytes_Size(repunicode);
5446 requiredsize = *respos + repsize;
5447 if (requiredsize > outsize)
5448 /* Make room for all additional bytes. */
5449 if (charmapencode_resize(res, respos, requiredsize)) {
5450 Py_DECREF(repunicode);
5451 return -1;
5452 }
5453 memcpy(PyBytes_AsString(*res) + *respos,
5454 PyBytes_AsString(repunicode), repsize);
5455 *respos += repsize;
5456 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005457 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005458 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005460 /* generate replacement */
5461 repsize = PyUnicode_GET_SIZE(repunicode);
5462 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 x = charmapencode_output(*uni2, mapping, res, respos);
5464 if (x==enc_EXCEPTION) {
5465 return -1;
5466 }
5467 else if (x==enc_FAILED) {
5468 Py_DECREF(repunicode);
5469 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5470 return -1;
5471 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005472 }
5473 *inpos = newpos;
5474 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 }
5476 return 0;
5477}
5478
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 Py_ssize_t size,
5481 PyObject *mapping,
5482 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005484 /* output object */
5485 PyObject *res = NULL;
5486 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005487 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005489 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005490 PyObject *errorHandler = NULL;
5491 PyObject *exc = NULL;
5492 /* the following variable is used for caching string comparisons
5493 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5494 * 3=ignore, 4=xmlcharrefreplace */
5495 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496
5497 /* Default to Latin-1 */
5498 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 /* allocate enough for a simple encoding without
5502 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005503 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504 if (res == NULL)
5505 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005506 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 /* try to encode it */
5511 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5512 if (x==enc_EXCEPTION) /* error */
5513 goto onError;
5514 if (x==enc_FAILED) { /* unencodable character */
5515 if (charmap_encoding_error(p, size, &inpos, mapping,
5516 &exc,
5517 &known_errorHandler, &errorHandler, errors,
5518 &res, &respos)) {
5519 goto onError;
5520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005521 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 else
5523 /* done with this character => adjust input position */
5524 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005527 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005528 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005529 if (_PyBytes_Resize(&res, respos) < 0)
5530 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005532 Py_XDECREF(exc);
5533 Py_XDECREF(errorHandler);
5534 return res;
5535
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005537 Py_XDECREF(res);
5538 Py_XDECREF(exc);
5539 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 return NULL;
5541}
5542
5543PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005544 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
5546 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 PyErr_BadArgument();
5548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 }
5550 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 PyUnicode_GET_SIZE(unicode),
5552 mapping,
5553 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554}
5555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005556/* create or adjust a UnicodeTranslateError */
5557static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 const Py_UNICODE *unicode, Py_ssize_t size,
5559 Py_ssize_t startpos, Py_ssize_t endpos,
5560 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005562 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005563 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 }
5566 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5568 goto onError;
5569 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5570 goto onError;
5571 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5572 goto onError;
5573 return;
5574 onError:
5575 Py_DECREF(*exceptionObject);
5576 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
5578}
5579
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580/* raises a UnicodeTranslateError */
5581static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 const Py_UNICODE *unicode, Py_ssize_t size,
5583 Py_ssize_t startpos, Py_ssize_t endpos,
5584 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585{
5586 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590}
5591
5592/* error handling callback helper:
5593 build arguments, call the callback and check the arguments,
5594 put the result into newpos and return the replacement string, which
5595 has to be freed by the caller */
5596static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 PyObject **errorHandler,
5598 const char *reason,
5599 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5600 Py_ssize_t startpos, Py_ssize_t endpos,
5601 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005603 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005604
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005605 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 PyObject *restuple;
5607 PyObject *resunicode;
5608
5609 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005612 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005613 }
5614
5615 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619
5620 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005625 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 Py_DECREF(restuple);
5627 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 }
5629 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 &resunicode, &i_newpos)) {
5631 Py_DECREF(restuple);
5632 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005634 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005636 else
5637 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005638 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5640 Py_DECREF(restuple);
5641 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005642 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 Py_INCREF(resunicode);
5644 Py_DECREF(restuple);
5645 return resunicode;
5646}
5647
5648/* Lookup the character ch in the mapping and put the result in result,
5649 which must be decrefed by the caller.
5650 Return 0 on success, -1 on error */
5651static
5652int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5653{
Christian Heimes217cfd12007-12-02 14:31:20 +00005654 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 PyObject *x;
5656
5657 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659 x = PyObject_GetItem(mapping, w);
5660 Py_DECREF(w);
5661 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5663 /* No mapping found means: use 1:1 mapping. */
5664 PyErr_Clear();
5665 *result = NULL;
5666 return 0;
5667 } else
5668 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005669 }
5670 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 *result = x;
5672 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005674 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 long value = PyLong_AS_LONG(x);
5676 long max = PyUnicode_GetMax();
5677 if (value < 0 || value > max) {
5678 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005679 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 Py_DECREF(x);
5681 return -1;
5682 }
5683 *result = x;
5684 return 0;
5685 }
5686 else if (PyUnicode_Check(x)) {
5687 *result = x;
5688 return 0;
5689 }
5690 else {
5691 /* wrong return value */
5692 PyErr_SetString(PyExc_TypeError,
5693 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005694 Py_DECREF(x);
5695 return -1;
5696 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697}
5698/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 if not reallocate and adjust various state variables.
5700 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701static
Walter Dörwald4894c302003-10-24 14:25:28 +00005702int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005705 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005706 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 /* remember old output position */
5708 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5709 /* exponentially overallocate to minimize reallocations */
5710 if (requiredsize < 2 * oldsize)
5711 requiredsize = 2 * oldsize;
5712 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5713 return -1;
5714 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 }
5716 return 0;
5717}
5718/* lookup the character, put the result in the output string and adjust
5719 various state variables. Return a new reference to the object that
5720 was put in the output buffer in *result, or Py_None, if the mapping was
5721 undefined (in which case no character was written).
5722 The called must decref result.
5723 Return 0 on success, -1 on error. */
5724static
Walter Dörwald4894c302003-10-24 14:25:28 +00005725int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5727 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728{
Walter Dörwald4894c302003-10-24 14:25:28 +00005729 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005732 /* not found => default to 1:1 mapping */
5733 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734 }
5735 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005737 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 /* no overflow check, because we know that the space is enough */
5739 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 }
5741 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5743 if (repsize==1) {
5744 /* no overflow check, because we know that the space is enough */
5745 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5746 }
5747 else if (repsize!=0) {
5748 /* more than one character */
5749 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5750 (insize - (curinp-startinp)) +
5751 repsize - 1;
5752 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5753 return -1;
5754 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5755 *outp += repsize;
5756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 }
5758 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005760 return 0;
5761}
5762
5763PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 Py_ssize_t size,
5765 PyObject *mapping,
5766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 /* output object */
5769 PyObject *res = NULL;
5770 /* pointers to the beginning and end+1 of input */
5771 const Py_UNICODE *startp = p;
5772 const Py_UNICODE *endp = p + size;
5773 /* pointer into the output */
5774 Py_UNICODE *str;
5775 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005776 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 char *reason = "character maps to <undefined>";
5778 PyObject *errorHandler = NULL;
5779 PyObject *exc = NULL;
5780 /* the following variable is used for caching string comparisons
5781 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5782 * 3=ignore, 4=xmlcharrefreplace */
5783 int known_errorHandler = -1;
5784
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 PyErr_BadArgument();
5787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789
5790 /* allocate enough for a simple 1:1 translation without
5791 replacements, if we need more, we'll resize */
5792 res = PyUnicode_FromUnicode(NULL, size);
5793 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 /* try to encode it */
5801 PyObject *x = NULL;
5802 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5803 Py_XDECREF(x);
5804 goto onError;
5805 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005806 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 if (x!=Py_None) /* it worked => adjust input pointer */
5808 ++p;
5809 else { /* untranslatable character */
5810 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5811 Py_ssize_t repsize;
5812 Py_ssize_t newpos;
5813 Py_UNICODE *uni2;
5814 /* startpos for collecting untranslatable chars */
5815 const Py_UNICODE *collstart = p;
5816 const Py_UNICODE *collend = p+1;
5817 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 /* find all untranslatable characters */
5820 while (collend < endp) {
5821 if (charmaptranslate_lookup(*collend, mapping, &x))
5822 goto onError;
5823 Py_XDECREF(x);
5824 if (x!=Py_None)
5825 break;
5826 ++collend;
5827 }
5828 /* cache callback name lookup
5829 * (if not done yet, i.e. it's the first error) */
5830 if (known_errorHandler==-1) {
5831 if ((errors==NULL) || (!strcmp(errors, "strict")))
5832 known_errorHandler = 1;
5833 else if (!strcmp(errors, "replace"))
5834 known_errorHandler = 2;
5835 else if (!strcmp(errors, "ignore"))
5836 known_errorHandler = 3;
5837 else if (!strcmp(errors, "xmlcharrefreplace"))
5838 known_errorHandler = 4;
5839 else
5840 known_errorHandler = 0;
5841 }
5842 switch (known_errorHandler) {
5843 case 1: /* strict */
5844 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005845 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 case 2: /* replace */
5847 /* No need to check for space, this is a 1:1 replacement */
5848 for (coll = collstart; coll<collend; ++coll)
5849 *str++ = '?';
5850 /* fall through */
5851 case 3: /* ignore */
5852 p = collend;
5853 break;
5854 case 4: /* xmlcharrefreplace */
5855 /* generate replacement (temporarily (mis)uses p) */
5856 for (p = collstart; p < collend; ++p) {
5857 char buffer[2+29+1+1];
5858 char *cp;
5859 sprintf(buffer, "&#%d;", (int)*p);
5860 if (charmaptranslate_makespace(&res, &str,
5861 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5862 goto onError;
5863 for (cp = buffer; *cp; ++cp)
5864 *str++ = *cp;
5865 }
5866 p = collend;
5867 break;
5868 default:
5869 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5870 reason, startp, size, &exc,
5871 collstart-startp, collend-startp, &newpos);
5872 if (repunicode == NULL)
5873 goto onError;
5874 /* generate replacement */
5875 repsize = PyUnicode_GET_SIZE(repunicode);
5876 if (charmaptranslate_makespace(&res, &str,
5877 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5878 Py_DECREF(repunicode);
5879 goto onError;
5880 }
5881 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5882 *str++ = *uni2;
5883 p = startp + newpos;
5884 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005885 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005886 }
5887 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888 /* Resize if we allocated to much */
5889 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005890 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 if (PyUnicode_Resize(&res, respos) < 0)
5892 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 }
5894 Py_XDECREF(exc);
5895 Py_XDECREF(errorHandler);
5896 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899 Py_XDECREF(res);
5900 Py_XDECREF(exc);
5901 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 return NULL;
5903}
5904
5905PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 PyObject *mapping,
5907 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908{
5909 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005910
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 str = PyUnicode_FromObject(str);
5912 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 PyUnicode_GET_SIZE(str),
5916 mapping,
5917 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 Py_DECREF(str);
5919 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005920
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 Py_XDECREF(str);
5923 return NULL;
5924}
Tim Petersced69f82003-09-16 20:30:58 +00005925
Guido van Rossum9e896b32000-04-05 20:11:21 +00005926/* --- Decimal Encoder ---------------------------------------------------- */
5927
5928int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 Py_ssize_t length,
5930 char *output,
5931 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005932{
5933 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005934 PyObject *errorHandler = NULL;
5935 PyObject *exc = NULL;
5936 const char *encoding = "decimal";
5937 const char *reason = "invalid decimal Unicode string";
5938 /* the following variable is used for caching string comparisons
5939 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5940 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005941
5942 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 PyErr_BadArgument();
5944 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005945 }
5946
5947 p = s;
5948 end = s + length;
5949 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 register Py_UNICODE ch = *p;
5951 int decimal;
5952 PyObject *repunicode;
5953 Py_ssize_t repsize;
5954 Py_ssize_t newpos;
5955 Py_UNICODE *uni2;
5956 Py_UNICODE *collstart;
5957 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005958
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005960 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 ++p;
5962 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005963 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 decimal = Py_UNICODE_TODECIMAL(ch);
5965 if (decimal >= 0) {
5966 *output++ = '0' + decimal;
5967 ++p;
5968 continue;
5969 }
5970 if (0 < ch && ch < 256) {
5971 *output++ = (char)ch;
5972 ++p;
5973 continue;
5974 }
5975 /* All other characters are considered unencodable */
5976 collstart = p;
5977 collend = p+1;
5978 while (collend < end) {
5979 if ((0 < *collend && *collend < 256) ||
5980 !Py_UNICODE_ISSPACE(*collend) ||
5981 Py_UNICODE_TODECIMAL(*collend))
5982 break;
5983 }
5984 /* cache callback name lookup
5985 * (if not done yet, i.e. it's the first error) */
5986 if (known_errorHandler==-1) {
5987 if ((errors==NULL) || (!strcmp(errors, "strict")))
5988 known_errorHandler = 1;
5989 else if (!strcmp(errors, "replace"))
5990 known_errorHandler = 2;
5991 else if (!strcmp(errors, "ignore"))
5992 known_errorHandler = 3;
5993 else if (!strcmp(errors, "xmlcharrefreplace"))
5994 known_errorHandler = 4;
5995 else
5996 known_errorHandler = 0;
5997 }
5998 switch (known_errorHandler) {
5999 case 1: /* strict */
6000 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6001 goto onError;
6002 case 2: /* replace */
6003 for (p = collstart; p < collend; ++p)
6004 *output++ = '?';
6005 /* fall through */
6006 case 3: /* ignore */
6007 p = collend;
6008 break;
6009 case 4: /* xmlcharrefreplace */
6010 /* generate replacement (temporarily (mis)uses p) */
6011 for (p = collstart; p < collend; ++p)
6012 output += sprintf(output, "&#%d;", (int)*p);
6013 p = collend;
6014 break;
6015 default:
6016 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6017 encoding, reason, s, length, &exc,
6018 collstart-s, collend-s, &newpos);
6019 if (repunicode == NULL)
6020 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006021 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006022 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006023 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6024 Py_DECREF(repunicode);
6025 goto onError;
6026 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 /* generate replacement */
6028 repsize = PyUnicode_GET_SIZE(repunicode);
6029 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6030 Py_UNICODE ch = *uni2;
6031 if (Py_UNICODE_ISSPACE(ch))
6032 *output++ = ' ';
6033 else {
6034 decimal = Py_UNICODE_TODECIMAL(ch);
6035 if (decimal >= 0)
6036 *output++ = '0' + decimal;
6037 else if (0 < ch && ch < 256)
6038 *output++ = (char)ch;
6039 else {
6040 Py_DECREF(repunicode);
6041 raise_encode_exception(&exc, encoding,
6042 s, length, collstart-s, collend-s, reason);
6043 goto onError;
6044 }
6045 }
6046 }
6047 p = s + newpos;
6048 Py_DECREF(repunicode);
6049 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006050 }
6051 /* 0-terminate the output string */
6052 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053 Py_XDECREF(exc);
6054 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006055 return 0;
6056
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 Py_XDECREF(exc);
6059 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006060 return -1;
6061}
6062
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063/* --- Helpers ------------------------------------------------------------ */
6064
Eric Smith8c663262007-08-25 02:26:07 +00006065#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006066#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006067
Thomas Wouters477c8d52006-05-27 19:21:47 +00006068#include "stringlib/count.h"
6069#include "stringlib/find.h"
6070#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006071#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006072
Eric Smith5807c412008-05-11 21:00:57 +00006073#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006074#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006075#include "stringlib/localeutil.h"
6076
/* Helper macro to fix up start/end slice values against a length `len`.

   `end` is clamped to `len` when too large; a negative `end` or `start`
   has `len` added to it and is then clamped to 0 if still negative.
   A `start` greater than `len` is left untouched.

   `start` and `end` must be modifiable lvalues; all three arguments may be
   evaluated more than once.  The body is wrapped in do { } while (0) so the
   macro behaves as a single statement (safe as an unbraced if/else body)
   and must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006091
Martin v. Löwis18e16552006-02-15 17:27:45 +00006092Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006093 PyObject *substr,
6094 Py_ssize_t start,
6095 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006097 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006098 PyUnicodeObject* str_obj;
6099 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006100
Thomas Wouters477c8d52006-05-27 19:21:47 +00006101 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6102 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006104 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6105 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 Py_DECREF(str_obj);
6107 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 }
Tim Petersced69f82003-09-16 20:30:58 +00006109
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006110 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006111 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006112 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6113 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006114 );
6115
6116 Py_DECREF(sub_obj);
6117 Py_DECREF(str_obj);
6118
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 return result;
6120}
6121
Martin v. Löwis18e16552006-02-15 17:27:45 +00006122Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006123 PyObject *sub,
6124 Py_ssize_t start,
6125 Py_ssize_t end,
6126 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006128 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006129
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006131 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006133 sub = PyUnicode_FromObject(sub);
6134 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 Py_DECREF(str);
6136 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 }
Tim Petersced69f82003-09-16 20:30:58 +00006138
Thomas Wouters477c8d52006-05-27 19:21:47 +00006139 if (direction > 0)
6140 result = stringlib_find_slice(
6141 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6142 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6143 start, end
6144 );
6145 else
6146 result = stringlib_rfind_slice(
6147 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6148 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6149 start, end
6150 );
6151
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006153 Py_DECREF(sub);
6154
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 return result;
6156}
6157
Tim Petersced69f82003-09-16 20:30:58 +00006158static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 PyUnicodeObject *substring,
6161 Py_ssize_t start,
6162 Py_ssize_t end,
6163 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 if (substring->length == 0)
6166 return 1;
6167
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006168 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 end -= substring->length;
6170 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172
6173 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 if (Py_UNICODE_MATCH(self, end, substring))
6175 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 } else {
6177 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 }
6180
6181 return 0;
6182}
6183
Martin v. Löwis18e16552006-02-15 17:27:45 +00006184Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 PyObject *substr,
6186 Py_ssize_t start,
6187 Py_ssize_t end,
6188 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006190 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006191
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 str = PyUnicode_FromObject(str);
6193 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 substr = PyUnicode_FromObject(substr);
6196 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 Py_DECREF(str);
6198 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 }
Tim Petersced69f82003-09-16 20:30:58 +00006200
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 (PyUnicodeObject *)substr,
6203 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 Py_DECREF(str);
6205 Py_DECREF(substr);
6206 return result;
6207}
6208
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209/* Apply fixfct filter to the Unicode object self and return a
6210 reference to the modified object */
6211
Tim Petersced69f82003-09-16 20:30:58 +00006212static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215{
6216
6217 PyUnicodeObject *u;
6218
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006219 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006222
6223 Py_UNICODE_COPY(u->str, self->str, self->length);
6224
Tim Peters7a29bd52001-09-12 03:03:31 +00006225 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 /* fixfct should return TRUE if it modified the buffer. If
6227 FALSE, return a reference to the original buffer instead
6228 (to save space, not time) */
6229 Py_INCREF(self);
6230 Py_DECREF(u);
6231 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 }
6233 return (PyObject*) u;
6234}
6235
Tim Petersced69f82003-09-16 20:30:58 +00006236static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237int fixupper(PyUnicodeObject *self)
6238{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006239 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 Py_UNICODE *s = self->str;
6241 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006242
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006245
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 ch = Py_UNICODE_TOUPPER(*s);
6247 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 *s = ch;
6250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 s++;
6252 }
6253
6254 return status;
6255}
6256
Tim Petersced69f82003-09-16 20:30:58 +00006257static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258int fixlower(PyUnicodeObject *self)
6259{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006260 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 Py_UNICODE *s = self->str;
6262 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006263
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006266
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 ch = Py_UNICODE_TOLOWER(*s);
6268 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 *s = ch;
6271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 s++;
6273 }
6274
6275 return status;
6276}
6277
Tim Petersced69f82003-09-16 20:30:58 +00006278static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279int fixswapcase(PyUnicodeObject *self)
6280{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006281 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 Py_UNICODE *s = self->str;
6283 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006284
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 while (len-- > 0) {
6286 if (Py_UNICODE_ISUPPER(*s)) {
6287 *s = Py_UNICODE_TOLOWER(*s);
6288 status = 1;
6289 } else if (Py_UNICODE_ISLOWER(*s)) {
6290 *s = Py_UNICODE_TOUPPER(*s);
6291 status = 1;
6292 }
6293 s++;
6294 }
6295
6296 return status;
6297}
6298
Tim Petersced69f82003-09-16 20:30:58 +00006299static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300int fixcapitalize(PyUnicodeObject *self)
6301{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006302 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006303 Py_UNICODE *s = self->str;
6304 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006305
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006306 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006308 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 *s = Py_UNICODE_TOUPPER(*s);
6310 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006312 s++;
6313 while (--len > 0) {
6314 if (Py_UNICODE_ISUPPER(*s)) {
6315 *s = Py_UNICODE_TOLOWER(*s);
6316 status = 1;
6317 }
6318 s++;
6319 }
6320 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321}
6322
6323static
6324int fixtitle(PyUnicodeObject *self)
6325{
6326 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6327 register Py_UNICODE *e;
6328 int previous_is_cased;
6329
6330 /* Shortcut for single character strings */
6331 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6333 if (*p != ch) {
6334 *p = ch;
6335 return 1;
6336 }
6337 else
6338 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 }
Tim Petersced69f82003-09-16 20:30:58 +00006340
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 e = p + PyUnicode_GET_SIZE(self);
6342 previous_is_cased = 0;
6343 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006345
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 if (previous_is_cased)
6347 *p = Py_UNICODE_TOLOWER(ch);
6348 else
6349 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006350
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 if (Py_UNICODE_ISLOWER(ch) ||
6352 Py_UNICODE_ISUPPER(ch) ||
6353 Py_UNICODE_ISTITLE(ch))
6354 previous_is_cased = 1;
6355 else
6356 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 }
6358 return 1;
6359}
6360
Tim Peters8ce9f162004-08-27 01:49:32 +00006361PyObject *
6362PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363{
Skip Montanaro6543b452004-09-16 03:28:13 +00006364 const Py_UNICODE blank = ' ';
6365 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006366 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006367 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006368 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6369 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006370 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6371 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006372 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006373 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374
Tim Peters05eba1f2004-08-27 21:32:02 +00006375 fseq = PySequence_Fast(seq, "");
6376 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006377 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006378 }
6379
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006380 /* NOTE: the following code can't call back into Python code,
6381 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006382 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006383
Tim Peters05eba1f2004-08-27 21:32:02 +00006384 seqlen = PySequence_Fast_GET_SIZE(fseq);
6385 /* If empty sequence, return u"". */
6386 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006387 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6388 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006389 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006390 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006391 /* If singleton sequence with an exact Unicode, return that. */
6392 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 item = items[0];
6394 if (PyUnicode_CheckExact(item)) {
6395 Py_INCREF(item);
6396 res = (PyUnicodeObject *)item;
6397 goto Done;
6398 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006399 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006400 else {
6401 /* Set up sep and seplen */
6402 if (separator == NULL) {
6403 sep = &blank;
6404 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006405 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006406 else {
6407 if (!PyUnicode_Check(separator)) {
6408 PyErr_Format(PyExc_TypeError,
6409 "separator: expected str instance,"
6410 " %.80s found",
6411 Py_TYPE(separator)->tp_name);
6412 goto onError;
6413 }
6414 sep = PyUnicode_AS_UNICODE(separator);
6415 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006416 }
6417 }
6418
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006419 /* There are at least two things to join, or else we have a subclass
6420 * of str in the sequence.
6421 * Do a pre-pass to figure out the total amount of space we'll
6422 * need (sz), and see whether all argument are strings.
6423 */
6424 sz = 0;
6425 for (i = 0; i < seqlen; i++) {
6426 const Py_ssize_t old_sz = sz;
6427 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 if (!PyUnicode_Check(item)) {
6429 PyErr_Format(PyExc_TypeError,
6430 "sequence item %zd: expected str instance,"
6431 " %.80s found",
6432 i, Py_TYPE(item)->tp_name);
6433 goto onError;
6434 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006435 sz += PyUnicode_GET_SIZE(item);
6436 if (i != 0)
6437 sz += seplen;
6438 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6439 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006440 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006441 goto onError;
6442 }
6443 }
Tim Petersced69f82003-09-16 20:30:58 +00006444
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006445 res = _PyUnicode_New(sz);
6446 if (res == NULL)
6447 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006448
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006449 /* Catenate everything. */
6450 res_p = PyUnicode_AS_UNICODE(res);
6451 for (i = 0; i < seqlen; ++i) {
6452 Py_ssize_t itemlen;
6453 item = items[i];
6454 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 /* Copy item, and maybe the separator. */
6456 if (i) {
6457 Py_UNICODE_COPY(res_p, sep, seplen);
6458 res_p += seplen;
6459 }
6460 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6461 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006462 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006463
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006465 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 return (PyObject *)res;
6467
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006469 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006470 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 return NULL;
6472}
6473
Tim Petersced69f82003-09-16 20:30:58 +00006474static
6475PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 Py_ssize_t left,
6477 Py_ssize_t right,
6478 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479{
6480 PyUnicodeObject *u;
6481
6482 if (left < 0)
6483 left = 0;
6484 if (right < 0)
6485 right = 0;
6486
Tim Peters7a29bd52001-09-12 03:03:31 +00006487 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 Py_INCREF(self);
6489 return self;
6490 }
6491
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006492 if (left > PY_SSIZE_T_MAX - self->length ||
6493 right > PY_SSIZE_T_MAX - (left + self->length)) {
6494 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6495 return NULL;
6496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 u = _PyUnicode_New(left + self->length + right);
6498 if (u) {
6499 if (left)
6500 Py_UNICODE_FILL(u->str, fill, left);
6501 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6502 if (right)
6503 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6504 }
6505
6506 return u;
6507}
6508
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006509PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512
6513 string = PyUnicode_FromObject(string);
6514 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006517 list = stringlib_splitlines(
6518 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6519 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
6521 Py_DECREF(string);
6522 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523}
6524
Tim Petersced69f82003-09-16 20:30:58 +00006525static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 PyUnicodeObject *substring,
6528 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006531 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006534 return stringlib_split_whitespace(
6535 (PyObject*) self, self->str, self->length, maxcount
6536 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006538 return stringlib_split(
6539 (PyObject*) self, self->str, self->length,
6540 substring->str, substring->length,
6541 maxcount
6542 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543}
6544
Tim Petersced69f82003-09-16 20:30:58 +00006545static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006546PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 PyUnicodeObject *substring,
6548 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006549{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006550 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006551 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006552
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006553 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006554 return stringlib_rsplit_whitespace(
6555 (PyObject*) self, self->str, self->length, maxcount
6556 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006557
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006558 return stringlib_rsplit(
6559 (PyObject*) self, self->str, self->length,
6560 substring->str, substring->length,
6561 maxcount
6562 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006563}
6564
6565static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 PyUnicodeObject *str1,
6568 PyUnicodeObject *str2,
6569 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570{
6571 PyUnicodeObject *u;
6572
6573 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006575 else if (maxcount == 0 || self->length == 0)
6576 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577
Thomas Wouters477c8d52006-05-27 19:21:47 +00006578 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006579 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006580 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006581 if (str1->length == 0)
6582 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006583 if (str1->length == 1) {
6584 /* replace characters */
6585 Py_UNICODE u1, u2;
6586 if (!findchar(self->str, self->length, str1->str[0]))
6587 goto nothing;
6588 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6589 if (!u)
6590 return NULL;
6591 Py_UNICODE_COPY(u->str, self->str, self->length);
6592 u1 = str1->str[0];
6593 u2 = str2->str[0];
6594 for (i = 0; i < u->length; i++)
6595 if (u->str[i] == u1) {
6596 if (--maxcount < 0)
6597 break;
6598 u->str[i] = u2;
6599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006601 i = stringlib_find(
6602 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006604 if (i < 0)
6605 goto nothing;
6606 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6607 if (!u)
6608 return NULL;
6609 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006610
6611 /* change everything in-place, starting with this one */
6612 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6613 i += str1->length;
6614
6615 while ( --maxcount > 0) {
6616 i = stringlib_find(self->str+i, self->length-i,
6617 str1->str, str1->length,
6618 i);
6619 if (i == -1)
6620 break;
6621 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6622 i += str1->length;
6623 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006626
6627 Py_ssize_t n, i, j, e;
6628 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 Py_UNICODE *p;
6630
6631 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006632 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6633 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006634 if (n == 0)
6635 goto nothing;
6636 /* new_size = self->length + n * (str2->length - str1->length)); */
6637 delta = (str2->length - str1->length);
6638 if (delta == 0) {
6639 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006641 product = n * (str2->length - str1->length);
6642 if ((product / (str2->length - str1->length)) != n) {
6643 PyErr_SetString(PyExc_OverflowError,
6644 "replace string is too long");
6645 return NULL;
6646 }
6647 new_size = self->length + product;
6648 if (new_size < 0) {
6649 PyErr_SetString(PyExc_OverflowError,
6650 "replace string is too long");
6651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 }
6653 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006654 u = _PyUnicode_New(new_size);
6655 if (!u)
6656 return NULL;
6657 i = 0;
6658 p = u->str;
6659 e = self->length - str1->length;
6660 if (str1->length > 0) {
6661 while (n-- > 0) {
6662 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006663 j = stringlib_find(self->str+i, self->length-i,
6664 str1->str, str1->length,
6665 i);
6666 if (j == -1)
6667 break;
6668 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006669 /* copy unchanged part [i:j] */
6670 Py_UNICODE_COPY(p, self->str+i, j-i);
6671 p += j - i;
6672 }
6673 /* copy substitution string */
6674 if (str2->length > 0) {
6675 Py_UNICODE_COPY(p, str2->str, str2->length);
6676 p += str2->length;
6677 }
6678 i = j + str1->length;
6679 }
6680 if (i < self->length)
6681 /* copy tail [i:] */
6682 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6683 } else {
6684 /* interleave */
6685 while (n > 0) {
6686 Py_UNICODE_COPY(p, str2->str, str2->length);
6687 p += str2->length;
6688 if (--n <= 0)
6689 break;
6690 *p++ = self->str[i++];
6691 }
6692 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6693 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006696
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006698 /* nothing to replace; return original string (when possible) */
6699 if (PyUnicode_CheckExact(self)) {
6700 Py_INCREF(self);
6701 return (PyObject *) self;
6702 }
6703 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704}
6705
6706/* --- Unicode Object Methods --------------------------------------------- */
6707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006708PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710\n\
6711Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006712characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
6714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006715unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 return fixup(self, fixtitle);
6718}
6719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006720PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722\n\
6723Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006724have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725
6726static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006727unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 return fixup(self, fixcapitalize);
6730}
6731
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self into a list, replaces each list entry by
   the result of fixup(entry, fixcapitalize), then joins the list.
   Returns NULL if the split or any per-word fixup fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Break the string into a list of words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every entry for its capitalized counterpart */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        /* drop the old entry before the slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Glue the words back together into one string */
    result = PyUnicode_Join(NULL, words);

  onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
6769
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006770/* Argument converter. Coerces to a single unicode character */
6771
6772static int
6773convert_uc(PyObject *obj, void *addr)
6774{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006775 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6776 PyObject *uniobj;
6777 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006778
Benjamin Peterson14339b62009-01-31 16:36:08 +00006779 uniobj = PyUnicode_FromObject(obj);
6780 if (uniobj == NULL) {
6781 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006783 return 0;
6784 }
6785 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6786 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006788 Py_DECREF(uniobj);
6789 return 0;
6790 }
6791 unistr = PyUnicode_AS_UNICODE(uniobj);
6792 *fillcharloc = unistr[0];
6793 Py_DECREF(uniobj);
6794 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006795}
6796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006797PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006800Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006801done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
6803static PyObject *
6804unicode_center(PyUnicodeObject *self, PyObject *args)
6805{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006806 Py_ssize_t marg, left;
6807 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006808 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809
Thomas Woutersde017742006-02-16 19:34:37 +00006810 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 return NULL;
6812
Tim Peters7a29bd52001-09-12 03:03:31 +00006813 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 Py_INCREF(self);
6815 return (PyObject*) self;
6816 }
6817
6818 marg = width - self->length;
6819 left = marg / 2 + (marg & width & 1);
6820
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006821 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822}
6823
Marc-André Lemburge5034372000-08-08 08:04:29 +00006824#if 0
6825
6826/* This code should go into some future Unicode collation support
6827 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006828 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006829
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006830/* speedy UTF-16 code point order comparison */
6831/* gleaned from: */
6832/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6833
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006834static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006835{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006836 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006837 0, 0, 0, 0, 0, 0, 0, 0,
6838 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006839 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006840};
6841
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842static int
6843unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6844{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006845 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006846
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 Py_UNICODE *s1 = str1->str;
6848 Py_UNICODE *s2 = str2->str;
6849
6850 len1 = str1->length;
6851 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006852
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006854 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006855
6856 c1 = *s1++;
6857 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006858
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 if (c1 > (1<<11) * 26)
6860 c1 += utf16Fixup[c1>>11];
6861 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006862 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006863 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006864
6865 if (c1 != c2)
6866 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006867
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006868 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 }
6870
6871 return (len1 < len2) ? -1 : (len1 != len2);
6872}
6873
Marc-André Lemburge5034372000-08-08 08:04:29 +00006874#else
6875
6876static int
6877unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6878{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006879 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006880
6881 Py_UNICODE *s1 = str1->str;
6882 Py_UNICODE *s2 = str2->str;
6883
6884 len1 = str1->length;
6885 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006886
Marc-André Lemburge5034372000-08-08 08:04:29 +00006887 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006888 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006889
Fredrik Lundh45714e92001-06-26 16:39:36 +00006890 c1 = *s1++;
6891 c2 = *s2++;
6892
6893 if (c1 != c2)
6894 return (c1 < c2) ? -1 : 1;
6895
Marc-André Lemburge5034372000-08-08 08:04:29 +00006896 len1--; len2--;
6897 }
6898
6899 return (len1 < len2) ? -1 : (len1 != len2);
6900}
6901
6902#endif
6903
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006907 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6908 return unicode_compare((PyUnicodeObject *)left,
6909 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006910 PyErr_Format(PyExc_TypeError,
6911 "Can't compare %.100s and %.100s",
6912 left->ob_type->tp_name,
6913 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 return -1;
6915}
6916
Martin v. Löwis5b222132007-06-10 09:51:05 +00006917int
6918PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6919{
6920 int i;
6921 Py_UNICODE *id;
6922 assert(PyUnicode_Check(uni));
6923 id = PyUnicode_AS_UNICODE(uni);
6924 /* Compare Unicode string and source character set string */
6925 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 if (id[i] != str[i])
6927 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006928 /* This check keeps Python strings that end in '\0' from comparing equal
6929 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006930 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006932 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006934 return 0;
6935}
6936
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006937
Benjamin Peterson29060642009-01-31 22:14:21 +00006938#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006939 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006940
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006941PyObject *PyUnicode_RichCompare(PyObject *left,
6942 PyObject *right,
6943 int op)
6944{
6945 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006946
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006947 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6948 PyObject *v;
6949 if (((PyUnicodeObject *) left)->length !=
6950 ((PyUnicodeObject *) right)->length) {
6951 if (op == Py_EQ) {
6952 Py_INCREF(Py_False);
6953 return Py_False;
6954 }
6955 if (op == Py_NE) {
6956 Py_INCREF(Py_True);
6957 return Py_True;
6958 }
6959 }
6960 if (left == right)
6961 result = 0;
6962 else
6963 result = unicode_compare((PyUnicodeObject *)left,
6964 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006965
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006966 /* Convert the return value to a Boolean */
6967 switch (op) {
6968 case Py_EQ:
6969 v = TEST_COND(result == 0);
6970 break;
6971 case Py_NE:
6972 v = TEST_COND(result != 0);
6973 break;
6974 case Py_LE:
6975 v = TEST_COND(result <= 0);
6976 break;
6977 case Py_GE:
6978 v = TEST_COND(result >= 0);
6979 break;
6980 case Py_LT:
6981 v = TEST_COND(result == -1);
6982 break;
6983 case Py_GT:
6984 v = TEST_COND(result == 1);
6985 break;
6986 default:
6987 PyErr_BadArgument();
6988 return NULL;
6989 }
6990 Py_INCREF(v);
6991 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006992 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006993
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006994 Py_INCREF(Py_NotImplemented);
6995 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006996}
6997
Guido van Rossum403d68b2000-03-13 15:55:09 +00006998int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00006999 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007000{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007001 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007002 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007003
7004 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007005 sub = PyUnicode_FromObject(element);
7006 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 PyErr_Format(PyExc_TypeError,
7008 "'in <string>' requires string as left operand, not %s",
7009 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007010 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007011 }
7012
Thomas Wouters477c8d52006-05-27 19:21:47 +00007013 str = PyUnicode_FromObject(container);
7014 if (!str) {
7015 Py_DECREF(sub);
7016 return -1;
7017 }
7018
7019 result = stringlib_contains_obj(str, sub);
7020
7021 Py_DECREF(str);
7022 Py_DECREF(sub);
7023
Guido van Rossum403d68b2000-03-13 15:55:09 +00007024 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007025}
7026
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027/* Concat to string or Unicode object giving a new Unicode object. */
7028
7029PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031{
7032 PyUnicodeObject *u = NULL, *v = NULL, *w;
7033
7034 /* Coerce the two arguments */
7035 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7036 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7039 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041
7042 /* Shortcuts */
7043 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 Py_DECREF(v);
7045 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 }
7047 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 Py_DECREF(u);
7049 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 }
7051
7052 /* Concat the two Unicode strings */
7053 w = _PyUnicode_New(u->length + v->length);
7054 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 Py_UNICODE_COPY(w->str, u->str, u->length);
7057 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7058
7059 Py_DECREF(u);
7060 Py_DECREF(v);
7061 return (PyObject *)w;
7062
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 Py_XDECREF(u);
7065 Py_XDECREF(v);
7066 return NULL;
7067}
7068
Walter Dörwald1ab83302007-05-18 17:15:44 +00007069void
7070PyUnicode_Append(PyObject **pleft, PyObject *right)
7071{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007072 PyObject *new;
7073 if (*pleft == NULL)
7074 return;
7075 if (right == NULL || !PyUnicode_Check(*pleft)) {
7076 Py_DECREF(*pleft);
7077 *pleft = NULL;
7078 return;
7079 }
7080 new = PyUnicode_Concat(*pleft, right);
7081 Py_DECREF(*pleft);
7082 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007083}
7084
7085void
7086PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7087{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007088 PyUnicode_Append(pleft, right);
7089 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007090}
7091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007092PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007095Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007096string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007097interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098
7099static PyObject *
7100unicode_count(PyUnicodeObject *self, PyObject *args)
7101{
7102 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007103 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007104 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105 PyObject *result;
7106
Guido van Rossumb8872e62000-05-09 14:14:27 +00007107 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007108 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 return NULL;
7110
7111 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007112 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007115
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007116 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007117 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007118 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007119 substring->str, substring->length,
7120 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007121 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122
7123 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 return result;
7126}
7127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007128PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007131Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007132to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007133handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007134a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7135'xmlcharrefreplace' as well as any other name registered with\n\
7136codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137
7138static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007139unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007141 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 char *encoding = NULL;
7143 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007144 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007145
Benjamin Peterson308d6372009-09-18 21:42:35 +00007146 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7147 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007149 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007150 if (v == NULL)
7151 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007152 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007153 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007154 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007155 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007156 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007157 Py_DECREF(v);
7158 return NULL;
7159 }
7160 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007161
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007163 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007164}
7165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007166PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168\n\
7169Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007170If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171
7172static PyObject*
7173unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7174{
7175 Py_UNICODE *e;
7176 Py_UNICODE *p;
7177 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007178 Py_UNICODE *qe;
7179 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180 PyUnicodeObject *u;
7181 int tabsize = 8;
7182
7183 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185
Thomas Wouters7e474022000-07-16 12:04:32 +00007186 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007187 i = 0; /* chars up to and including most recent \n or \r */
7188 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7189 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 for (p = self->str; p < e; p++)
7191 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 if (tabsize > 0) {
7193 incr = tabsize - (j % tabsize); /* cannot overflow */
7194 if (j > PY_SSIZE_T_MAX - incr)
7195 goto overflow1;
7196 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007197 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 if (j > PY_SSIZE_T_MAX - 1)
7201 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 j++;
7203 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 if (i > PY_SSIZE_T_MAX - j)
7205 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007207 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208 }
7209 }
7210
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007211 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007213
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 /* Second pass: create output string and fill it */
7215 u = _PyUnicode_New(i + j);
7216 if (!u)
7217 return NULL;
7218
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007219 j = 0; /* same as in first pass */
7220 q = u->str; /* next output char */
7221 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222
7223 for (p = self->str; p < e; p++)
7224 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 if (tabsize > 0) {
7226 i = tabsize - (j % tabsize);
7227 j += i;
7228 while (i--) {
7229 if (q >= qe)
7230 goto overflow2;
7231 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007232 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007234 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 else {
7236 if (q >= qe)
7237 goto overflow2;
7238 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007239 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 if (*p == '\n' || *p == '\r')
7241 j = 0;
7242 }
7243
7244 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007245
7246 overflow2:
7247 Py_DECREF(u);
7248 overflow1:
7249 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251}
7252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007253PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255\n\
7256Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007257such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258arguments start and end are interpreted as in slice notation.\n\
7259\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007260Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261
7262static PyObject *
7263unicode_find(PyUnicodeObject *self, PyObject *args)
7264{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007265 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007266 Py_ssize_t start;
7267 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007268 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269
Christian Heimes9cd17752007-11-18 19:35:23 +00007270 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272
Thomas Wouters477c8d52006-05-27 19:21:47 +00007273 result = stringlib_find_slice(
7274 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7275 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7276 start, end
7277 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278
7279 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007280
Christian Heimes217cfd12007-12-02 14:31:20 +00007281 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282}
7283
7284static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007285unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286{
7287 if (index < 0 || index >= self->length) {
7288 PyErr_SetString(PyExc_IndexError, "string index out of range");
7289 return NULL;
7290 }
7291
7292 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7293}
7294
Guido van Rossumc2504932007-09-18 19:42:40 +00007295/* Believe it or not, this produces the same value for ASCII strings
7296 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007298unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299{
Guido van Rossumc2504932007-09-18 19:42:40 +00007300 Py_ssize_t len;
7301 Py_UNICODE *p;
7302 long x;
7303
7304 if (self->hash != -1)
7305 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007306 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007307 p = self->str;
7308 x = *p << 7;
7309 while (--len >= 0)
7310 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007311 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007312 if (x == -1)
7313 x = -2;
7314 self->hash = x;
7315 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316}
7317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007318PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007321Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322
7323static PyObject *
7324unicode_index(PyUnicodeObject *self, PyObject *args)
7325{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007326 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007327 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007328 Py_ssize_t start;
7329 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330
Christian Heimes9cd17752007-11-18 19:35:23 +00007331 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007332 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333
Thomas Wouters477c8d52006-05-27 19:21:47 +00007334 result = stringlib_find_slice(
7335 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7336 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7337 start, end
7338 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339
7340 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007341
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 if (result < 0) {
7343 PyErr_SetString(PyExc_ValueError, "substring not found");
7344 return NULL;
7345 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007346
Christian Heimes217cfd12007-12-02 14:31:20 +00007347 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348}
7349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007350PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007353Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007354at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355
7356static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007357unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358{
7359 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7360 register const Py_UNICODE *e;
7361 int cased;
7362
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 /* Shortcut for single character strings */
7364 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007367 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007368 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007370
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 e = p + PyUnicode_GET_SIZE(self);
7372 cased = 0;
7373 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007375
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7377 return PyBool_FromLong(0);
7378 else if (!cased && Py_UNICODE_ISLOWER(ch))
7379 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007381 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382}
7383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007384PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007387Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007388at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389
7390static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007391unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392{
7393 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7394 register const Py_UNICODE *e;
7395 int cased;
7396
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397 /* Shortcut for single character strings */
7398 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007401 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007402 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007404
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 e = p + PyUnicode_GET_SIZE(self);
7406 cased = 0;
7407 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007409
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7411 return PyBool_FromLong(0);
7412 else if (!cased && Py_UNICODE_ISUPPER(ch))
7413 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007415 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416}
7417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007418PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007421Return True if S is a titlecased string and there is at least one\n\
7422character in S, i.e. upper- and titlecase characters may only\n\
7423follow uncased characters and lowercase characters only cased ones.\n\
7424Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425
7426static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007427unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428{
7429 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7430 register const Py_UNICODE *e;
7431 int cased, previous_is_cased;
7432
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433 /* Shortcut for single character strings */
7434 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7436 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007438 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007439 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007441
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442 e = p + PyUnicode_GET_SIZE(self);
7443 cased = 0;
7444 previous_is_cased = 0;
7445 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007447
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7449 if (previous_is_cased)
7450 return PyBool_FromLong(0);
7451 previous_is_cased = 1;
7452 cased = 1;
7453 }
7454 else if (Py_UNICODE_ISLOWER(ch)) {
7455 if (!previous_is_cased)
7456 return PyBool_FromLong(0);
7457 previous_is_cased = 1;
7458 cased = 1;
7459 }
7460 else
7461 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007463 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464}
7465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007466PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007469Return True if all characters in S are whitespace\n\
7470and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
7472static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007473unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474{
7475 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7476 register const Py_UNICODE *e;
7477
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478 /* Shortcut for single character strings */
7479 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 Py_UNICODE_ISSPACE(*p))
7481 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007483 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007484 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007486
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487 e = p + PyUnicode_GET_SIZE(self);
7488 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 if (!Py_UNICODE_ISSPACE(*p))
7490 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007492 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493}
7494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007495PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007497\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007498Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007499and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007500
7501static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007502unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007503{
7504 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7505 register const Py_UNICODE *e;
7506
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007507 /* Shortcut for single character strings */
7508 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 Py_UNICODE_ISALPHA(*p))
7510 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007511
7512 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007513 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007515
7516 e = p + PyUnicode_GET_SIZE(self);
7517 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 if (!Py_UNICODE_ISALPHA(*p))
7519 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007520 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007521 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007522}
7523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007524PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007526\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007527Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007528and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007529
7530static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007531unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007532{
7533 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7534 register const Py_UNICODE *e;
7535
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007536 /* Shortcut for single character strings */
7537 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 Py_UNICODE_ISALNUM(*p))
7539 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007540
7541 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007542 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007544
7545 e = p + PyUnicode_GET_SIZE(self);
7546 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 if (!Py_UNICODE_ISALNUM(*p))
7548 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007549 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007550 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007551}
7552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007553PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007556Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007557False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558
7559static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007560unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561{
7562 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7563 register const Py_UNICODE *e;
7564
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 /* Shortcut for single character strings */
7566 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 Py_UNICODE_ISDECIMAL(*p))
7568 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007570 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007571 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007573
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 e = p + PyUnicode_GET_SIZE(self);
7575 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 if (!Py_UNICODE_ISDECIMAL(*p))
7577 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007579 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580}
7581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007582PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007585Return True if all characters in S are digits\n\
7586and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
7588static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007589unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590{
7591 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7592 register const Py_UNICODE *e;
7593
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 /* Shortcut for single character strings */
7595 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 Py_UNICODE_ISDIGIT(*p))
7597 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007599 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007600 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007602
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603 e = p + PyUnicode_GET_SIZE(self);
7604 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 if (!Py_UNICODE_ISDIGIT(*p))
7606 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007608 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609}
7610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007611PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007614Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616
7617static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007618unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619{
7620 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7621 register const Py_UNICODE *e;
7622
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623 /* Shortcut for single character strings */
7624 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 Py_UNICODE_ISNUMERIC(*p))
7626 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007628 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007629 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007631
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 e = p + PyUnicode_GET_SIZE(self);
7633 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 if (!Py_UNICODE_ISNUMERIC(*p))
7635 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007637 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638}
7639
Martin v. Löwis47383402007-08-15 07:32:56 +00007640int
7641PyUnicode_IsIdentifier(PyObject *self)
7642{
7643 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7644 register const Py_UNICODE *e;
7645
7646 /* Special case for empty strings */
7647 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007649
7650 /* PEP 3131 says that the first character must be in
7651 XID_Start and subsequent characters in XID_Continue,
7652 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007653 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007654 letters, digits, underscore). However, given the current
7655 definition of XID_Start and XID_Continue, it is sufficient
7656 to check just for these, except that _ must be allowed
7657 as starting an identifier. */
7658 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7659 return 0;
7660
7661 e = p + PyUnicode_GET_SIZE(self);
7662 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 if (!_PyUnicode_IsXidContinue(*p))
7664 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007665 }
7666 return 1;
7667}
7668
7669PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007671\n\
7672Return True if S is a valid identifier according\n\
7673to the language definition.");
7674
7675static PyObject*
7676unicode_isidentifier(PyObject *self)
7677{
7678 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7679}
7680
Georg Brandl559e5d72008-06-11 18:37:52 +00007681PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007683\n\
7684Return True if all characters in S are considered\n\
7685printable in repr() or S is empty, False otherwise.");
7686
7687static PyObject*
7688unicode_isprintable(PyObject *self)
7689{
7690 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7691 register const Py_UNICODE *e;
7692
7693 /* Shortcut for single character strings */
7694 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7695 Py_RETURN_TRUE;
7696 }
7697
7698 e = p + PyUnicode_GET_SIZE(self);
7699 for (; p < e; p++) {
7700 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7701 Py_RETURN_FALSE;
7702 }
7703 }
7704 Py_RETURN_TRUE;
7705}
7706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007707PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007708 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709\n\
7710Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007711iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712
7713static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007714unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007716 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717}
7718
Martin v. Löwis18e16552006-02-15 17:27:45 +00007719static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720unicode_length(PyUnicodeObject *self)
7721{
7722 return self->length;
7723}
7724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007725PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007728Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007729done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
7731static PyObject *
7732unicode_ljust(PyUnicodeObject *self, PyObject *args)
7733{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007734 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007735 Py_UNICODE fillchar = ' ';
7736
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007737 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 return NULL;
7739
Tim Peters7a29bd52001-09-12 03:03:31 +00007740 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 Py_INCREF(self);
7742 return (PyObject*) self;
7743 }
7744
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007745 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746}
7747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007751Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
7753static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007754unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 return fixup(self, fixlower);
7757}
7758
/* Strip modes: which end(s) of the string the strip helpers work on.
   The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Argument format strings, one per strip mode: a single optional object
   argument, followed by the method name after the ':'. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7767
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007768/* externally visible for str.strip(unicode) */
7769PyObject *
7770_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7771{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7773 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7774 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7775 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7776 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007777
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007779
Benjamin Peterson14339b62009-01-31 16:36:08 +00007780 i = 0;
7781 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7783 i++;
7784 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007786
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787 j = len;
7788 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 do {
7790 j--;
7791 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7792 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007793 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007794
Benjamin Peterson14339b62009-01-31 16:36:08 +00007795 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 Py_INCREF(self);
7797 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007798 }
7799 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007801}
7802
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803
7804static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007805do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007807 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7808 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007809
Benjamin Peterson14339b62009-01-31 16:36:08 +00007810 i = 0;
7811 if (striptype != RIGHTSTRIP) {
7812 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7813 i++;
7814 }
7815 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007816
Benjamin Peterson14339b62009-01-31 16:36:08 +00007817 j = len;
7818 if (striptype != LEFTSTRIP) {
7819 do {
7820 j--;
7821 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7822 j++;
7823 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007824
Benjamin Peterson14339b62009-01-31 16:36:08 +00007825 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7826 Py_INCREF(self);
7827 return (PyObject*)self;
7828 }
7829 else
7830 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831}
7832
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007833
7834static PyObject *
7835do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7836{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007837 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007838
Benjamin Peterson14339b62009-01-31 16:36:08 +00007839 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7840 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007841
Benjamin Peterson14339b62009-01-31 16:36:08 +00007842 if (sep != NULL && sep != Py_None) {
7843 if (PyUnicode_Check(sep))
7844 return _PyUnicode_XStrip(self, striptype, sep);
7845 else {
7846 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 "%s arg must be None or str",
7848 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007849 return NULL;
7850 }
7851 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007852
Benjamin Peterson14339b62009-01-31 16:36:08 +00007853 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007854}
7855
7856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007857PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007859\n\
7860Return a copy of the string S with leading and trailing\n\
7861whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007862If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007863
7864static PyObject *
7865unicode_strip(PyUnicodeObject *self, PyObject *args)
7866{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007867 if (PyTuple_GET_SIZE(args) == 0)
7868 return do_strip(self, BOTHSTRIP); /* Common case */
7869 else
7870 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007871}
7872
7873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007874PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007876\n\
7877Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007878If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007879
7880static PyObject *
7881unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7882{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007883 if (PyTuple_GET_SIZE(args) == 0)
7884 return do_strip(self, LEFTSTRIP); /* Common case */
7885 else
7886 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007887}
7888
7889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007890PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007892\n\
7893Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007894If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007895
7896static PyObject *
7897unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7898{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007899 if (PyTuple_GET_SIZE(args) == 0)
7900 return do_strip(self, RIGHTSTRIP); /* Common case */
7901 else
7902 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007903}
7904
7905
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007907unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908{
7909 PyUnicodeObject *u;
7910 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007911 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007912 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913
Georg Brandl222de0f2009-04-12 12:01:50 +00007914 if (len < 1) {
7915 Py_INCREF(unicode_empty);
7916 return (PyObject *)unicode_empty;
7917 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918
Tim Peters7a29bd52001-09-12 03:03:31 +00007919 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920 /* no repeat, return original string */
7921 Py_INCREF(str);
7922 return (PyObject*) str;
7923 }
Tim Peters8f422462000-09-09 06:13:41 +00007924
7925 /* ensure # of chars needed doesn't overflow int and # of bytes
7926 * needed doesn't overflow size_t
7927 */
7928 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007929 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007930 PyErr_SetString(PyExc_OverflowError,
7931 "repeated string is too long");
7932 return NULL;
7933 }
7934 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7935 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7936 PyErr_SetString(PyExc_OverflowError,
7937 "repeated string is too long");
7938 return NULL;
7939 }
7940 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007941 if (!u)
7942 return NULL;
7943
7944 p = u->str;
7945
Georg Brandl222de0f2009-04-12 12:01:50 +00007946 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007947 Py_UNICODE_FILL(p, str->str[0], len);
7948 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007949 Py_ssize_t done = str->length; /* number of characters copied this far */
7950 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007952 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007953 Py_UNICODE_COPY(p+done, p, n);
7954 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 }
7957
7958 return (PyObject*) u;
7959}
7960
7961PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 PyObject *subobj,
7963 PyObject *replobj,
7964 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965{
7966 PyObject *self;
7967 PyObject *str1;
7968 PyObject *str2;
7969 PyObject *result;
7970
7971 self = PyUnicode_FromObject(obj);
7972 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 str1 = PyUnicode_FromObject(subobj);
7975 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 Py_DECREF(self);
7977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978 }
7979 str2 = PyUnicode_FromObject(replobj);
7980 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 Py_DECREF(self);
7982 Py_DECREF(str1);
7983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 }
Tim Petersced69f82003-09-16 20:30:58 +00007985 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 (PyUnicodeObject *)str1,
7987 (PyUnicodeObject *)str2,
7988 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 Py_DECREF(self);
7990 Py_DECREF(str1);
7991 Py_DECREF(str2);
7992 return result;
7993}
7994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007995PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00007996 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997\n\
7998Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00007999old replaced by new. If the optional argument count is\n\
8000given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001
8002static PyObject*
8003unicode_replace(PyUnicodeObject *self, PyObject *args)
8004{
8005 PyUnicodeObject *str1;
8006 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008007 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 PyObject *result;
8009
Martin v. Löwis18e16552006-02-15 17:27:45 +00008010 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 return NULL;
8012 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8013 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008016 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 Py_DECREF(str1);
8018 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020
8021 result = replace(self, str1, str2, maxcount);
8022
8023 Py_DECREF(str1);
8024 Py_DECREF(str2);
8025 return result;
8026}
8027
8028static
8029PyObject *unicode_repr(PyObject *unicode)
8030{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008031 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008032 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008033 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8034 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8035
8036 /* XXX(nnorwitz): rather than over-allocating, it would be
8037 better to choose a different scheme. Perhaps scan the
8038 first N-chars of the string and allocate based on that size.
8039 */
8040 /* Initial allocation is based on the longest-possible unichr
8041 escape.
8042
8043 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8044 unichr, so in this case it's the longest unichr escape. In
8045 narrow (UTF-16) builds this is five chars per source unichr
8046 since there are two unichrs in the surrogate pair, so in narrow
8047 (UTF-16) builds it's not the longest unichr escape.
8048
8049 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8050 so in the narrow (UTF-16) build case it's the longest unichr
8051 escape.
8052 */
8053
Walter Dörwald1ab83302007-05-18 17:15:44 +00008054 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008056#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008058#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008060#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008062 if (repr == NULL)
8063 return NULL;
8064
Walter Dörwald1ab83302007-05-18 17:15:44 +00008065 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008066
8067 /* Add quote */
8068 *p++ = (findchar(s, size, '\'') &&
8069 !findchar(s, size, '"')) ? '"' : '\'';
8070 while (size-- > 0) {
8071 Py_UNICODE ch = *s++;
8072
8073 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008074 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008075 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008076 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008077 continue;
8078 }
8079
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008081 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008082 *p++ = '\\';
8083 *p++ = 't';
8084 }
8085 else if (ch == '\n') {
8086 *p++ = '\\';
8087 *p++ = 'n';
8088 }
8089 else if (ch == '\r') {
8090 *p++ = '\\';
8091 *p++ = 'r';
8092 }
8093
8094 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008095 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008096 *p++ = '\\';
8097 *p++ = 'x';
8098 *p++ = hexdigits[(ch >> 4) & 0x000F];
8099 *p++ = hexdigits[ch & 0x000F];
8100 }
8101
Georg Brandl559e5d72008-06-11 18:37:52 +00008102 /* Copy ASCII characters as-is */
8103 else if (ch < 0x7F) {
8104 *p++ = ch;
8105 }
8106
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008108 else {
8109 Py_UCS4 ucs = ch;
8110
8111#ifndef Py_UNICODE_WIDE
8112 Py_UNICODE ch2 = 0;
8113 /* Get code point from surrogate pair */
8114 if (size > 0) {
8115 ch2 = *s;
8116 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008120 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008121 size--;
8122 }
8123 }
8124#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008126 (categories Z* and C* except ASCII space)
8127 */
8128 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8129 /* Map 8-bit characters to '\xhh' */
8130 if (ucs <= 0xff) {
8131 *p++ = '\\';
8132 *p++ = 'x';
8133 *p++ = hexdigits[(ch >> 4) & 0x000F];
8134 *p++ = hexdigits[ch & 0x000F];
8135 }
8136 /* Map 21-bit characters to '\U00xxxxxx' */
8137 else if (ucs >= 0x10000) {
8138 *p++ = '\\';
8139 *p++ = 'U';
8140 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8141 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8142 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8143 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8144 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8145 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8146 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8147 *p++ = hexdigits[ucs & 0x0000000F];
8148 }
8149 /* Map 16-bit characters to '\uxxxx' */
8150 else {
8151 *p++ = '\\';
8152 *p++ = 'u';
8153 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8154 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8155 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8156 *p++ = hexdigits[ucs & 0x000F];
8157 }
8158 }
8159 /* Copy characters as-is */
8160 else {
8161 *p++ = ch;
8162#ifndef Py_UNICODE_WIDE
8163 if (ucs >= 0x10000)
8164 *p++ = ch2;
8165#endif
8166 }
8167 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008168 }
8169 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008170 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008171
8172 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008173 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008174 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175}
8176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008177PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179\n\
8180Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008181such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182arguments start and end are interpreted as in slice notation.\n\
8183\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008184Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185
8186static PyObject *
8187unicode_rfind(PyUnicodeObject *self, PyObject *args)
8188{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008189 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008190 Py_ssize_t start;
8191 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008192 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193
Christian Heimes9cd17752007-11-18 19:35:23 +00008194 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008195 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196
Thomas Wouters477c8d52006-05-27 19:21:47 +00008197 result = stringlib_rfind_slice(
8198 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8199 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8200 start, end
8201 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202
8203 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008204
Christian Heimes217cfd12007-12-02 14:31:20 +00008205 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206}
8207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008208PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008211Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212
8213static PyObject *
8214unicode_rindex(PyUnicodeObject *self, PyObject *args)
8215{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008216 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008217 Py_ssize_t start;
8218 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008219 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220
Christian Heimes9cd17752007-11-18 19:35:23 +00008221 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223
Thomas Wouters477c8d52006-05-27 19:21:47 +00008224 result = stringlib_rfind_slice(
8225 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8226 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8227 start, end
8228 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229
8230 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008231
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232 if (result < 0) {
8233 PyErr_SetString(PyExc_ValueError, "substring not found");
8234 return NULL;
8235 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008236 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237}
8238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008239PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008242Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008243done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244
8245static PyObject *
8246unicode_rjust(PyUnicodeObject *self, PyObject *args)
8247{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008248 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008249 Py_UNICODE fillchar = ' ';
8250
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008251 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 return NULL;
8253
Tim Peters7a29bd52001-09-12 03:03:31 +00008254 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 Py_INCREF(self);
8256 return (PyObject*) self;
8257 }
8258
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008259 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260}
8261
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 PyObject *sep,
8264 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265{
8266 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008267
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 s = PyUnicode_FromObject(s);
8269 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008270 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 if (sep != NULL) {
8272 sep = PyUnicode_FromObject(sep);
8273 if (sep == NULL) {
8274 Py_DECREF(s);
8275 return NULL;
8276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 }
8278
8279 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8280
8281 Py_DECREF(s);
8282 Py_XDECREF(sep);
8283 return result;
8284}
8285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008286PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288\n\
8289Return a list of the words in S, using sep as the\n\
8290delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008291splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008292whitespace string is a separator and empty strings are\n\
8293removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294
8295static PyObject*
8296unicode_split(PyUnicodeObject *self, PyObject *args)
8297{
8298 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008299 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300
Martin v. Löwis18e16552006-02-15 17:27:45 +00008301 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 return NULL;
8303
8304 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310}
8311
Thomas Wouters477c8d52006-05-27 19:21:47 +00008312PyObject *
8313PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8314{
8315 PyObject* str_obj;
8316 PyObject* sep_obj;
8317 PyObject* out;
8318
8319 str_obj = PyUnicode_FromObject(str_in);
8320 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008322 sep_obj = PyUnicode_FromObject(sep_in);
8323 if (!sep_obj) {
8324 Py_DECREF(str_obj);
8325 return NULL;
8326 }
8327
8328 out = stringlib_partition(
8329 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8330 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8331 );
8332
8333 Py_DECREF(sep_obj);
8334 Py_DECREF(str_obj);
8335
8336 return out;
8337}
8338
8339
8340PyObject *
8341PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8342{
8343 PyObject* str_obj;
8344 PyObject* sep_obj;
8345 PyObject* out;
8346
8347 str_obj = PyUnicode_FromObject(str_in);
8348 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008350 sep_obj = PyUnicode_FromObject(sep_in);
8351 if (!sep_obj) {
8352 Py_DECREF(str_obj);
8353 return NULL;
8354 }
8355
8356 out = stringlib_rpartition(
8357 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8358 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8359 );
8360
8361 Py_DECREF(sep_obj);
8362 Py_DECREF(str_obj);
8363
8364 return out;
8365}
8366
8367PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008369\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008370Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008371the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008372found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008373
8374static PyObject*
8375unicode_partition(PyUnicodeObject *self, PyObject *separator)
8376{
8377 return PyUnicode_Partition((PyObject *)self, separator);
8378}
8379
8380PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008381 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008382\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008383Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008384the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008385separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008386
8387static PyObject*
8388unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8389{
8390 return PyUnicode_RPartition((PyObject *)self, separator);
8391}
8392
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008393PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 PyObject *sep,
8395 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008396{
8397 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008398
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008399 s = PyUnicode_FromObject(s);
8400 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 if (sep != NULL) {
8403 sep = PyUnicode_FromObject(sep);
8404 if (sep == NULL) {
8405 Py_DECREF(s);
8406 return NULL;
8407 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008408 }
8409
8410 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8411
8412 Py_DECREF(s);
8413 Py_XDECREF(sep);
8414 return result;
8415}
8416
8417PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008419\n\
8420Return a list of the words in S, using sep as the\n\
8421delimiter string, starting at the end of the string and\n\
8422working to the front. If maxsplit is given, at most maxsplit\n\
8423splits are done. If sep is not specified, any whitespace string\n\
8424is a separator.");
8425
8426static PyObject*
8427unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8428{
8429 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008430 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008431
Martin v. Löwis18e16552006-02-15 17:27:45 +00008432 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008433 return NULL;
8434
8435 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008437 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008439 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008441}
8442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008443PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445\n\
8446Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008447Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008448is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449
8450static PyObject*
8451unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8452{
Guido van Rossum86662912000-04-11 15:38:46 +00008453 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454
Guido van Rossum86662912000-04-11 15:38:46 +00008455 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 return NULL;
8457
Guido van Rossum86662912000-04-11 15:38:46 +00008458 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459}
8460
8461static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008462PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463{
Walter Dörwald346737f2007-05-31 10:44:43 +00008464 if (PyUnicode_CheckExact(self)) {
8465 Py_INCREF(self);
8466 return self;
8467 } else
8468 /* Subtype -- return genuine unicode string with the same value. */
8469 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8470 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471}
8472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008473PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475\n\
8476Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008477and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478
8479static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008480unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 return fixup(self, fixswapcase);
8483}
8484
Georg Brandlceee0772007-11-27 23:48:05 +00008485PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008487\n\
8488Return a translation table usable for str.translate().\n\
8489If there is only one argument, it must be a dictionary mapping Unicode\n\
8490ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008491Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008492If there are two arguments, they must be strings of equal length, and\n\
8493in the resulting dictionary, each character in x will be mapped to the\n\
8494character at the same position in y. If there is a third argument, it\n\
8495must be a string, whose characters will be mapped to None in the result.");
8496
8497static PyObject*
8498unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8499{
8500 PyObject *x, *y = NULL, *z = NULL;
8501 PyObject *new = NULL, *key, *value;
8502 Py_ssize_t i = 0;
8503 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008504
Georg Brandlceee0772007-11-27 23:48:05 +00008505 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8506 return NULL;
8507 new = PyDict_New();
8508 if (!new)
8509 return NULL;
8510 if (y != NULL) {
8511 /* x must be a string too, of equal length */
8512 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8513 if (!PyUnicode_Check(x)) {
8514 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8515 "be a string if there is a second argument");
8516 goto err;
8517 }
8518 if (PyUnicode_GET_SIZE(x) != ylen) {
8519 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8520 "arguments must have equal length");
8521 goto err;
8522 }
8523 /* create entries for translating chars in x to those in y */
8524 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008525 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8526 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008527 if (!key || !value)
8528 goto err;
8529 res = PyDict_SetItem(new, key, value);
8530 Py_DECREF(key);
8531 Py_DECREF(value);
8532 if (res < 0)
8533 goto err;
8534 }
8535 /* create entries for deleting chars in z */
8536 if (z != NULL) {
8537 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008538 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008539 if (!key)
8540 goto err;
8541 res = PyDict_SetItem(new, key, Py_None);
8542 Py_DECREF(key);
8543 if (res < 0)
8544 goto err;
8545 }
8546 }
8547 } else {
8548 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008549 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008550 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8551 "to maketrans it must be a dict");
8552 goto err;
8553 }
8554 /* copy entries into the new dict, converting string keys to int keys */
8555 while (PyDict_Next(x, &i, &key, &value)) {
8556 if (PyUnicode_Check(key)) {
8557 /* convert string keys to integer keys */
8558 PyObject *newkey;
8559 if (PyUnicode_GET_SIZE(key) != 1) {
8560 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8561 "table must be of length 1");
8562 goto err;
8563 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008564 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008565 if (!newkey)
8566 goto err;
8567 res = PyDict_SetItem(new, newkey, value);
8568 Py_DECREF(newkey);
8569 if (res < 0)
8570 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008571 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008572 /* just keep integer keys */
8573 if (PyDict_SetItem(new, key, value) < 0)
8574 goto err;
8575 } else {
8576 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8577 "be strings or integers");
8578 goto err;
8579 }
8580 }
8581 }
8582 return new;
8583 err:
8584 Py_DECREF(new);
8585 return NULL;
8586}
8587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008588PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590\n\
8591Return a copy of the string S, where all characters have been mapped\n\
8592through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008593Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008594Unmapped characters are left untouched. Characters mapped to None\n\
8595are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
8597static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008598unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599{
Georg Brandlceee0772007-11-27 23:48:05 +00008600 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601}
8602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008603PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008606Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607
8608static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008609unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 return fixup(self, fixupper);
8612}
8613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008614PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008617Pad a numeric string S with zeros on the left, to fill a field\n\
8618of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619
8620static PyObject *
8621unicode_zfill(PyUnicodeObject *self, PyObject *args)
8622{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008623 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 PyUnicodeObject *u;
8625
Martin v. Löwis18e16552006-02-15 17:27:45 +00008626 Py_ssize_t width;
8627 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 return NULL;
8629
8630 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008631 if (PyUnicode_CheckExact(self)) {
8632 Py_INCREF(self);
8633 return (PyObject*) self;
8634 }
8635 else
8636 return PyUnicode_FromUnicode(
8637 PyUnicode_AS_UNICODE(self),
8638 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640 }
8641
8642 fill = width - self->length;
8643
8644 u = pad(self, fill, 0, '0');
8645
Walter Dörwald068325e2002-04-15 13:36:47 +00008646 if (u == NULL)
8647 return NULL;
8648
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 if (u->str[fill] == '+' || u->str[fill] == '-') {
8650 /* move sign to beginning of string */
8651 u->str[0] = u->str[fill];
8652 u->str[fill] = '0';
8653 }
8654
8655 return (PyObject*) u;
8656}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657
#if 0
/* Compiled out.  Reports the file-level numfree counter as a Python int;
   the matching method-table entry is likewise disabled and marked as
   debugging-only. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}
#endif
8665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008666PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008669Return True if S starts with the specified prefix, False otherwise.\n\
8670With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008671With optional end, stop comparing S at that position.\n\
8672prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673
8674static PyObject *
8675unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008678 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008680 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008681 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008682 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008684 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8686 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008687 if (PyTuple_Check(subobj)) {
8688 Py_ssize_t i;
8689 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8690 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008692 if (substring == NULL)
8693 return NULL;
8694 result = tailmatch(self, substring, start, end, -1);
8695 Py_DECREF(substring);
8696 if (result) {
8697 Py_RETURN_TRUE;
8698 }
8699 }
8700 /* nothing matched */
8701 Py_RETURN_FALSE;
8702 }
8703 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008706 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008708 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709}
8710
8711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008712PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008715Return True if S ends with the specified suffix, False otherwise.\n\
8716With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008717With optional end, stop comparing S at that position.\n\
8718suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719
8720static PyObject *
8721unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008724 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008726 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008727 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008728 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008730 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8732 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008733 if (PyTuple_Check(subobj)) {
8734 Py_ssize_t i;
8735 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8736 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008738 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008740 result = tailmatch(self, substring, start, end, +1);
8741 Py_DECREF(substring);
8742 if (result) {
8743 Py_RETURN_TRUE;
8744 }
8745 }
8746 Py_RETURN_FALSE;
8747 }
8748 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008752 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008754 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755}
8756
Eric Smith8c663262007-08-25 02:26:07 +00008757#include "stringlib/string_format.h"
8758
8759PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008761\n\
8762");
8763
Eric Smith4a7d76d2008-05-30 18:10:19 +00008764static PyObject *
8765unicode__format__(PyObject* self, PyObject* args)
8766{
8767 PyObject *format_spec;
8768
8769 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8770 return NULL;
8771
8772 return _PyUnicode_FormatAdvanced(self,
8773 PyUnicode_AS_UNICODE(format_spec),
8774 PyUnicode_GET_SIZE(format_spec));
8775}
8776
Eric Smith8c663262007-08-25 02:26:07 +00008777PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008779\n\
8780");
8781
8782static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008783unicode__sizeof__(PyUnicodeObject *v)
8784{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008785 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8786 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008787}
8788
8789PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008791
8792static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008793unicode_getnewargs(PyUnicodeObject *v)
8794{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008795 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008796}
8797
8798
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799static PyMethodDef unicode_methods[] = {
8800
8801 /* Order is according to common usage: often used methods should
8802 appear first, since lookup is done sequentially. */
8803
Benjamin Peterson308d6372009-09-18 21:42:35 +00008804 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008805 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8806 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008807 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008808 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8809 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8810 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8811 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8812 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8813 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8814 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008815 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008816 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8817 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8818 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008819 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008820 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8821 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8822 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008823 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008824 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008825 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008826 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008827 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8828 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8829 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8830 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8831 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8832 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8833 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8834 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8835 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8836 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8837 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8838 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8839 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8840 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008841 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008842 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008843 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008844 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008845 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008846 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8847 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008848 {"maketrans", (PyCFunction) unicode_maketrans,
8849 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008850 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008851#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008852 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853#endif
8854
8855#if 0
8856 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008857 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858#endif
8859
Benjamin Peterson14339b62009-01-31 16:36:08 +00008860 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 {NULL, NULL}
8862};
8863
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008864static PyObject *
8865unicode_mod(PyObject *v, PyObject *w)
8866{
Benjamin Peterson29060642009-01-31 22:14:21 +00008867 if (!PyUnicode_Check(v)) {
8868 Py_INCREF(Py_NotImplemented);
8869 return Py_NotImplemented;
8870 }
8871 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008872}
8873
8874static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008875 0, /*nb_add*/
8876 0, /*nb_subtract*/
8877 0, /*nb_multiply*/
8878 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008879};
8880
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008882 (lenfunc) unicode_length, /* sq_length */
8883 PyUnicode_Concat, /* sq_concat */
8884 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8885 (ssizeargfunc) unicode_getitem, /* sq_item */
8886 0, /* sq_slice */
8887 0, /* sq_ass_item */
8888 0, /* sq_ass_slice */
8889 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890};
8891
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008892static PyObject*
8893unicode_subscript(PyUnicodeObject* self, PyObject* item)
8894{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008895 if (PyIndex_Check(item)) {
8896 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008897 if (i == -1 && PyErr_Occurred())
8898 return NULL;
8899 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008900 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008901 return unicode_getitem(self, i);
8902 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008903 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008904 Py_UNICODE* source_buf;
8905 Py_UNICODE* result_buf;
8906 PyObject* result;
8907
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008908 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008910 return NULL;
8911 }
8912
8913 if (slicelength <= 0) {
8914 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008915 } else if (start == 0 && step == 1 && slicelength == self->length &&
8916 PyUnicode_CheckExact(self)) {
8917 Py_INCREF(self);
8918 return (PyObject *)self;
8919 } else if (step == 1) {
8920 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008921 } else {
8922 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008923 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8924 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008925
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 if (result_buf == NULL)
8927 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008928
8929 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8930 result_buf[i] = source_buf[cur];
8931 }
Tim Petersced69f82003-09-16 20:30:58 +00008932
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008933 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008934 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008935 return result;
8936 }
8937 } else {
8938 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8939 return NULL;
8940 }
8941}
8942
8943static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008944 (lenfunc)unicode_length, /* mp_length */
8945 (binaryfunc)unicode_subscript, /* mp_subscript */
8946 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008947};
8948
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950/* Helpers for PyUnicode_Format() */
8951
8952static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008953getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008955 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 (*p_argidx)++;
8958 if (arglen < 0)
8959 return args;
8960 else
8961 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 }
8963 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 return NULL;
8966}
8967
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008968/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008970static PyObject *
8971formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008973 char *p;
8974 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008976
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 x = PyFloat_AsDouble(v);
8978 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008979 return NULL;
8980
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00008983
Eric Smith0923d1d2009-04-16 20:16:10 +00008984 p = PyOS_double_to_string(x, type, prec,
8985 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00008986 if (p == NULL)
8987 return NULL;
8988 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00008989 PyMem_Free(p);
8990 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991}
8992
Tim Peters38fd5b62000-09-21 05:43:11 +00008993static PyObject*
8994formatlong(PyObject *val, int flags, int prec, int type)
8995{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008996 char *buf;
8997 int len;
8998 PyObject *str; /* temporary string object. */
8999 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009000
Benjamin Peterson14339b62009-01-31 16:36:08 +00009001 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9002 if (!str)
9003 return NULL;
9004 result = PyUnicode_FromStringAndSize(buf, len);
9005 Py_DECREF(str);
9006 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009007}
9008
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009static int
9010formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009011 size_t buflen,
9012 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009014 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009015 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 if (PyUnicode_GET_SIZE(v) == 1) {
9017 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9018 buf[1] = '\0';
9019 return 1;
9020 }
9021#ifndef Py_UNICODE_WIDE
9022 if (PyUnicode_GET_SIZE(v) == 2) {
9023 /* Decode a valid surrogate pair */
9024 int c0 = PyUnicode_AS_UNICODE(v)[0];
9025 int c1 = PyUnicode_AS_UNICODE(v)[1];
9026 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9027 0xDC00 <= c1 && c1 <= 0xDFFF) {
9028 buf[0] = c0;
9029 buf[1] = c1;
9030 buf[2] = '\0';
9031 return 2;
9032 }
9033 }
9034#endif
9035 goto onError;
9036 }
9037 else {
9038 /* Integer input truncated to a character */
9039 long x;
9040 x = PyLong_AsLong(v);
9041 if (x == -1 && PyErr_Occurred())
9042 goto onError;
9043
9044 if (x < 0 || x > 0x10ffff) {
9045 PyErr_SetString(PyExc_OverflowError,
9046 "%c arg not in range(0x110000)");
9047 return -1;
9048 }
9049
9050#ifndef Py_UNICODE_WIDE
9051 if (x > 0xffff) {
9052 x -= 0x10000;
9053 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9054 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9055 return 2;
9056 }
9057#endif
9058 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009059 buf[1] = '\0';
9060 return 1;
9061 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009062
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009064 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009065 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009066 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067}
9068
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. "sizeof FORMATBUFLEN").
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009073
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076{
9077 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009078 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 int args_owned = 0;
9080 PyUnicodeObject *result = NULL;
9081 PyObject *dict = NULL;
9082 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009083
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 PyErr_BadInternalCall();
9086 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087 }
9088 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009089 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 fmt = PyUnicode_AS_UNICODE(uformat);
9092 fmtcnt = PyUnicode_GET_SIZE(uformat);
9093
9094 reslen = rescnt = fmtcnt + 100;
9095 result = _PyUnicode_New(reslen);
9096 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 res = PyUnicode_AS_UNICODE(result);
9099
9100 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 arglen = PyTuple_Size(args);
9102 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103 }
9104 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 arglen = -1;
9106 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009108 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009109 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111
9112 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 if (*fmt != '%') {
9114 if (--rescnt < 0) {
9115 rescnt = fmtcnt + 100;
9116 reslen += rescnt;
9117 if (_PyUnicode_Resize(&result, reslen) < 0)
9118 goto onError;
9119 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9120 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009121 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009123 }
9124 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 /* Got a format specifier */
9126 int flags = 0;
9127 Py_ssize_t width = -1;
9128 int prec = -1;
9129 Py_UNICODE c = '\0';
9130 Py_UNICODE fill;
9131 int isnumok;
9132 PyObject *v = NULL;
9133 PyObject *temp = NULL;
9134 Py_UNICODE *pbuf;
9135 Py_UNICODE sign;
9136 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009137 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 fmt++;
9140 if (*fmt == '(') {
9141 Py_UNICODE *keystart;
9142 Py_ssize_t keylen;
9143 PyObject *key;
9144 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009145
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 if (dict == NULL) {
9147 PyErr_SetString(PyExc_TypeError,
9148 "format requires a mapping");
9149 goto onError;
9150 }
9151 ++fmt;
9152 --fmtcnt;
9153 keystart = fmt;
9154 /* Skip over balanced parentheses */
9155 while (pcount > 0 && --fmtcnt >= 0) {
9156 if (*fmt == ')')
9157 --pcount;
9158 else if (*fmt == '(')
9159 ++pcount;
9160 fmt++;
9161 }
9162 keylen = fmt - keystart - 1;
9163 if (fmtcnt < 0 || pcount > 0) {
9164 PyErr_SetString(PyExc_ValueError,
9165 "incomplete format key");
9166 goto onError;
9167 }
9168#if 0
9169 /* keys are converted to strings using UTF-8 and
9170 then looked up since Python uses strings to hold
9171 variables names etc. in its namespaces and we
9172 wouldn't want to break common idioms. */
9173 key = PyUnicode_EncodeUTF8(keystart,
9174 keylen,
9175 NULL);
9176#else
9177 key = PyUnicode_FromUnicode(keystart, keylen);
9178#endif
9179 if (key == NULL)
9180 goto onError;
9181 if (args_owned) {
9182 Py_DECREF(args);
9183 args_owned = 0;
9184 }
9185 args = PyObject_GetItem(dict, key);
9186 Py_DECREF(key);
9187 if (args == NULL) {
9188 goto onError;
9189 }
9190 args_owned = 1;
9191 arglen = -1;
9192 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009193 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 while (--fmtcnt >= 0) {
9195 switch (c = *fmt++) {
9196 case '-': flags |= F_LJUST; continue;
9197 case '+': flags |= F_SIGN; continue;
9198 case ' ': flags |= F_BLANK; continue;
9199 case '#': flags |= F_ALT; continue;
9200 case '0': flags |= F_ZERO; continue;
9201 }
9202 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009203 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 if (c == '*') {
9205 v = getnextarg(args, arglen, &argidx);
9206 if (v == NULL)
9207 goto onError;
9208 if (!PyLong_Check(v)) {
9209 PyErr_SetString(PyExc_TypeError,
9210 "* wants int");
9211 goto onError;
9212 }
9213 width = PyLong_AsLong(v);
9214 if (width == -1 && PyErr_Occurred())
9215 goto onError;
9216 if (width < 0) {
9217 flags |= F_LJUST;
9218 width = -width;
9219 }
9220 if (--fmtcnt >= 0)
9221 c = *fmt++;
9222 }
9223 else if (c >= '0' && c <= '9') {
9224 width = c - '0';
9225 while (--fmtcnt >= 0) {
9226 c = *fmt++;
9227 if (c < '0' || c > '9')
9228 break;
9229 if ((width*10) / 10 != width) {
9230 PyErr_SetString(PyExc_ValueError,
9231 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009232 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 }
9234 width = width*10 + (c - '0');
9235 }
9236 }
9237 if (c == '.') {
9238 prec = 0;
9239 if (--fmtcnt >= 0)
9240 c = *fmt++;
9241 if (c == '*') {
9242 v = getnextarg(args, arglen, &argidx);
9243 if (v == NULL)
9244 goto onError;
9245 if (!PyLong_Check(v)) {
9246 PyErr_SetString(PyExc_TypeError,
9247 "* wants int");
9248 goto onError;
9249 }
9250 prec = PyLong_AsLong(v);
9251 if (prec == -1 && PyErr_Occurred())
9252 goto onError;
9253 if (prec < 0)
9254 prec = 0;
9255 if (--fmtcnt >= 0)
9256 c = *fmt++;
9257 }
9258 else if (c >= '0' && c <= '9') {
9259 prec = c - '0';
9260 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009261 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009262 if (c < '0' || c > '9')
9263 break;
9264 if ((prec*10) / 10 != prec) {
9265 PyErr_SetString(PyExc_ValueError,
9266 "prec too big");
9267 goto onError;
9268 }
9269 prec = prec*10 + (c - '0');
9270 }
9271 }
9272 } /* prec */
9273 if (fmtcnt >= 0) {
9274 if (c == 'h' || c == 'l' || c == 'L') {
9275 if (--fmtcnt >= 0)
9276 c = *fmt++;
9277 }
9278 }
9279 if (fmtcnt < 0) {
9280 PyErr_SetString(PyExc_ValueError,
9281 "incomplete format");
9282 goto onError;
9283 }
9284 if (c != '%') {
9285 v = getnextarg(args, arglen, &argidx);
9286 if (v == NULL)
9287 goto onError;
9288 }
9289 sign = 0;
9290 fill = ' ';
9291 switch (c) {
9292
9293 case '%':
9294 pbuf = formatbuf;
9295 /* presume that buffer length is at least 1 */
9296 pbuf[0] = '%';
9297 len = 1;
9298 break;
9299
9300 case 's':
9301 case 'r':
9302 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009303 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 temp = v;
9305 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009306 }
9307 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009308 if (c == 's')
9309 temp = PyObject_Str(v);
9310 else if (c == 'r')
9311 temp = PyObject_Repr(v);
9312 else
9313 temp = PyObject_ASCII(v);
9314 if (temp == NULL)
9315 goto onError;
9316 if (PyUnicode_Check(temp))
9317 /* nothing to do */;
9318 else {
9319 Py_DECREF(temp);
9320 PyErr_SetString(PyExc_TypeError,
9321 "%s argument has non-string str()");
9322 goto onError;
9323 }
9324 }
9325 pbuf = PyUnicode_AS_UNICODE(temp);
9326 len = PyUnicode_GET_SIZE(temp);
9327 if (prec >= 0 && len > prec)
9328 len = prec;
9329 break;
9330
9331 case 'i':
9332 case 'd':
9333 case 'u':
9334 case 'o':
9335 case 'x':
9336 case 'X':
9337 if (c == 'i')
9338 c = 'd';
9339 isnumok = 0;
9340 if (PyNumber_Check(v)) {
9341 PyObject *iobj=NULL;
9342
9343 if (PyLong_Check(v)) {
9344 iobj = v;
9345 Py_INCREF(iobj);
9346 }
9347 else {
9348 iobj = PyNumber_Long(v);
9349 }
9350 if (iobj!=NULL) {
9351 if (PyLong_Check(iobj)) {
9352 isnumok = 1;
9353 temp = formatlong(iobj, flags, prec, c);
9354 Py_DECREF(iobj);
9355 if (!temp)
9356 goto onError;
9357 pbuf = PyUnicode_AS_UNICODE(temp);
9358 len = PyUnicode_GET_SIZE(temp);
9359 sign = 1;
9360 }
9361 else {
9362 Py_DECREF(iobj);
9363 }
9364 }
9365 }
9366 if (!isnumok) {
9367 PyErr_Format(PyExc_TypeError,
9368 "%%%c format: a number is required, "
9369 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9370 goto onError;
9371 }
9372 if (flags & F_ZERO)
9373 fill = '0';
9374 break;
9375
9376 case 'e':
9377 case 'E':
9378 case 'f':
9379 case 'F':
9380 case 'g':
9381 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009382 temp = formatfloat(v, flags, prec, c);
9383 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009385 pbuf = PyUnicode_AS_UNICODE(temp);
9386 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 sign = 1;
9388 if (flags & F_ZERO)
9389 fill = '0';
9390 break;
9391
9392 case 'c':
9393 pbuf = formatbuf;
9394 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9395 if (len < 0)
9396 goto onError;
9397 break;
9398
9399 default:
9400 PyErr_Format(PyExc_ValueError,
9401 "unsupported format character '%c' (0x%x) "
9402 "at index %zd",
9403 (31<=c && c<=126) ? (char)c : '?',
9404 (int)c,
9405 (Py_ssize_t)(fmt - 1 -
9406 PyUnicode_AS_UNICODE(uformat)));
9407 goto onError;
9408 }
9409 if (sign) {
9410 if (*pbuf == '-' || *pbuf == '+') {
9411 sign = *pbuf++;
9412 len--;
9413 }
9414 else if (flags & F_SIGN)
9415 sign = '+';
9416 else if (flags & F_BLANK)
9417 sign = ' ';
9418 else
9419 sign = 0;
9420 }
9421 if (width < len)
9422 width = len;
9423 if (rescnt - (sign != 0) < width) {
9424 reslen -= rescnt;
9425 rescnt = width + fmtcnt + 100;
9426 reslen += rescnt;
9427 if (reslen < 0) {
9428 Py_XDECREF(temp);
9429 PyErr_NoMemory();
9430 goto onError;
9431 }
9432 if (_PyUnicode_Resize(&result, reslen) < 0) {
9433 Py_XDECREF(temp);
9434 goto onError;
9435 }
9436 res = PyUnicode_AS_UNICODE(result)
9437 + reslen - rescnt;
9438 }
9439 if (sign) {
9440 if (fill != ' ')
9441 *res++ = sign;
9442 rescnt--;
9443 if (width > len)
9444 width--;
9445 }
9446 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9447 assert(pbuf[0] == '0');
9448 assert(pbuf[1] == c);
9449 if (fill != ' ') {
9450 *res++ = *pbuf++;
9451 *res++ = *pbuf++;
9452 }
9453 rescnt -= 2;
9454 width -= 2;
9455 if (width < 0)
9456 width = 0;
9457 len -= 2;
9458 }
9459 if (width > len && !(flags & F_LJUST)) {
9460 do {
9461 --rescnt;
9462 *res++ = fill;
9463 } while (--width > len);
9464 }
9465 if (fill == ' ') {
9466 if (sign)
9467 *res++ = sign;
9468 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9469 assert(pbuf[0] == '0');
9470 assert(pbuf[1] == c);
9471 *res++ = *pbuf++;
9472 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009473 }
9474 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 Py_UNICODE_COPY(res, pbuf, len);
9476 res += len;
9477 rescnt -= len;
9478 while (--width >= len) {
9479 --rescnt;
9480 *res++ = ' ';
9481 }
9482 if (dict && (argidx < arglen) && c != '%') {
9483 PyErr_SetString(PyExc_TypeError,
9484 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009485 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009486 goto onError;
9487 }
9488 Py_XDECREF(temp);
9489 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 } /* until end */
9491 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 PyErr_SetString(PyExc_TypeError,
9493 "not all arguments converted during string formatting");
9494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 }
9496
Thomas Woutersa96affe2006-03-12 00:29:36 +00009497 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 }
9502 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 return (PyObject *)result;
9504
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 Py_XDECREF(result);
9507 Py_DECREF(uformat);
9508 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 }
9511 return NULL;
9512}
9513
Jeremy Hylton938ace62002-07-17 16:30:39 +00009514static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009515unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9516
Tim Peters6d6c1a32001-08-02 04:15:00 +00009517static PyObject *
9518unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9519{
Benjamin Peterson29060642009-01-31 22:14:21 +00009520 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009521 static char *kwlist[] = {"object", "encoding", "errors", 0};
9522 char *encoding = NULL;
9523 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009524
Benjamin Peterson14339b62009-01-31 16:36:08 +00009525 if (type != &PyUnicode_Type)
9526 return unicode_subtype_new(type, args, kwds);
9527 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009529 return NULL;
9530 if (x == NULL)
9531 return (PyObject *)_PyUnicode_New(0);
9532 if (encoding == NULL && errors == NULL)
9533 return PyObject_Str(x);
9534 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009536}
9537
Guido van Rossume023fe02001-08-30 03:12:59 +00009538static PyObject *
9539unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9540{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009541 PyUnicodeObject *tmp, *pnew;
9542 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009543
Benjamin Peterson14339b62009-01-31 16:36:08 +00009544 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9545 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9546 if (tmp == NULL)
9547 return NULL;
9548 assert(PyUnicode_Check(tmp));
9549 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9550 if (pnew == NULL) {
9551 Py_DECREF(tmp);
9552 return NULL;
9553 }
9554 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9555 if (pnew->str == NULL) {
9556 _Py_ForgetReference((PyObject *)pnew);
9557 PyObject_Del(pnew);
9558 Py_DECREF(tmp);
9559 return PyErr_NoMemory();
9560 }
9561 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9562 pnew->length = n;
9563 pnew->hash = tmp->hash;
9564 Py_DECREF(tmp);
9565 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009566}
9567
/* Docstring text installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009574
/* Forward declaration: unicode_iter is referenced by the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Slots set to 0 are left unset in this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                                  /* tp_name */
    sizeof(PyUnicodeObject),                /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,            /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_reserved */
    unicode_repr,                           /* tp_repr */
    &unicode_as_number,                     /* tp_as_number */
    &unicode_as_sequence,                   /* tp_as_sequence */
    &unicode_as_mapping,                    /* tp_as_mapping */
    (hashfunc) unicode_hash,                /* tp_hash */
    0,                                      /* tp_call */
    (reprfunc) unicode_str,                 /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,            /* tp_flags */
    unicode_doc,                            /* tp_doc */
    0,                                      /* tp_traverse */
    0,                                      /* tp_clear */
    PyUnicode_RichCompare,                  /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    unicode_iter,                           /* tp_iter */
    0,                                      /* tp_iternext */
    unicode_methods,                        /* tp_methods */
    0,                                      /* tp_members */
    0,                                      /* tp_getset */
    &PyBaseObject_Type,                     /* tp_base */
    0,                                      /* tp_dict */
    0,                                      /* tp_descr_get */
    0,                                      /* tp_descr_set */
    0,                                      /* tp_dictoffset */
    0,                                      /* tp_init */
    0,                                      /* tp_alloc */
    unicode_new,                            /* tp_new */
    PyObject_Del,                           /* tp_free */
};
9620
9621/* Initialize the Unicode implementation */
9622
Thomas Wouters78890102000-07-22 19:25:51 +00009623void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009625 int i;
9626
Thomas Wouters477c8d52006-05-27 19:21:47 +00009627 /* XXX - move this array to unicodectype.c ? */
9628 Py_UNICODE linebreak[] = {
9629 0x000A, /* LINE FEED */
9630 0x000D, /* CARRIAGE RETURN */
9631 0x001C, /* FILE SEPARATOR */
9632 0x001D, /* GROUP SEPARATOR */
9633 0x001E, /* RECORD SEPARATOR */
9634 0x0085, /* NEXT LINE */
9635 0x2028, /* LINE SEPARATOR */
9636 0x2029, /* PARAGRAPH SEPARATOR */
9637 };
9638
Fred Drakee4315f52000-05-09 19:53:39 +00009639 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009640 free_list = NULL;
9641 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009642 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009643 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009644 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009645
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009646 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009648 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009650
9651 /* initialize the linebreak bloom filter */
9652 bloom_linebreak = make_bloom_mask(
9653 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9654 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009655
9656 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657}
9658
9659/* Finalize the Unicode implementation */
9660
Christian Heimesa156e092008-02-16 07:38:31 +00009661int
9662PyUnicode_ClearFreeList(void)
9663{
9664 int freelist_size = numfree;
9665 PyUnicodeObject *u;
9666
9667 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009668 PyUnicodeObject *v = u;
9669 u = *(PyUnicodeObject **)u;
9670 if (v->str)
9671 PyObject_DEL(v->str);
9672 Py_XDECREF(v->defenc);
9673 PyObject_Del(v);
9674 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009675 }
9676 free_list = NULL;
9677 assert(numfree == 0);
9678 return freelist_size;
9679}
9680
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681void
Thomas Wouters78890102000-07-22 19:25:51 +00009682_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009684 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009686 Py_XDECREF(unicode_empty);
9687 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009688
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009689 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009690 if (unicode_latin1[i]) {
9691 Py_DECREF(unicode_latin1[i]);
9692 unicode_latin1[i] = NULL;
9693 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009694 }
Christian Heimesa156e092008-02-16 07:38:31 +00009695 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009697
Walter Dörwald16807132007-05-25 13:52:07 +00009698void
9699PyUnicode_InternInPlace(PyObject **p)
9700{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009701 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9702 PyObject *t;
9703 if (s == NULL || !PyUnicode_Check(s))
9704 Py_FatalError(
9705 "PyUnicode_InternInPlace: unicode strings only please!");
9706 /* If it's a subclass, we don't really know what putting
9707 it in the interned dict might do. */
9708 if (!PyUnicode_CheckExact(s))
9709 return;
9710 if (PyUnicode_CHECK_INTERNED(s))
9711 return;
9712 if (interned == NULL) {
9713 interned = PyDict_New();
9714 if (interned == NULL) {
9715 PyErr_Clear(); /* Don't leave an exception */
9716 return;
9717 }
9718 }
9719 /* It might be that the GetItem call fails even
9720 though the key is present in the dictionary,
9721 namely when this happens during a stack overflow. */
9722 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009724 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009725
Benjamin Peterson29060642009-01-31 22:14:21 +00009726 if (t) {
9727 Py_INCREF(t);
9728 Py_DECREF(*p);
9729 *p = t;
9730 return;
9731 }
Walter Dörwald16807132007-05-25 13:52:07 +00009732
Benjamin Peterson14339b62009-01-31 16:36:08 +00009733 PyThreadState_GET()->recursion_critical = 1;
9734 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9735 PyErr_Clear();
9736 PyThreadState_GET()->recursion_critical = 0;
9737 return;
9738 }
9739 PyThreadState_GET()->recursion_critical = 0;
9740 /* The two references in interned are not counted by refcnt.
9741 The deallocator will take care of this */
9742 Py_REFCNT(s) -= 2;
9743 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009744}
9745
9746void
9747PyUnicode_InternImmortal(PyObject **p)
9748{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009749 PyUnicode_InternInPlace(p);
9750 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9751 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9752 Py_INCREF(*p);
9753 }
Walter Dörwald16807132007-05-25 13:52:07 +00009754}
9755
9756PyObject *
9757PyUnicode_InternFromString(const char *cp)
9758{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009759 PyObject *s = PyUnicode_FromString(cp);
9760 if (s == NULL)
9761 return NULL;
9762 PyUnicode_InternInPlace(&s);
9763 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009764}
9765
9766void _Py_ReleaseInternedUnicodeStrings(void)
9767{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009768 PyObject *keys;
9769 PyUnicodeObject *s;
9770 Py_ssize_t i, n;
9771 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009772
Benjamin Peterson14339b62009-01-31 16:36:08 +00009773 if (interned == NULL || !PyDict_Check(interned))
9774 return;
9775 keys = PyDict_Keys(interned);
9776 if (keys == NULL || !PyList_Check(keys)) {
9777 PyErr_Clear();
9778 return;
9779 }
Walter Dörwald16807132007-05-25 13:52:07 +00009780
Benjamin Peterson14339b62009-01-31 16:36:08 +00009781 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9782 detector, interned unicode strings are not forcibly deallocated;
9783 rather, we give them their stolen references back, and then clear
9784 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009785
Benjamin Peterson14339b62009-01-31 16:36:08 +00009786 n = PyList_GET_SIZE(keys);
9787 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009788 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009789 for (i = 0; i < n; i++) {
9790 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9791 switch (s->state) {
9792 case SSTATE_NOT_INTERNED:
9793 /* XXX Shouldn't happen */
9794 break;
9795 case SSTATE_INTERNED_IMMORTAL:
9796 Py_REFCNT(s) += 1;
9797 immortal_size += s->length;
9798 break;
9799 case SSTATE_INTERNED_MORTAL:
9800 Py_REFCNT(s) += 2;
9801 mortal_size += s->length;
9802 break;
9803 default:
9804 Py_FatalError("Inconsistent interned string state.");
9805 }
9806 s->state = SSTATE_NOT_INTERNED;
9807 }
9808 fprintf(stderr, "total size of all interned strings: "
9809 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9810 "mortal/immortal\n", mortal_size, immortal_size);
9811 Py_DECREF(keys);
9812 PyDict_Clear(interned);
9813 Py_DECREF(interned);
9814 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009815}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009816
9817
9818/********************* Unicode Iterator **************************/
9819
/* Iterator state for walking a str object one element at a time. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;      /* Offset of the next element to hand out */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9825
/* Destructor for the iterator: untrack it from the GC, release the
   reference to the string being iterated (it_seq may already be NULL,
   hence Py_XDECREF) and free the iterator object itself. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
9833
/* GC traversal: the only object reference held by the iterator is it_seq. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
9840
9841static PyObject *
9842unicodeiter_next(unicodeiterobject *it)
9843{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009844 PyUnicodeObject *seq;
9845 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009846
Benjamin Peterson14339b62009-01-31 16:36:08 +00009847 assert(it != NULL);
9848 seq = it->it_seq;
9849 if (seq == NULL)
9850 return NULL;
9851 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009852
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9854 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009855 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009856 if (item != NULL)
9857 ++it->it_index;
9858 return item;
9859 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009860
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 Py_DECREF(seq);
9862 it->it_seq = NULL;
9863 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009864}
9865
9866static PyObject *
9867unicodeiter_len(unicodeiterobject *it)
9868{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009869 Py_ssize_t len = 0;
9870 if (it->it_seq)
9871 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9872 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009873}
9874
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type: exposes unicodeiter_len() under
   the name __length_hint__, taking no arguments. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9882
/* Type object for the iterator returned by unicode_iter().  The type has
   Py_TPFLAGS_HAVE_GC set and supplies a traverse function. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
9915
9916static PyObject *
9917unicode_iter(PyObject *seq)
9918{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009919 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009920
Benjamin Peterson14339b62009-01-31 16:36:08 +00009921 if (!PyUnicode_Check(seq)) {
9922 PyErr_BadInternalCall();
9923 return NULL;
9924 }
9925 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9926 if (it == NULL)
9927 return NULL;
9928 it->it_index = 0;
9929 Py_INCREF(seq);
9930 it->it_seq = (PyUnicodeObject *)seq;
9931 _PyObject_GC_TRACK(it);
9932 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009933}
9934
Martin v. Löwis5b222132007-06-10 09:51:05 +00009935size_t
9936Py_UNICODE_strlen(const Py_UNICODE *u)
9937{
9938 int res = 0;
9939 while(*u++)
9940 res++;
9941 return res;
9942}
9943
9944Py_UNICODE*
9945Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9946{
9947 Py_UNICODE *u = s1;
9948 while ((*u++ = *s2++));
9949 return s1;
9950}
9951
9952Py_UNICODE*
9953Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9954{
9955 Py_UNICODE *u = s1;
9956 while ((*u++ = *s2++))
9957 if (n-- == 0)
9958 break;
9959 return s1;
9960}
9961
Victor Stinnerc4eb7652010-09-01 23:43:50 +00009962Py_UNICODE*
9963Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
9964{
9965 Py_UNICODE *u1 = s1;
9966 u1 += Py_UNICODE_strlen(u1);
9967 Py_UNICODE_strcpy(u1, s2);
9968 return s1;
9969}
9970
Martin v. Löwis5b222132007-06-10 09:51:05 +00009971int
9972Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9973{
9974 while (*s1 && *s2 && *s1 == *s2)
9975 s1++, s2++;
9976 if (*s1 && *s2)
9977 return (*s1 < *s2) ? -1 : +1;
9978 if (*s1)
9979 return 1;
9980 if (*s2)
9981 return -1;
9982 return 0;
9983}
9984
Victor Stinneref8d95c2010-08-16 22:03:11 +00009985int
9986Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9987{
9988 register Py_UNICODE u1, u2;
9989 for (; n != 0; n--) {
9990 u1 = *s1;
9991 u2 = *s2;
9992 if (u1 != u2)
9993 return (u1 < u2) ? -1 : +1;
9994 if (u1 == '\0')
9995 return 0;
9996 s1++;
9997 s2++;
9998 }
9999 return 0;
10000}
10001
Martin v. Löwis5b222132007-06-10 09:51:05 +000010002Py_UNICODE*
10003Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10004{
10005 const Py_UNICODE *p;
10006 for (p = s; *p; p++)
10007 if (*p == c)
10008 return (Py_UNICODE*)p;
10009 return NULL;
10010}
10011
Victor Stinner331ea922010-08-10 16:37:20 +000010012Py_UNICODE*
10013Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10014{
10015 const Py_UNICODE *p;
10016 p = s + Py_UNICODE_strlen(s);
10017 while (p != s) {
10018 p--;
10019 if (*p == c)
10020 return (Py_UNICODE*)p;
10021 }
10022 return NULL;
10023}
10024
Victor Stinner71133ff2010-09-01 23:43:53 +000010025Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010026PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010027{
10028 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10029 Py_UNICODE *copy;
10030 Py_ssize_t size;
10031
10032 /* Ensure we won't overflow the size. */
10033 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10034 PyErr_NoMemory();
10035 return NULL;
10036 }
10037 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10038 size *= sizeof(Py_UNICODE);
10039 copy = PyMem_Malloc(size);
10040 if (copy == NULL) {
10041 PyErr_NoMemory();
10042 return NULL;
10043 }
10044 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10045 return copy;
10046}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010047
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010048#ifdef __cplusplus
10049}
10050#endif