blob: 1c083b2f6f728a2810560ea88f276d64c0428d64 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
/* Dictionary of interned strings; unicode_dealloc() removes a mortal
   interned string from it when that string is deallocated. */
static PyObject *interned;

/* Free list for Unicode objects.  free_list is the head of a singly linked
   list of deallocated objects; the link to the next entry is stored in the
   first pointer-sized slot of each object.  numfree counts the entries and
   is kept below PyUnicode_MAXFREELIST by unicode_dealloc(). */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are indexed by code point and filled in
   lazily by the constructors. */
static PyUnicodeObject *unicode_latin1[256];
116
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127): an entry is 1 for
   the whitespace characters named in the comments below, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     0x0009: CHARACTER TABULATION */
/*     0x000A: LINE FEED */
/*     0x000B: LINE TABULATION */
/*     0x000C: FORM FEED */
/*     0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     0x001C: FILE SEPARATOR */
/*     0x001D: GROUP SEPARATOR */
/*     0x001E: RECORD SEPARATOR */
/*     0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
/* Forward declarations of file-local encoder helpers; they are declared
   here so that code earlier in the file can call them. */

/* Takes the error-handler name, a slot for the handler object, the encoding
   name and failure reason, the Unicode buffer with the offending
   [startpos, endpos) range, a slot for the exception object, and an output
   slot for the new position; returns a PyObject pointer. */
static PyObject *unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

/* Takes a slot for the exception object, the encoding name, the Unicode
   buffer with the offending [startpos, endpos) range, and the reason. */
static void raise_encode_exception(PyObject **exceptionObject,
                                   const char *encoding,
                                   const Py_UNICODE *unicode, Py_ssize_t size,
                                   Py_ssize_t startpos, Py_ssize_t endpos,
                                   const char *reason);
158
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127): an entry is 1 for
   the line-breaking characters named in the comments below, 0 otherwise.
   The table is never written to, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each unicode character (as many as are needed to index
   BLOOM_WIDTH bits) as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrouf068f942010-01-13 14:19:12 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
Antoine Pitrouf068f942010-01-13 14:19:12 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
Benjamin Peterson29060642009-01-31 22:14:21 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252}
253
/* Membership test: the cheap bloom-mask check runs first and the exact
   linear search in unicode_member() only when the mask bit is set.  The
   whole expansion is parenthesized so that the macro can be negated or
   combined with other operators without precedence surprises.  Note that
   chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000278 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000313 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314
315*/
316
317static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319{
320 register PyUnicodeObject *unicode;
321
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 if (length == 0 && unicode_empty != NULL) {
324 Py_INCREF(unicode_empty);
325 return unicode_empty;
326 }
327
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000328 /* Ensure we won't overflow the size. */
329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330 return (PyUnicodeObject *)PyErr_NoMemory();
331 }
332
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000334 if (free_list) {
335 unicode = free_list;
336 free_list = *(PyUnicodeObject **)unicode;
337 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000338 if (unicode->str) {
339 /* Keep-Alive optimization: we only upsize the buffer,
340 never downsize it. */
341 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000342 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000343 PyObject_DEL(unicode->str);
344 unicode->str = NULL;
345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000346 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000350 }
351 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 if (unicode == NULL)
357 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 }
361
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 PyErr_NoMemory();
364 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000365 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000366 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000367 * the caller fails before initializing str -- unicode_resize()
368 * reads str[0], and the Keep-Alive optimization can keep memory
369 * allocated for str alive across a call to unicode_dealloc(unicode).
370 * We don't want unicode_resize to read uninitialized memory in
371 * that case.
372 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000373 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000377 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000378 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380
Benjamin Peterson29060642009-01-31 22:14:21 +0000381 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000382 /* XXX UNREF/NEWREF interface should be more symmetrical */
383 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000385 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389static
Guido van Rossum9475a232001-10-05 20:51:39 +0000390void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391{
Walter Dörwald16807132007-05-25 13:52:07 +0000392 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_NOT_INTERNED:
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_MORTAL:
397 /* revive dead object temporarily for DelItem */
398 Py_REFCNT(unicode) = 3;
399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400 Py_FatalError(
401 "deletion of interned string failed");
402 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 case SSTATE_INTERNED_IMMORTAL:
405 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406
Benjamin Peterson29060642009-01-31 22:14:21 +0000407 default:
408 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000409 }
410
Guido van Rossum604ddf82001-12-06 20:03:56 +0000411 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000413 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415 PyObject_DEL(unicode->str);
416 unicode->str = NULL;
417 unicode->length = 0;
418 }
419 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000420 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 }
422 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000423 *(PyUnicodeObject **)unicode = free_list;
424 free_list = unicode;
425 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426 }
427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyObject_DEL(unicode->str);
429 Py_XDECREF(unicode->defenc);
430 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432}
433
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436{
437 register PyUnicodeObject *v;
438
439 /* Argument checks */
440 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 PyErr_BadInternalCall();
442 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
449
450 /* Resizing unicode_empty and single character objects is not
451 possible since these are being shared. We simply return a fresh
452 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000453 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000454 (v == unicode_empty || v->length == 1)) {
455 PyUnicodeObject *w = _PyUnicode_New(length);
456 if (w == NULL)
457 return -1;
458 Py_UNICODE_COPY(w->str, v->str,
459 length < v->length ? length : v->length);
460 Py_DECREF(*unicode);
461 *unicode = w;
462 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 }
464
465 /* Note that we don't have to modify *unicode for unshared Unicode
466 objects, since we can modify them in-place. */
467 return unicode_resize(v, length);
468}
469
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477{
478 PyUnicodeObject *unicode;
479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects. */
482 if (u != NULL) {
483
Benjamin Peterson29060642009-01-31 22:14:21 +0000484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000489
490 /* Single character Unicode objects in the Latin-1 range are
491 shared when using this constructor */
492 if (size == 1 && *u < 256) {
493 unicode = unicode_latin1[*u];
494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = *u;
499 unicode_latin1[*u] = unicode;
500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 /* Copy the Unicode data into the new object */
511 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000512 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 return (PyObject *)unicode;
515}
516
Walter Dörwaldd2034312007-05-18 16:29:38 +0000517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518{
519 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000520
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 if (size < 0) {
522 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 return NULL;
525 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000526
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000528 some optimizations which share commonly used objects.
529 Also, this means the input must be UTF-8, so fall back to the
530 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (u != NULL) {
532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533 /* Optimization for empty strings */
534 if (size == 0 && unicode_empty != NULL) {
535 Py_INCREF(unicode_empty);
536 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000537 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000538
539 /* Single characters are shared when using this constructor.
540 Restrict to ASCII, since the input must be UTF-8. */
541 if (size == 1 && Py_CHARMASK(*u) < 128) {
542 unicode = unicode_latin1[Py_CHARMASK(*u)];
543 if (!unicode) {
544 unicode = _PyUnicode_New(1);
545 if (!unicode)
546 return NULL;
547 unicode->str[0] = Py_CHARMASK(*u);
548 unicode_latin1[Py_CHARMASK(*u)] = unicode;
549 }
550 Py_INCREF(unicode);
551 return (PyObject *)unicode;
552 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000553
554 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 }
556
Walter Dörwald55507312007-05-18 13:12:10 +0000557 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 if (!unicode)
559 return NULL;
560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 return (PyObject *)unicode;
562}
563
Walter Dörwaldd2034312007-05-18 16:29:38 +0000564PyObject *PyUnicode_FromString(const char *u)
565{
566 size_t size = strlen(u);
567 if (size > PY_SSIZE_T_MAX) {
568 PyErr_SetString(PyExc_OverflowError, "input too long");
569 return NULL;
570 }
571
572 return PyUnicode_FromStringAndSize(u, size);
573}
574
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575#ifdef HAVE_WCHAR_H
576
Mark Dickinson081dfee2009-03-18 14:47:41 +0000577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584 to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
588{
589 PyUnicodeObject *unicode;
590 register Py_ssize_t i;
591 Py_ssize_t alloc;
592 const wchar_t *orig_w;
593
594 if (w == NULL) {
595 if (size == 0)
596 return PyUnicode_FromStringAndSize(NULL, 0);
597 PyErr_BadInternalCall();
598 return NULL;
599 }
600
601 if (size == -1) {
602 size = wcslen(w);
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000643 if (size == 0)
644 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_BadInternalCall();
646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 }
648
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == -1) {
650 size = wcslen(w);
651 }
652
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 unicode = _PyUnicode_New(size);
654 if (!unicode)
655 return NULL;
656
657 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000660#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000662 register Py_UNICODE *u;
663 register Py_ssize_t i;
664 u = PyUnicode_AS_UNICODE(unicode);
665 for (i = size; i > 0; i--)
666 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668#endif
669
670 return (PyObject *)unicode;
671}
672
Mark Dickinson081dfee2009-03-18 14:47:41 +0000673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
Walter Dörwald346737f2007-05-31 10:44:43 +0000677static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000680{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000681 *fmt++ = '%';
682 if (width) {
683 if (zeropad)
684 *fmt++ = '0';
685 fmt += sprintf(fmt, "%d", width);
686 }
687 if (precision)
688 fmt += sprintf(fmt, ".%d", precision);
689 if (longflag)
690 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000691 else if (longlongflag) {
692 /* longlongflag should only ever be nonzero on machines with
693 HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695 char *f = PY_FORMAT_LONG_LONG;
696 while (*f)
697 *fmt++ = *f++;
698#else
699 /* we shouldn't ever get here */
700 assert(0);
701 *fmt++ = 'l';
702#endif
703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000704 else if (size_tflag) {
705 char *f = PY_FORMAT_SIZE_T;
706 while (*f)
707 *fmt++ = *f++;
708 }
709 *fmt++ = c;
710 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000711}
712
/* Append the NUL-terminated char string `string` to the output position `s`,
   widening each char on assignment.  The macro relies on two variables of
   the calling scope: `s` (the output cursor, advanced past the copied
   characters) and `copy` (a scratch const char pointer).  It is wrapped in
   do { } while (0) so that "appendstring(x);" is a single statement and can
   be used safely in unbraced if/else bodies. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000728 va_list count;
729 Py_ssize_t callcount = 0;
730 PyObject **callresults = NULL;
731 PyObject **callresult = NULL;
732 Py_ssize_t n = 0;
733 int width = 0;
734 int precision = 0;
735 int zeropad;
736 const char* f;
737 Py_UNICODE *s;
738 PyObject *string;
739 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000740 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000741 /* use abuffer instead of buffer, if we need more space
742 * (which can happen if there's a format specifier with width). */
743 char *abuffer = NULL;
744 char *realbuffer;
745 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000749 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000767 else if (128 <= (unsigned char)*f) {
768 PyErr_Format(PyExc_ValueError,
769 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000770 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000771 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000772 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 }
775 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000776 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000777 if (callcount) {
778 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
779 if (!callresults) {
780 PyErr_NoMemory();
781 return NULL;
782 }
783 callresult = callresults;
784 }
785 /* step 3: figure out how large a buffer we need */
786 for (f = format; *f; f++) {
787 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000788#ifdef HAVE_LONG_LONG
789 int longlongflag = 0;
790#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 const char* p = f;
792 width = 0;
793 while (ISDIGIT((unsigned)*f))
794 width = (width*10) + *f++ - '0';
795 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
796 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
799 * they don't affect the amount of space we reserve.
800 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000801 if (*f == 'l') {
802 if (f[1] == 'd' || f[1] == 'u') {
803 ++f;
804 }
805#ifdef HAVE_LONG_LONG
806 else if (f[1] == 'l' &&
807 (f[2] == 'd' || f[2] == 'u')) {
808 longlongflag = 1;
809 f += 2;
810 }
811#endif
812 }
813 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000814 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
Benjamin Peterson14339b62009-01-31 16:36:08 +0000817 switch (*f) {
818 case 'c':
819 (void)va_arg(count, int);
820 /* fall through... */
821 case '%':
822 n++;
823 break;
824 case 'd': case 'u': case 'i': case 'x':
825 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000826#ifdef HAVE_LONG_LONG
827 if (longlongflag) {
828 if (width < MAX_LONG_LONG_CHARS)
829 width = MAX_LONG_LONG_CHARS;
830 }
831 else
832#endif
833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
834 including sign. Decimal takes the most space. This
835 isn't enough for octal. If a width is specified we
836 need more (which we allocate later). */
837 if (width < MAX_LONG_CHARS)
838 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000839 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000840 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 if (abuffersize < width)
842 abuffersize = width;
843 break;
844 case 's':
845 {
846 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000847 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000848 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
849 if (!str)
850 goto fail;
851 n += PyUnicode_GET_SIZE(str);
852 /* Remember the str and switch to the next slot */
853 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000854 break;
855 }
856 case 'U':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 assert(obj && PyUnicode_Check(obj));
860 n += PyUnicode_GET_SIZE(obj);
861 break;
862 }
863 case 'V':
864 {
865 PyObject *obj = va_arg(count, PyObject *);
866 const char *str = va_arg(count, const char *);
867 assert(obj || str);
868 assert(!obj || PyUnicode_Check(obj));
869 if (obj)
870 n += PyUnicode_GET_SIZE(obj);
871 else
872 n += strlen(str);
873 break;
874 }
875 case 'S':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 PyObject *str;
879 assert(obj);
880 str = PyObject_Str(obj);
881 if (!str)
882 goto fail;
883 n += PyUnicode_GET_SIZE(str);
884 /* Remember the str and switch to the next slot */
885 *callresult++ = str;
886 break;
887 }
888 case 'R':
889 {
890 PyObject *obj = va_arg(count, PyObject *);
891 PyObject *repr;
892 assert(obj);
893 repr = PyObject_Repr(obj);
894 if (!repr)
895 goto fail;
896 n += PyUnicode_GET_SIZE(repr);
897 /* Remember the repr and switch to the next slot */
898 *callresult++ = repr;
899 break;
900 }
901 case 'A':
902 {
903 PyObject *obj = va_arg(count, PyObject *);
904 PyObject *ascii;
905 assert(obj);
906 ascii = PyObject_ASCII(obj);
907 if (!ascii)
908 goto fail;
909 n += PyUnicode_GET_SIZE(ascii);
910 /* Remember the repr and switch to the next slot */
911 *callresult++ = ascii;
912 break;
913 }
914 case 'p':
915 (void) va_arg(count, int);
916 /* maximum 64-bit pointer representation:
917 * 0xffffffffffffffff
918 * so 19 characters is enough.
919 * XXX I count 18 -- what's the extra for?
920 */
921 n += 19;
922 break;
923 default:
924 /* if we stumble upon an unknown
925 formatting code, copy the rest of
926 the format string to the output
927 string. (we cannot just skip the
928 code, since there's no way to know
929 what's in the argument list) */
930 n += strlen(p);
931 goto expand;
932 }
933 } else
934 n++;
935 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000937 if (abuffersize > ITEM_BUFFER_LEN) {
938 /* add 1 for sprintf's trailing null byte */
939 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000940 if (!abuffer) {
941 PyErr_NoMemory();
942 goto fail;
943 }
944 realbuffer = abuffer;
945 }
946 else
947 realbuffer = buffer;
948 /* step 4: fill the buffer */
949 /* Since we've analyzed how much space we need for the worst case,
950 we don't have to resize the string.
951 There can be no errors beyond this point. */
952 string = PyUnicode_FromUnicode(NULL, n);
953 if (!string)
954 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000955
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 s = PyUnicode_AS_UNICODE(string);
957 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000958
Benjamin Peterson14339b62009-01-31 16:36:08 +0000959 for (f = format; *f; f++) {
960 if (*f == '%') {
961 const char* p = f++;
962 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000963 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 int size_tflag = 0;
965 zeropad = (*f == '0');
966 /* parse the width.precision part */
967 width = 0;
968 while (ISDIGIT((unsigned)*f))
969 width = (width*10) + *f++ - '0';
970 precision = 0;
971 if (*f == '.') {
972 f++;
973 while (ISDIGIT((unsigned)*f))
974 precision = (precision*10) + *f++ - '0';
975 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000976 /* Handle %ld, %lu, %lld and %llu. */
977 if (*f == 'l') {
978 if (f[1] == 'd' || f[1] == 'u') {
979 longflag = 1;
980 ++f;
981 }
982#ifdef HAVE_LONG_LONG
983 else if (f[1] == 'l' &&
984 (f[2] == 'd' || f[2] == 'u')) {
985 longlongflag = 1;
986 f += 2;
987 }
988#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 }
990 /* handle the size_t flag. */
991 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
992 size_tflag = 1;
993 ++f;
994 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 switch (*f) {
997 case 'c':
998 *s++ = va_arg(vargs, int);
999 break;
1000 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1002 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001003 if (longflag)
1004 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001005#ifdef HAVE_LONG_LONG
1006 else if (longlongflag)
1007 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1008#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001009 else if (size_tflag)
1010 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1011 else
1012 sprintf(realbuffer, fmt, va_arg(vargs, int));
1013 appendstring(realbuffer);
1014 break;
1015 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001016 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1017 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 if (longflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001020#ifdef HAVE_LONG_LONG
1021 else if (longlongflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs,
1023 unsigned PY_LONG_LONG));
1024#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001025 else if (size_tflag)
1026 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1027 else
1028 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1029 appendstring(realbuffer);
1030 break;
1031 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 sprintf(realbuffer, fmt, va_arg(vargs, int));
1034 appendstring(realbuffer);
1035 break;
1036 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001037 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 's':
1042 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001043 /* unused, since we already have the result */
1044 (void) va_arg(vargs, char *);
1045 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1046 PyUnicode_GET_SIZE(*callresult));
1047 s += PyUnicode_GET_SIZE(*callresult);
1048 /* We're done with the unicode()/repr() => forget it */
1049 Py_DECREF(*callresult);
1050 /* switch to next unicode()/repr() result */
1051 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001052 break;
1053 }
1054 case 'U':
1055 {
1056 PyObject *obj = va_arg(vargs, PyObject *);
1057 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1058 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1059 s += size;
1060 break;
1061 }
1062 case 'V':
1063 {
1064 PyObject *obj = va_arg(vargs, PyObject *);
1065 const char *str = va_arg(vargs, const char *);
1066 if (obj) {
1067 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1068 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1069 s += size;
1070 } else {
1071 appendstring(str);
1072 }
1073 break;
1074 }
1075 case 'S':
1076 case 'R':
1077 {
1078 Py_UNICODE *ucopy;
1079 Py_ssize_t usize;
1080 Py_ssize_t upos;
1081 /* unused, since we already have the result */
1082 (void) va_arg(vargs, PyObject *);
1083 ucopy = PyUnicode_AS_UNICODE(*callresult);
1084 usize = PyUnicode_GET_SIZE(*callresult);
1085 for (upos = 0; upos<usize;)
1086 *s++ = ucopy[upos++];
1087 /* We're done with the unicode()/repr() => forget it */
1088 Py_DECREF(*callresult);
1089 /* switch to next unicode()/repr() result */
1090 ++callresult;
1091 break;
1092 }
1093 case 'p':
1094 sprintf(buffer, "%p", va_arg(vargs, void*));
1095 /* %p is ill-defined: ensure leading 0x. */
1096 if (buffer[1] == 'X')
1097 buffer[1] = 'x';
1098 else if (buffer[1] != 'x') {
1099 memmove(buffer+2, buffer, strlen(buffer)+1);
1100 buffer[0] = '0';
1101 buffer[1] = 'x';
1102 }
1103 appendstring(buffer);
1104 break;
1105 case '%':
1106 *s++ = '%';
1107 break;
1108 default:
1109 appendstring(p);
1110 goto end;
1111 }
Victor Stinner1205f272010-09-11 00:54:47 +00001112 }
Victor Stinner1205f272010-09-11 00:54:47 +00001113 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001114 *s++ = *f;
1115 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 if (callresults)
1119 PyObject_Free(callresults);
1120 if (abuffer)
1121 PyObject_Free(abuffer);
1122 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1123 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 if (callresults) {
1126 PyObject **callresult2 = callresults;
1127 while (callresult2 < callresult) {
1128 Py_DECREF(*callresult2);
1129 ++callresult2;
1130 }
1131 PyObject_Free(callresults);
1132 }
1133 if (abuffer)
1134 PyObject_Free(abuffer);
1135 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001136}
1137
1138#undef appendstring
1139
1140PyObject *
1141PyUnicode_FromFormat(const char *format, ...)
1142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 PyObject* ret;
1144 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001145
1146#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001147 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001150#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 ret = PyUnicode_FromFormatV(format, vargs);
1152 va_end(vargs);
1153 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001154}
1155
Victor Stinner137c34c2010-09-29 10:25:54 +00001156static void
1157unicode_aswidechar(PyUnicodeObject *unicode,
1158 wchar_t *w,
1159 Py_ssize_t size)
1160{
1161#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1162 memcpy(w, unicode->str, size * sizeof(wchar_t));
1163#else
1164 register Py_UNICODE *u;
1165 register Py_ssize_t i;
1166 u = PyUnicode_AS_UNICODE(unicode);
1167 for (i = size; i > 0; i--)
1168 *w++ = *u++;
1169#endif
1170}
1171
1172Py_ssize_t
1173PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1174 wchar_t *w,
1175 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176{
1177 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 PyErr_BadInternalCall();
1179 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001181
1182 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001185
Victor Stinner137c34c2010-09-29 10:25:54 +00001186 unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001188 if (size > PyUnicode_GET_SIZE(unicode))
1189 return PyUnicode_GET_SIZE(unicode);
1190 else
Benjamin Peterson29060642009-01-31 22:14:21 +00001191 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192}
1193
Victor Stinner137c34c2010-09-29 10:25:54 +00001194wchar_t*
1195PyUnicode_AsWideCharString(PyUnicodeObject *unicode,
1196 Py_ssize_t *size)
1197{
1198 wchar_t* buffer;
1199 Py_ssize_t buflen;
1200
1201 if (unicode == NULL) {
1202 PyErr_BadInternalCall();
1203 return NULL;
1204 }
1205
1206 if ((PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) < PyUnicode_GET_SIZE(unicode)) {
1207 PyErr_NoMemory();
1208 return NULL;
1209 }
1210
1211 buflen = PyUnicode_GET_SIZE(unicode) + 1; /* copy L'\0' */
1212 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1213 if (buffer == NULL) {
1214 PyErr_NoMemory();
1215 return NULL;
1216 }
1217 unicode_aswidechar(unicode, buffer, buflen);
Victor Stinner71e91a32010-09-29 17:55:12 +00001218 if (size)
Victor Stinner1c24bd02010-10-02 11:03:13 +00001219 *size = buflen - 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00001220 return buffer;
1221}
1222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223#endif
1224
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001225PyObject *PyUnicode_FromOrdinal(int ordinal)
1226{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001227 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001228
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001229 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001230 PyErr_SetString(PyExc_ValueError,
1231 "chr() arg not in range(0x110000)");
1232 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001233 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001234
1235#ifndef Py_UNICODE_WIDE
1236 if (ordinal > 0xffff) {
1237 ordinal -= 0x10000;
1238 s[0] = 0xD800 | (ordinal >> 10);
1239 s[1] = 0xDC00 | (ordinal & 0x3FF);
1240 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001241 }
1242#endif
1243
Hye-Shik Chang40574832004-04-06 07:24:51 +00001244 s[0] = (Py_UNICODE)ordinal;
1245 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001246}
1247
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248PyObject *PyUnicode_FromObject(register PyObject *obj)
1249{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001250 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001251 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001252 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001253 Py_INCREF(obj);
1254 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001255 }
1256 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001257 /* For a Unicode subtype that's not a Unicode object,
1258 return a true Unicode object with the same data. */
1259 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1260 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001261 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001262 PyErr_Format(PyExc_TypeError,
1263 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001264 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001265 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001266}
1267
1268PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001269 const char *encoding,
1270 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001271{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001272 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001273 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001274
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001276 PyErr_BadInternalCall();
1277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001279
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001280 /* Decoding bytes objects is the most common case and should be fast */
1281 if (PyBytes_Check(obj)) {
1282 if (PyBytes_GET_SIZE(obj) == 0) {
1283 Py_INCREF(unicode_empty);
1284 v = (PyObject *) unicode_empty;
1285 }
1286 else {
1287 v = PyUnicode_Decode(
1288 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1289 encoding, errors);
1290 }
1291 return v;
1292 }
1293
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001294 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001295 PyErr_SetString(PyExc_TypeError,
1296 "decoding str is not supported");
1297 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001298 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001299
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001300 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1301 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1302 PyErr_Format(PyExc_TypeError,
1303 "coercing to str: need bytes, bytearray "
1304 "or buffer-like object, %.80s found",
1305 Py_TYPE(obj)->tp_name);
1306 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001307 }
Tim Petersced69f82003-09-16 20:30:58 +00001308
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001309 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001310 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001311 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 }
Tim Petersced69f82003-09-16 20:30:58 +00001313 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001314 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001315
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001316 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001317 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318}
1319
/* Write a normalized copy of the encoding name into lower: upper case
   letters are converted to lower case and every '_' is replaced by '-',
   so that e.g. "UTF_8" becomes "utf-8".

   lower_len is the size of the lower buffer in bytes.  Return 1 on success
   (lower is then NUL-terminated) and 0 if encoding is longer than
   lower_len-1 characters (lower is then left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *last = &lower[lower_len - 1];   /* slot reserved for the NUL */

    for (src = encoding; *src; src++) {
        if (dst == last)
            return 0;
        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1352
1353PyObject *PyUnicode_Decode(const char *s,
1354 Py_ssize_t size,
1355 const char *encoding,
1356 const char *errors)
1357{
1358 PyObject *buffer = NULL, *unicode;
1359 Py_buffer info;
1360 char lower[11]; /* Enough for any encoding shortcut */
1361
1362 if (encoding == NULL)
1363 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001364
1365 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001366 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1367 if (strcmp(lower, "utf-8") == 0)
1368 return PyUnicode_DecodeUTF8(s, size, errors);
1369 else if ((strcmp(lower, "latin-1") == 0) ||
1370 (strcmp(lower, "iso-8859-1") == 0))
1371 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001372#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001373 else if (strcmp(lower, "mbcs") == 0)
1374 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001375#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001376 else if (strcmp(lower, "ascii") == 0)
1377 return PyUnicode_DecodeASCII(s, size, errors);
1378 else if (strcmp(lower, "utf-16") == 0)
1379 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1380 else if (strcmp(lower, "utf-32") == 0)
1381 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383
1384 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001385 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001386 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001387 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001388 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 if (buffer == NULL)
1390 goto onError;
1391 unicode = PyCodec_Decode(buffer, encoding, errors);
1392 if (unicode == NULL)
1393 goto onError;
1394 if (!PyUnicode_Check(unicode)) {
1395 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001396 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001397 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 Py_DECREF(unicode);
1399 goto onError;
1400 }
1401 Py_DECREF(buffer);
1402 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001403
Benjamin Peterson29060642009-01-31 22:14:21 +00001404 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405 Py_XDECREF(buffer);
1406 return NULL;
1407}
1408
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001409PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1410 const char *encoding,
1411 const char *errors)
1412{
1413 PyObject *v;
1414
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419
1420 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001422
1423 /* Decode via the codec registry */
1424 v = PyCodec_Decode(unicode, encoding, errors);
1425 if (v == NULL)
1426 goto onError;
1427 return v;
1428
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001430 return NULL;
1431}
1432
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001433PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1434 const char *encoding,
1435 const char *errors)
1436{
1437 PyObject *v;
1438
1439 if (!PyUnicode_Check(unicode)) {
1440 PyErr_BadArgument();
1441 goto onError;
1442 }
1443
1444 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001445 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001446
1447 /* Decode via the codec registry */
1448 v = PyCodec_Decode(unicode, encoding, errors);
1449 if (v == NULL)
1450 goto onError;
1451 if (!PyUnicode_Check(v)) {
1452 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001453 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001454 Py_TYPE(v)->tp_name);
1455 Py_DECREF(v);
1456 goto onError;
1457 }
1458 return v;
1459
Benjamin Peterson29060642009-01-31 22:14:21 +00001460 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001461 return NULL;
1462}
1463
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001465 Py_ssize_t size,
1466 const char *encoding,
1467 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001468{
1469 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001470
Guido van Rossumd57fd912000-03-10 22:53:23 +00001471 unicode = PyUnicode_FromUnicode(s, size);
1472 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1475 Py_DECREF(unicode);
1476 return v;
1477}
1478
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001479PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1480 const char *encoding,
1481 const char *errors)
1482{
1483 PyObject *v;
1484
1485 if (!PyUnicode_Check(unicode)) {
1486 PyErr_BadArgument();
1487 goto onError;
1488 }
1489
1490 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001492
1493 /* Encode via the codec registry */
1494 v = PyCodec_Encode(unicode, encoding, errors);
1495 if (v == NULL)
1496 goto onError;
1497 return v;
1498
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001500 return NULL;
1501}
1502
Victor Stinnerae6265f2010-05-15 16:27:27 +00001503PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1504{
Victor Stinner313a1202010-06-11 23:56:51 +00001505 if (Py_FileSystemDefaultEncoding) {
1506#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1507 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1508 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1509 PyUnicode_GET_SIZE(unicode),
1510 NULL);
1511#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001512 return PyUnicode_AsEncodedString(unicode,
1513 Py_FileSystemDefaultEncoding,
1514 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001515 }
1516 else {
1517 /* if you change the default encoding, update also
1518 PyUnicode_DecodeFSDefaultAndSize() and redecode_filenames() */
Victor Stinnerae6265f2010-05-15 16:27:27 +00001519 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Victor Stinner3119ed72010-08-18 22:26:50 +00001520 PyUnicode_GET_SIZE(unicode),
1521 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001522 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00001523}
1524
Guido van Rossumd57fd912000-03-10 22:53:23 +00001525PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1526 const char *encoding,
1527 const char *errors)
1528{
1529 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001530 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001531
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 if (!PyUnicode_Check(unicode)) {
1533 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001534 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 }
Fred Drakee4315f52000-05-09 19:53:39 +00001536
Tim Petersced69f82003-09-16 20:30:58 +00001537 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001538 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001539
1540 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001541 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1542 if (strcmp(lower, "utf-8") == 0)
1543 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1544 PyUnicode_GET_SIZE(unicode),
1545 errors);
1546 else if ((strcmp(lower, "latin-1") == 0) ||
1547 (strcmp(lower, "iso-8859-1") == 0))
1548 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1549 PyUnicode_GET_SIZE(unicode),
1550 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001551#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001552 else if (strcmp(lower, "mbcs") == 0)
1553 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1554 PyUnicode_GET_SIZE(unicode),
1555 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001556#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001557 else if (strcmp(lower, "ascii") == 0)
1558 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1559 PyUnicode_GET_SIZE(unicode),
1560 errors);
1561 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001562 /* During bootstrap, we may need to find the encodings
1563 package, to load the file system encoding, and require the
1564 file system encoding in order to load the encodings
1565 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001566
Victor Stinner59e62db2010-05-15 13:14:32 +00001567 Break out of this dependency by assuming that the path to
1568 the encodings module is ASCII-only. XXX could try wcstombs
1569 instead, if the file system encoding is the locale's
1570 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001571 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001572 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1573 !PyThreadState_GET()->interp->codecs_initialized)
1574 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1575 PyUnicode_GET_SIZE(unicode),
1576 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577
1578 /* Encode via the codec registry */
1579 v = PyCodec_Encode(unicode, encoding, errors);
1580 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001581 return NULL;
1582
1583 /* The normal path */
1584 if (PyBytes_Check(v))
1585 return v;
1586
1587 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001588 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001589 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001590 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001591
1592 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1593 "encoder %s returned bytearray instead of bytes",
1594 encoding);
1595 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001596 Py_DECREF(v);
1597 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001598 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001599
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001600 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1601 Py_DECREF(v);
1602 return b;
1603 }
1604
1605 PyErr_Format(PyExc_TypeError,
1606 "encoder did not return a bytes object (type=%.400s)",
1607 Py_TYPE(v)->tp_name);
1608 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001609 return NULL;
1610}
1611
1612PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1613 const char *encoding,
1614 const char *errors)
1615{
1616 PyObject *v;
1617
1618 if (!PyUnicode_Check(unicode)) {
1619 PyErr_BadArgument();
1620 goto onError;
1621 }
1622
1623 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001624 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001625
1626 /* Encode via the codec registry */
1627 v = PyCodec_Encode(unicode, encoding, errors);
1628 if (v == NULL)
1629 goto onError;
1630 if (!PyUnicode_Check(v)) {
1631 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001632 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001633 Py_TYPE(v)->tp_name);
1634 Py_DECREF(v);
1635 goto onError;
1636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001638
Benjamin Peterson29060642009-01-31 22:14:21 +00001639 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640 return NULL;
1641}
1642
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001643PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001644 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001645{
1646 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001647 if (v)
1648 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001649 if (errors != NULL)
1650 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001651 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001652 PyUnicode_GET_SIZE(unicode),
1653 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001654 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001655 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001656 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001657 return v;
1658}
1659
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001660PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001661PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001662 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001663 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1664}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001665
Christian Heimes5894ba72007-11-04 11:43:14 +00001666PyObject*
1667PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1668{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001669 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1670 can be undefined. If it is case, decode using UTF-8. The following assumes
1671 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1672 bootstrapping process where the codecs aren't ready yet.
1673 */
1674 if (Py_FileSystemDefaultEncoding) {
1675#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001676 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001677 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001678 }
1679#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001680 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001681 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001682 }
1683#endif
1684 return PyUnicode_Decode(s, size,
1685 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001686 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001687 }
1688 else {
Victor Stinnerc39211f2010-09-29 16:35:47 +00001689 /* if you change the default encoding, update also
1690 PyUnicode_EncodeFSDefault() and redecode_filenames() */
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001691 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001692 }
1693}
1694
Martin v. Löwis011e8422009-05-05 04:43:17 +00001695
1696int
1697PyUnicode_FSConverter(PyObject* arg, void* addr)
1698{
1699 PyObject *output = NULL;
1700 Py_ssize_t size;
1701 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001702 if (arg == NULL) {
1703 Py_DECREF(*(PyObject**)addr);
1704 return 1;
1705 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001706 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001707 output = arg;
1708 Py_INCREF(output);
1709 }
1710 else {
1711 arg = PyUnicode_FromObject(arg);
1712 if (!arg)
1713 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001714 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001715 Py_DECREF(arg);
1716 if (!output)
1717 return 0;
1718 if (!PyBytes_Check(output)) {
1719 Py_DECREF(output);
1720 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1721 return 0;
1722 }
1723 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001724 size = PyBytes_GET_SIZE(output);
1725 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001726 if (size != strlen(data)) {
1727 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1728 Py_DECREF(output);
1729 return 0;
1730 }
1731 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001732 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001733}
1734
1735
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001736int
1737PyUnicode_FSDecoder(PyObject* arg, void* addr)
1738{
1739 PyObject *output = NULL;
1740 Py_ssize_t size;
1741 void *data;
1742 if (arg == NULL) {
1743 Py_DECREF(*(PyObject**)addr);
1744 return 1;
1745 }
1746 if (PyUnicode_Check(arg)) {
1747 output = arg;
1748 Py_INCREF(output);
1749 }
1750 else {
1751 arg = PyBytes_FromObject(arg);
1752 if (!arg)
1753 return 0;
1754 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1755 PyBytes_GET_SIZE(arg));
1756 Py_DECREF(arg);
1757 if (!output)
1758 return 0;
1759 if (!PyUnicode_Check(output)) {
1760 Py_DECREF(output);
1761 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1762 return 0;
1763 }
1764 }
1765 size = PyUnicode_GET_SIZE(output);
1766 data = PyUnicode_AS_UNICODE(output);
1767 if (size != Py_UNICODE_strlen(data)) {
1768 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1769 Py_DECREF(output);
1770 return 0;
1771 }
1772 *(PyObject**)addr = output;
1773 return Py_CLEANUP_SUPPORTED;
1774}
1775
1776
Martin v. Löwis5b222132007-06-10 09:51:05 +00001777char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001778_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001779{
Christian Heimesf3863112007-11-22 07:46:41 +00001780 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001781 if (!PyUnicode_Check(unicode)) {
1782 PyErr_BadArgument();
1783 return NULL;
1784 }
Christian Heimesf3863112007-11-22 07:46:41 +00001785 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1786 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001787 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001788 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001789 *psize = PyBytes_GET_SIZE(bytes);
1790 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001791}
1792
1793char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001794_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001795{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001796 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001797}
1798
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1800{
1801 if (!PyUnicode_Check(unicode)) {
1802 PyErr_BadArgument();
1803 goto onError;
1804 }
1805 return PyUnicode_AS_UNICODE(unicode);
1806
Benjamin Peterson29060642009-01-31 22:14:21 +00001807 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 return NULL;
1809}
1810
Martin v. Löwis18e16552006-02-15 17:27:45 +00001811Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812{
1813 if (!PyUnicode_Check(unicode)) {
1814 PyErr_BadArgument();
1815 goto onError;
1816 }
1817 return PyUnicode_GET_SIZE(unicode);
1818
Benjamin Peterson29060642009-01-31 22:14:21 +00001819 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001820 return -1;
1821}
1822
/* Return the name of the default encoding, which is always "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    const char *name = "utf-8";

    return name;
}
1827
Victor Stinner554f3f02010-06-16 23:33:54 +00001828/* create or adjust a UnicodeDecodeError */
1829static void
1830make_decode_exception(PyObject **exceptionObject,
1831 const char *encoding,
1832 const char *input, Py_ssize_t length,
1833 Py_ssize_t startpos, Py_ssize_t endpos,
1834 const char *reason)
1835{
1836 if (*exceptionObject == NULL) {
1837 *exceptionObject = PyUnicodeDecodeError_Create(
1838 encoding, input, length, startpos, endpos, reason);
1839 }
1840 else {
1841 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1842 goto onError;
1843 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1844 goto onError;
1845 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1846 goto onError;
1847 }
1848 return;
1849
1850onError:
1851 Py_DECREF(*exceptionObject);
1852 *exceptionObject = NULL;
1853}
1854
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001855/* error handling callback helper:
1856 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001857 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 and adjust various state variables.
1859 return 0 on success, -1 on error
1860*/
1861
1862static
1863int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001864 const char *encoding, const char *reason,
1865 const char **input, const char **inend, Py_ssize_t *startinpos,
1866 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1867 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001868{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001869 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001870
1871 PyObject *restuple = NULL;
1872 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001873 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001874 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001875 Py_ssize_t requiredsize;
1876 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001877 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001878 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001879 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001880 int res = -1;
1881
1882 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001883 *errorHandler = PyCodec_LookupError(errors);
1884 if (*errorHandler == NULL)
1885 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001886 }
1887
Victor Stinner554f3f02010-06-16 23:33:54 +00001888 make_decode_exception(exceptionObject,
1889 encoding,
1890 *input, *inend - *input,
1891 *startinpos, *endinpos,
1892 reason);
1893 if (*exceptionObject == NULL)
1894 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895
1896 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1897 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001898 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001899 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00001900 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 }
1903 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001905
1906 /* Copy back the bytes variables, which might have been modified by the
1907 callback */
1908 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1909 if (!inputobj)
1910 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00001911 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001912 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00001913 }
Christian Heimes72b710a2008-05-26 13:28:38 +00001914 *input = PyBytes_AS_STRING(inputobj);
1915 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001916 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00001917 /* we can DECREF safely, as the exception has another reference,
1918 so the object won't go away. */
1919 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001920
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001921 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001922 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001923 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001924 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1925 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001926 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001927
1928 /* need more space? (at least enough for what we
1929 have+the replacement+the rest of the string (starting
1930 at the new input position), so we won't have to check space
1931 when there are no errors in the rest of the string) */
1932 repptr = PyUnicode_AS_UNICODE(repunicode);
1933 repsize = PyUnicode_GET_SIZE(repunicode);
1934 requiredsize = *outpos + repsize + insize-newpos;
1935 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001936 if (requiredsize<2*outsize)
1937 requiredsize = 2*outsize;
1938 if (_PyUnicode_Resize(output, requiredsize) < 0)
1939 goto onError;
1940 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001941 }
1942 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001943 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001944 Py_UNICODE_COPY(*outptr, repptr, repsize);
1945 *outptr += repsize;
1946 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001947
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001948 /* we made it! */
1949 res = 0;
1950
Benjamin Peterson29060642009-01-31 22:14:21 +00001951 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001952 Py_XDECREF(restuple);
1953 return res;
1954}
1955
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that all of the macros in
 * this section evaluate their character argument more than once, so it
 * must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* The table is only ever read, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions (e.g. "a || b") can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002036
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002037PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002038 Py_ssize_t size,
2039 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002040{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002041 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2042}
2043
Antoine Pitrou244651a2009-05-04 18:56:13 +00002044/* The decoder. The only state we preserve is our read position,
2045 * i.e. how many characters we have consumed. So if we end in the
2046 * middle of a shift sequence we have to back off the read position
2047 * and the output to the beginning of the sequence, otherwise we lose
2048 * all the shift state (seen bits, number of bits seen, high
2049 * surrogate). */
2050
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002051PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002052 Py_ssize_t size,
2053 const char *errors,
2054 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002055{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002057 Py_ssize_t startinpos;
2058 Py_ssize_t endinpos;
2059 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002060 const char *e;
2061 PyUnicodeObject *unicode;
2062 Py_UNICODE *p;
2063 const char *errmsg = "";
2064 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002065 Py_UNICODE *shiftOutStart;
2066 unsigned int base64bits = 0;
2067 unsigned long base64buffer = 0;
2068 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069 PyObject *errorHandler = NULL;
2070 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002071
2072 unicode = _PyUnicode_New(size);
2073 if (!unicode)
2074 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002075 if (size == 0) {
2076 if (consumed)
2077 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002078 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002079 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002080
2081 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002082 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002083 e = s + size;
2084
2085 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002088 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002089
Antoine Pitrou244651a2009-05-04 18:56:13 +00002090 if (inShift) { /* in a base-64 section */
2091 if (IS_BASE64(ch)) { /* consume a base-64 character */
2092 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2093 base64bits += 6;
2094 s++;
2095 if (base64bits >= 16) {
2096 /* we have enough bits for a UTF-16 value */
2097 Py_UNICODE outCh = (Py_UNICODE)
2098 (base64buffer >> (base64bits-16));
2099 base64bits -= 16;
2100 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2101 if (surrogate) {
2102 /* expecting a second surrogate */
2103 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2104#ifdef Py_UNICODE_WIDE
2105 *p++ = (((surrogate & 0x3FF)<<10)
2106 | (outCh & 0x3FF)) + 0x10000;
2107#else
2108 *p++ = surrogate;
2109 *p++ = outCh;
2110#endif
2111 surrogate = 0;
2112 }
2113 else {
2114 surrogate = 0;
2115 errmsg = "second surrogate missing";
2116 goto utf7Error;
2117 }
2118 }
2119 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2120 /* first surrogate */
2121 surrogate = outCh;
2122 }
2123 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2124 errmsg = "unexpected second surrogate";
2125 goto utf7Error;
2126 }
2127 else {
2128 *p++ = outCh;
2129 }
2130 }
2131 }
2132 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002133 inShift = 0;
2134 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002135 if (surrogate) {
2136 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002137 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002138 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002139 if (base64bits > 0) { /* left-over bits */
2140 if (base64bits >= 6) {
2141 /* We've seen at least one base-64 character */
2142 errmsg = "partial character in shift sequence";
2143 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002144 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002145 else {
2146 /* Some bits remain; they should be zero */
2147 if (base64buffer != 0) {
2148 errmsg = "non-zero padding bits in shift sequence";
2149 goto utf7Error;
2150 }
2151 }
2152 }
2153 if (ch != '-') {
2154 /* '-' is absorbed; other terminating
2155 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002156 *p++ = ch;
2157 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002158 }
2159 }
2160 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002161 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002162 s++; /* consume '+' */
2163 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002164 s++;
2165 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002166 }
2167 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002168 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002169 shiftOutStart = p;
2170 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002171 }
2172 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002173 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002174 *p++ = ch;
2175 s++;
2176 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002177 else {
2178 startinpos = s-starts;
2179 s++;
2180 errmsg = "unexpected special character";
2181 goto utf7Error;
2182 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002183 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002184utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002185 outpos = p-PyUnicode_AS_UNICODE(unicode);
2186 endinpos = s-starts;
2187 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002188 errors, &errorHandler,
2189 "utf7", errmsg,
2190 &starts, &e, &startinpos, &endinpos, &exc, &s,
2191 &unicode, &outpos, &p))
2192 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002193 }
2194
Antoine Pitrou244651a2009-05-04 18:56:13 +00002195 /* end of string */
2196
2197 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2198 /* if we're in an inconsistent state, that's an error */
2199 if (surrogate ||
2200 (base64bits >= 6) ||
2201 (base64bits > 0 && base64buffer != 0)) {
2202 outpos = p-PyUnicode_AS_UNICODE(unicode);
2203 endinpos = size;
2204 if (unicode_decode_call_errorhandler(
2205 errors, &errorHandler,
2206 "utf7", "unterminated shift sequence",
2207 &starts, &e, &startinpos, &endinpos, &exc, &s,
2208 &unicode, &outpos, &p))
2209 goto onError;
2210 if (s < e)
2211 goto restart;
2212 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002213 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002214
2215 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002216 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002217 if (inShift) {
2218 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002219 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002220 }
2221 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002222 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002223 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002224 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002225
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002226 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002227 goto onError;
2228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002229 Py_XDECREF(errorHandler);
2230 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002231 return (PyObject *)unicode;
2232
Benjamin Peterson29060642009-01-31 22:14:21 +00002233 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002234 Py_XDECREF(errorHandler);
2235 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002236 Py_DECREF(unicode);
2237 return NULL;
2238}
2239
2240
2241PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002242 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002243 int base64SetO,
2244 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002245 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002246{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002247 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002248 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002249 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002250 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002251 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002252 unsigned int base64bits = 0;
2253 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002254 char * out;
2255 char * start;
2256
2257 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002258 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002259
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002260 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002261 return PyErr_NoMemory();
2262
Antoine Pitrou244651a2009-05-04 18:56:13 +00002263 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002264 if (v == NULL)
2265 return NULL;
2266
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002267 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268 for (;i < size; ++i) {
2269 Py_UNICODE ch = s[i];
2270
Antoine Pitrou244651a2009-05-04 18:56:13 +00002271 if (inShift) {
2272 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2273 /* shifting out */
2274 if (base64bits) { /* output remaining bits */
2275 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2276 base64buffer = 0;
2277 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002278 }
2279 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002280 /* Characters not in the BASE64 set implicitly unshift the sequence
2281 so no '-' is required, except if the character is itself a '-' */
2282 if (IS_BASE64(ch) || ch == '-') {
2283 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002284 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002285 *out++ = (char) ch;
2286 }
2287 else {
2288 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002289 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002290 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002291 else { /* not in a shift sequence */
2292 if (ch == '+') {
2293 *out++ = '+';
2294 *out++ = '-';
2295 }
2296 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2297 *out++ = (char) ch;
2298 }
2299 else {
2300 *out++ = '+';
2301 inShift = 1;
2302 goto encode_char;
2303 }
2304 }
2305 continue;
2306encode_char:
2307#ifdef Py_UNICODE_WIDE
2308 if (ch >= 0x10000) {
2309 /* code first surrogate */
2310 base64bits += 16;
2311 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2312 while (base64bits >= 6) {
2313 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2314 base64bits -= 6;
2315 }
2316 /* prepare second surrogate */
2317 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2318 }
2319#endif
2320 base64bits += 16;
2321 base64buffer = (base64buffer << 16) | ch;
2322 while (base64bits >= 6) {
2323 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2324 base64bits -= 6;
2325 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002326 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 if (base64bits)
2328 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2329 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002330 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002331 if (_PyBytes_Resize(&v, out - start) < 0)
2332 return NULL;
2333 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002334}
2335
Antoine Pitrou244651a2009-05-04 18:56:13 +00002336#undef IS_BASE64
2337#undef FROM_BASE64
2338#undef TO_BASE64
2339#undef DECODE_DIRECT
2340#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002341
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342/* --- UTF-8 Codec -------------------------------------------------------- */
2343
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix:
   continuation bytes 80-BF, the overlong-only leads C0-C1, and F5-FF.
   See RFC 3629 for details.  The table is read-only, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2365
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002367 Py_ssize_t size,
2368 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369{
Walter Dörwald69652032004-09-07 20:24:22 +00002370 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2371}
2372
Antoine Pitrouab868312009-01-10 15:40:25 +00002373/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2374#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2375
2376/* Mask to quickly check whether a C 'long' contains a
2377 non-ASCII, UTF8-encoded char. */
2378#if (SIZEOF_LONG == 8)
2379# define ASCII_CHAR_MASK 0x8080808080808080L
2380#elif (SIZEOF_LONG == 4)
2381# define ASCII_CHAR_MASK 0x80808080L
2382#else
2383# error C 'long' size should be either 4 or 8!
2384#endif
2385
Walter Dörwald69652032004-09-07 20:24:22 +00002386PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002387 Py_ssize_t size,
2388 const char *errors,
2389 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002390{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002391 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002392 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002393 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002394 Py_ssize_t startinpos;
2395 Py_ssize_t endinpos;
2396 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002397 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 PyUnicodeObject *unicode;
2399 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002400 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002401 PyObject *errorHandler = NULL;
2402 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403
2404 /* Note: size will always be longer than the resulting Unicode
2405 character count */
2406 unicode = _PyUnicode_New(size);
2407 if (!unicode)
2408 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002409 if (size == 0) {
2410 if (consumed)
2411 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414
2415 /* Unpack UTF-8 encoded data */
2416 p = unicode->str;
2417 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002418 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002419
2420 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002421 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002422
2423 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002424 /* Fast path for runs of ASCII characters. Given that common UTF-8
2425 input will consist of an overwhelming majority of ASCII
2426 characters, we try to optimize for this case by checking
2427 as many characters as a C 'long' can contain.
2428 First, check if we can do an aligned read, as most CPUs have
2429 a penalty for unaligned reads.
2430 */
2431 if (!((size_t) s & LONG_PTR_MASK)) {
2432 /* Help register allocation */
2433 register const char *_s = s;
2434 register Py_UNICODE *_p = p;
2435 while (_s < aligned_end) {
2436 /* Read a whole long at a time (either 4 or 8 bytes),
2437 and do a fast unrolled copy if it only contains ASCII
2438 characters. */
2439 unsigned long data = *(unsigned long *) _s;
2440 if (data & ASCII_CHAR_MASK)
2441 break;
2442 _p[0] = (unsigned char) _s[0];
2443 _p[1] = (unsigned char) _s[1];
2444 _p[2] = (unsigned char) _s[2];
2445 _p[3] = (unsigned char) _s[3];
2446#if (SIZEOF_LONG == 8)
2447 _p[4] = (unsigned char) _s[4];
2448 _p[5] = (unsigned char) _s[5];
2449 _p[6] = (unsigned char) _s[6];
2450 _p[7] = (unsigned char) _s[7];
2451#endif
2452 _s += SIZEOF_LONG;
2453 _p += SIZEOF_LONG;
2454 }
2455 s = _s;
2456 p = _p;
2457 if (s == e)
2458 break;
2459 ch = (unsigned char)*s;
2460 }
2461 }
2462
2463 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002464 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 s++;
2466 continue;
2467 }
2468
2469 n = utf8_code_length[ch];
2470
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002471 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002472 if (consumed)
2473 break;
2474 else {
2475 errmsg = "unexpected end of data";
2476 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002477 endinpos = startinpos+1;
2478 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2479 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 goto utf8Error;
2481 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002483
2484 switch (n) {
2485
2486 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002487 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 startinpos = s-starts;
2489 endinpos = startinpos+1;
2490 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491
2492 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002493 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 startinpos = s-starts;
2495 endinpos = startinpos+1;
2496 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497
2498 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002499 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002500 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002501 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002502 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002503 goto utf8Error;
2504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002506 assert ((ch > 0x007F) && (ch <= 0x07FF));
2507 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 break;
2509
2510 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002511 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2512 will result in surrogates in range d800-dfff. Surrogates are
2513 not valid UTF-8 so they are rejected.
2514 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2515 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002516 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002517 (s[2] & 0xc0) != 0x80 ||
2518 ((unsigned char)s[0] == 0xE0 &&
2519 (unsigned char)s[1] < 0xA0) ||
2520 ((unsigned char)s[0] == 0xED &&
2521 (unsigned char)s[1] > 0x9F)) {
2522 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002523 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002524 endinpos = startinpos + 1;
2525
2526 /* if s[1] first two bits are 1 and 0, then the invalid
2527 continuation byte is s[2], so increment endinpos by 1,
2528 if not, s[1] is invalid and endinpos doesn't need to
2529 be incremented. */
2530 if ((s[1] & 0xC0) == 0x80)
2531 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002532 goto utf8Error;
2533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002535 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2536 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002537 break;
2538
2539 case 4:
2540 if ((s[1] & 0xc0) != 0x80 ||
2541 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002542 (s[3] & 0xc0) != 0x80 ||
2543 ((unsigned char)s[0] == 0xF0 &&
2544 (unsigned char)s[1] < 0x90) ||
2545 ((unsigned char)s[0] == 0xF4 &&
2546 (unsigned char)s[1] > 0x8F)) {
2547 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002548 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002549 endinpos = startinpos + 1;
2550 if ((s[1] & 0xC0) == 0x80) {
2551 endinpos++;
2552 if ((s[2] & 0xC0) == 0x80)
2553 endinpos++;
2554 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002555 goto utf8Error;
2556 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002557 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002558 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2559 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2560
Fredrik Lundh8f455852001-06-27 18:59:43 +00002561#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002562 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002563#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002564 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002565
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002566 /* translate from 10000..10FFFF to 0..FFFF */
2567 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002568
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002569 /* high surrogate = top 10 bits added to D800 */
2570 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002571
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002572 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002573 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002574#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576 }
2577 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002578 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002579
Benjamin Peterson29060642009-01-31 22:14:21 +00002580 utf8Error:
2581 outpos = p-PyUnicode_AS_UNICODE(unicode);
2582 if (unicode_decode_call_errorhandler(
2583 errors, &errorHandler,
2584 "utf8", errmsg,
2585 &starts, &e, &startinpos, &endinpos, &exc, &s,
2586 &unicode, &outpos, &p))
2587 goto onError;
2588 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 }
Walter Dörwald69652032004-09-07 20:24:22 +00002590 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002591 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592
2593 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002594 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 goto onError;
2596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002597 Py_XDECREF(errorHandler);
2598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 return (PyObject *)unicode;
2600
Benjamin Peterson29060642009-01-31 22:14:21 +00002601 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002602 Py_XDECREF(errorHandler);
2603 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 Py_DECREF(unicode);
2605 return NULL;
2606}
2607
Antoine Pitrouab868312009-01-10 15:40:25 +00002608#undef ASCII_CHAR_MASK
2609
2610
Tim Peters602f7402002-04-27 18:03:26 +00002611/* Allocation strategy: if the string is short, convert into a stack buffer
2612 and allocate exactly as much space needed at the end. Else allocate the
2613 maximum possible needed (4 result bytes per Unicode character), and return
2614 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002615*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002616PyObject *
2617PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002618 Py_ssize_t size,
2619 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620{
Tim Peters602f7402002-04-27 18:03:26 +00002621#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002622
Guido van Rossum98297ee2007-11-06 21:34:58 +00002623 Py_ssize_t i; /* index into s of next input byte */
2624 PyObject *result; /* result string object */
2625 char *p; /* next free byte in output buffer */
2626 Py_ssize_t nallocated; /* number of result bytes allocated */
2627 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002628 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002629 PyObject *errorHandler = NULL;
2630 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002631
Tim Peters602f7402002-04-27 18:03:26 +00002632 assert(s != NULL);
2633 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634
Tim Peters602f7402002-04-27 18:03:26 +00002635 if (size <= MAX_SHORT_UNICHARS) {
2636 /* Write into the stack buffer; nallocated can't overflow.
2637 * At the end, we'll allocate exactly as much heap space as it
2638 * turns out we need.
2639 */
2640 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002641 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002642 p = stackbuf;
2643 }
2644 else {
2645 /* Overallocate on the heap, and give the excess back at the end. */
2646 nallocated = size * 4;
2647 if (nallocated / 4 != size) /* overflow! */
2648 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002649 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002650 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002651 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002652 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002653 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002654
Tim Peters602f7402002-04-27 18:03:26 +00002655 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002656 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002657
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002658 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002659 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002661
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002663 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002664 *p++ = (char)(0xc0 | (ch >> 6));
2665 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002666 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002667#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002668 /* Special case: check for high and low surrogate */
2669 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2670 Py_UCS4 ch2 = s[i];
2671 /* Combine the two surrogates to form a UCS4 value */
2672 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2673 i++;
2674
2675 /* Encode UCS4 Unicode ordinals */
2676 *p++ = (char)(0xf0 | (ch >> 18));
2677 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002678 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2679 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002680 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002681#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002682 Py_ssize_t newpos;
2683 PyObject *rep;
2684 Py_ssize_t repsize, k;
2685 rep = unicode_encode_call_errorhandler
2686 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2687 s, size, &exc, i-1, i, &newpos);
2688 if (!rep)
2689 goto error;
2690
2691 if (PyBytes_Check(rep))
2692 repsize = PyBytes_GET_SIZE(rep);
2693 else
2694 repsize = PyUnicode_GET_SIZE(rep);
2695
2696 if (repsize > 4) {
2697 Py_ssize_t offset;
2698
2699 if (result == NULL)
2700 offset = p - stackbuf;
2701 else
2702 offset = p - PyBytes_AS_STRING(result);
2703
2704 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2705 /* integer overflow */
2706 PyErr_NoMemory();
2707 goto error;
2708 }
2709 nallocated += repsize - 4;
2710 if (result != NULL) {
2711 if (_PyBytes_Resize(&result, nallocated) < 0)
2712 goto error;
2713 } else {
2714 result = PyBytes_FromStringAndSize(NULL, nallocated);
2715 if (result == NULL)
2716 goto error;
2717 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2718 }
2719 p = PyBytes_AS_STRING(result) + offset;
2720 }
2721
2722 if (PyBytes_Check(rep)) {
2723 char *prep = PyBytes_AS_STRING(rep);
2724 for(k = repsize; k > 0; k--)
2725 *p++ = *prep++;
2726 } else /* rep is unicode */ {
2727 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2728 Py_UNICODE c;
2729
2730 for(k=0; k<repsize; k++) {
2731 c = prep[k];
2732 if (0x80 <= c) {
2733 raise_encode_exception(&exc, "utf-8", s, size,
2734 i-1, i, "surrogates not allowed");
2735 goto error;
2736 }
2737 *p++ = (char)prep[k];
2738 }
2739 }
2740 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002741#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002742 }
Victor Stinner445a6232010-04-22 20:01:57 +00002743#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002744 } else if (ch < 0x10000) {
2745 *p++ = (char)(0xe0 | (ch >> 12));
2746 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2747 *p++ = (char)(0x80 | (ch & 0x3f));
2748 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002749 /* Encode UCS4 Unicode ordinals */
2750 *p++ = (char)(0xf0 | (ch >> 18));
2751 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2752 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2753 *p++ = (char)(0x80 | (ch & 0x3f));
2754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002756
Guido van Rossum98297ee2007-11-06 21:34:58 +00002757 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002758 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002759 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002760 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002761 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002762 }
2763 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002764 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002765 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002766 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002767 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002768 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002769 Py_XDECREF(errorHandler);
2770 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002771 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002772 error:
2773 Py_XDECREF(errorHandler);
2774 Py_XDECREF(exc);
2775 Py_XDECREF(result);
2776 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002777
Tim Peters602f7402002-04-27 18:03:26 +00002778#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779}
2780
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2782{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 if (!PyUnicode_Check(unicode)) {
2784 PyErr_BadArgument();
2785 return NULL;
2786 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002787 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00002788 PyUnicode_GET_SIZE(unicode),
2789 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790}
2791
Walter Dörwald41980ca2007-08-16 21:55:45 +00002792/* --- UTF-32 Codec ------------------------------------------------------- */
2793
2794PyObject *
2795PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002796 Py_ssize_t size,
2797 const char *errors,
2798 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002799{
2800 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2801}
2802
2803PyObject *
2804PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 Py_ssize_t size,
2806 const char *errors,
2807 int *byteorder,
2808 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002809{
2810 const char *starts = s;
2811 Py_ssize_t startinpos;
2812 Py_ssize_t endinpos;
2813 Py_ssize_t outpos;
2814 PyUnicodeObject *unicode;
2815 Py_UNICODE *p;
2816#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002817 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00002818 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002819#else
2820 const int pairs = 0;
2821#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00002822 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002823 int bo = 0; /* assume native ordering by default */
2824 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00002825 /* Offsets from q for retrieving bytes in the right order. */
2826#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2827 int iorder[] = {0, 1, 2, 3};
2828#else
2829 int iorder[] = {3, 2, 1, 0};
2830#endif
2831 PyObject *errorHandler = NULL;
2832 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00002833
Walter Dörwald41980ca2007-08-16 21:55:45 +00002834 q = (unsigned char *)s;
2835 e = q + size;
2836
2837 if (byteorder)
2838 bo = *byteorder;
2839
2840 /* Check for BOM marks (U+FEFF) in the input and adjust current
2841 byte order setting accordingly. In native mode, the leading BOM
2842 mark is skipped, in all other modes, it is copied to the output
2843 stream as-is (giving a ZWNBSP character). */
2844 if (bo == 0) {
2845 if (size >= 4) {
2846 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002848#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00002849 if (bom == 0x0000FEFF) {
2850 q += 4;
2851 bo = -1;
2852 }
2853 else if (bom == 0xFFFE0000) {
2854 q += 4;
2855 bo = 1;
2856 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002857#else
Benjamin Peterson29060642009-01-31 22:14:21 +00002858 if (bom == 0x0000FEFF) {
2859 q += 4;
2860 bo = 1;
2861 }
2862 else if (bom == 0xFFFE0000) {
2863 q += 4;
2864 bo = -1;
2865 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002866#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002867 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002868 }
2869
2870 if (bo == -1) {
2871 /* force LE */
2872 iorder[0] = 0;
2873 iorder[1] = 1;
2874 iorder[2] = 2;
2875 iorder[3] = 3;
2876 }
2877 else if (bo == 1) {
2878 /* force BE */
2879 iorder[0] = 3;
2880 iorder[1] = 2;
2881 iorder[2] = 1;
2882 iorder[3] = 0;
2883 }
2884
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00002885 /* On narrow builds we split characters outside the BMP into two
2886 codepoints => count how much extra space we need. */
2887#ifndef Py_UNICODE_WIDE
2888 for (qq = q; qq < e; qq += 4)
2889 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2890 pairs++;
2891#endif
2892
2893 /* This might be one to much, because of a BOM */
2894 unicode = _PyUnicode_New((size+3)/4+pairs);
2895 if (!unicode)
2896 return NULL;
2897 if (size == 0)
2898 return (PyObject *)unicode;
2899
2900 /* Unpack UTF-32 encoded data */
2901 p = unicode->str;
2902
Walter Dörwald41980ca2007-08-16 21:55:45 +00002903 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002904 Py_UCS4 ch;
2905 /* remaining bytes at the end? (size should be divisible by 4) */
2906 if (e-q<4) {
2907 if (consumed)
2908 break;
2909 errmsg = "truncated data";
2910 startinpos = ((const char *)q)-starts;
2911 endinpos = ((const char *)e)-starts;
2912 goto utf32Error;
2913 /* The remaining input chars are ignored if the callback
2914 chooses to skip the input */
2915 }
2916 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2917 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00002918
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 if (ch >= 0x110000)
2920 {
2921 errmsg = "codepoint not in range(0x110000)";
2922 startinpos = ((const char *)q)-starts;
2923 endinpos = startinpos+4;
2924 goto utf32Error;
2925 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00002926#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002927 if (ch >= 0x10000)
2928 {
2929 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2930 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2931 }
2932 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00002933#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 *p++ = ch;
2935 q += 4;
2936 continue;
2937 utf32Error:
2938 outpos = p-PyUnicode_AS_UNICODE(unicode);
2939 if (unicode_decode_call_errorhandler(
2940 errors, &errorHandler,
2941 "utf32", errmsg,
2942 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2943 &unicode, &outpos, &p))
2944 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002945 }
2946
2947 if (byteorder)
2948 *byteorder = bo;
2949
2950 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002952
2953 /* Adjust length */
2954 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2955 goto onError;
2956
2957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
2959 return (PyObject *)unicode;
2960
Benjamin Peterson29060642009-01-31 22:14:21 +00002961 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00002962 Py_DECREF(unicode);
2963 Py_XDECREF(errorHandler);
2964 Py_XDECREF(exc);
2965 return NULL;
2966}
2967
2968PyObject *
2969PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002970 Py_ssize_t size,
2971 const char *errors,
2972 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00002973{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002974 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002975 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002976 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002977#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002978 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00002979#else
2980 const int pairs = 0;
2981#endif
2982 /* Offsets from p for storing byte pairs in the right order. */
2983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2984 int iorder[] = {0, 1, 2, 3};
2985#else
2986 int iorder[] = {3, 2, 1, 0};
2987#endif
2988
Benjamin Peterson29060642009-01-31 22:14:21 +00002989#define STORECHAR(CH) \
2990 do { \
2991 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2992 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2993 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2994 p[iorder[0]] = (CH) & 0xff; \
2995 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00002996 } while(0)
2997
2998 /* In narrow builds we can output surrogate pairs as one codepoint,
2999 so we need less space. */
3000#ifndef Py_UNICODE_WIDE
3001 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003002 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3003 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3004 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003005#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003006 nsize = (size - pairs + (byteorder == 0));
3007 bytesize = nsize * 4;
3008 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003009 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003010 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003011 if (v == NULL)
3012 return NULL;
3013
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003014 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003015 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003017 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003018 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003019
3020 if (byteorder == -1) {
3021 /* force LE */
3022 iorder[0] = 0;
3023 iorder[1] = 1;
3024 iorder[2] = 2;
3025 iorder[3] = 3;
3026 }
3027 else if (byteorder == 1) {
3028 /* force BE */
3029 iorder[0] = 3;
3030 iorder[1] = 2;
3031 iorder[2] = 1;
3032 iorder[3] = 0;
3033 }
3034
3035 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003037#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3039 Py_UCS4 ch2 = *s;
3040 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3041 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3042 s++;
3043 size--;
3044 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003045 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003046#endif
3047 STORECHAR(ch);
3048 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003049
3050 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003051 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003052#undef STORECHAR
3053}
3054
3055PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3056{
3057 if (!PyUnicode_Check(unicode)) {
3058 PyErr_BadArgument();
3059 return NULL;
3060 }
3061 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003062 PyUnicode_GET_SIZE(unicode),
3063 NULL,
3064 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003065}
3066
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067/* --- UTF-16 Codec ------------------------------------------------------- */
3068
Tim Peters772747b2001-08-09 22:21:55 +00003069PyObject *
3070PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 Py_ssize_t size,
3072 const char *errors,
3073 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074{
Walter Dörwald69652032004-09-07 20:24:22 +00003075 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3076}
3077
Antoine Pitrouab868312009-01-10 15:40:25 +00003078/* Two masks for fast checking of whether a C 'long' may contain
3079 UTF16-encoded surrogate characters. This is an efficient heuristic,
3080 assuming that non-surrogate characters with a code point >= 0x8000 are
3081 rare in most input.
3082 FAST_CHAR_MASK is used when the input is in native byte ordering,
3083 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003084*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003085#if (SIZEOF_LONG == 8)
3086# define FAST_CHAR_MASK 0x8000800080008000L
3087# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3088#elif (SIZEOF_LONG == 4)
3089# define FAST_CHAR_MASK 0x80008000L
3090# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3091#else
3092# error C 'long' size should be either 4 or 8!
3093#endif
3094
Walter Dörwald69652032004-09-07 20:24:22 +00003095PyObject *
3096PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003097 Py_ssize_t size,
3098 const char *errors,
3099 int *byteorder,
3100 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003101{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003102 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003103 Py_ssize_t startinpos;
3104 Py_ssize_t endinpos;
3105 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106 PyUnicodeObject *unicode;
3107 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003108 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003109 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003110 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003111 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003112 /* Offsets from q for retrieving byte pairs in the right order. */
3113#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3114 int ihi = 1, ilo = 0;
3115#else
3116 int ihi = 0, ilo = 1;
3117#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003118 PyObject *errorHandler = NULL;
3119 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120
3121 /* Note: size will always be longer than the resulting Unicode
3122 character count */
3123 unicode = _PyUnicode_New(size);
3124 if (!unicode)
3125 return NULL;
3126 if (size == 0)
3127 return (PyObject *)unicode;
3128
3129 /* Unpack UTF-16 encoded data */
3130 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003131 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003132 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133
3134 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003135 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003137 /* Check for BOM marks (U+FEFF) in the input and adjust current
3138 byte order setting accordingly. In native mode, the leading BOM
3139 mark is skipped, in all other modes, it is copied to the output
3140 stream as-is (giving a ZWNBSP character). */
3141 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003142 if (size >= 2) {
3143 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003144#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 if (bom == 0xFEFF) {
3146 q += 2;
3147 bo = -1;
3148 }
3149 else if (bom == 0xFFFE) {
3150 q += 2;
3151 bo = 1;
3152 }
Tim Petersced69f82003-09-16 20:30:58 +00003153#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003154 if (bom == 0xFEFF) {
3155 q += 2;
3156 bo = 1;
3157 }
3158 else if (bom == 0xFFFE) {
3159 q += 2;
3160 bo = -1;
3161 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003162#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003163 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165
Tim Peters772747b2001-08-09 22:21:55 +00003166 if (bo == -1) {
3167 /* force LE */
3168 ihi = 1;
3169 ilo = 0;
3170 }
3171 else if (bo == 1) {
3172 /* force BE */
3173 ihi = 0;
3174 ilo = 1;
3175 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003176#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3177 native_ordering = ilo < ihi;
3178#else
3179 native_ordering = ilo > ihi;
3180#endif
Tim Peters772747b2001-08-09 22:21:55 +00003181
Antoine Pitrouab868312009-01-10 15:40:25 +00003182 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003183 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003184 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003185 /* First check for possible aligned read of a C 'long'. Unaligned
3186 reads are more expensive, better to defer to another iteration. */
3187 if (!((size_t) q & LONG_PTR_MASK)) {
3188 /* Fast path for runs of non-surrogate chars. */
3189 register const unsigned char *_q = q;
3190 Py_UNICODE *_p = p;
3191 if (native_ordering) {
3192 /* Native ordering is simple: as long as the input cannot
3193 possibly contain a surrogate char, do an unrolled copy
3194 of several 16-bit code points to the target object.
3195 The non-surrogate check is done on several input bytes
3196 at a time (as many as a C 'long' can contain). */
3197 while (_q < aligned_end) {
3198 unsigned long data = * (unsigned long *) _q;
3199 if (data & FAST_CHAR_MASK)
3200 break;
3201 _p[0] = ((unsigned short *) _q)[0];
3202 _p[1] = ((unsigned short *) _q)[1];
3203#if (SIZEOF_LONG == 8)
3204 _p[2] = ((unsigned short *) _q)[2];
3205 _p[3] = ((unsigned short *) _q)[3];
3206#endif
3207 _q += SIZEOF_LONG;
3208 _p += SIZEOF_LONG / 2;
3209 }
3210 }
3211 else {
3212 /* Byteswapped ordering is similar, but we must decompose
3213 the copy bytewise, and take care of zero'ing out the
3214 upper bytes if the target object is in 32-bit units
3215 (that is, in UCS-4 builds). */
3216 while (_q < aligned_end) {
3217 unsigned long data = * (unsigned long *) _q;
3218 if (data & SWAPPED_FAST_CHAR_MASK)
3219 break;
3220 /* Zero upper bytes in UCS-4 builds */
3221#if (Py_UNICODE_SIZE > 2)
3222 _p[0] = 0;
3223 _p[1] = 0;
3224#if (SIZEOF_LONG == 8)
3225 _p[2] = 0;
3226 _p[3] = 0;
3227#endif
3228#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003229 /* Issue #4916; UCS-4 builds on big endian machines must
3230 fill the two last bytes of each 4-byte unit. */
3231#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3232# define OFF 2
3233#else
3234# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003235#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003236 ((unsigned char *) _p)[OFF + 1] = _q[0];
3237 ((unsigned char *) _p)[OFF + 0] = _q[1];
3238 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3239 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3240#if (SIZEOF_LONG == 8)
3241 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3242 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3243 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3244 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3245#endif
3246#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003247 _q += SIZEOF_LONG;
3248 _p += SIZEOF_LONG / 2;
3249 }
3250 }
3251 p = _p;
3252 q = _q;
3253 if (q >= e)
3254 break;
3255 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257
Benjamin Peterson14339b62009-01-31 16:36:08 +00003258 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003259
3260 if (ch < 0xD800 || ch > 0xDFFF) {
3261 *p++ = ch;
3262 continue;
3263 }
3264
3265 /* UTF-16 code pair: */
3266 if (q > e) {
3267 errmsg = "unexpected end of data";
3268 startinpos = (((const char *)q) - 2) - starts;
3269 endinpos = ((const char *)e) + 1 - starts;
3270 goto utf16Error;
3271 }
3272 if (0xD800 <= ch && ch <= 0xDBFF) {
3273 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3274 q += 2;
3275 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003276#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 *p++ = ch;
3278 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003279#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003281#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003282 continue;
3283 }
3284 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003285 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003286 startinpos = (((const char *)q)-4)-starts;
3287 endinpos = startinpos+2;
3288 goto utf16Error;
3289 }
3290
Benjamin Peterson14339b62009-01-31 16:36:08 +00003291 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 errmsg = "illegal encoding";
3293 startinpos = (((const char *)q)-2)-starts;
3294 endinpos = startinpos+2;
3295 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003296
Benjamin Peterson29060642009-01-31 22:14:21 +00003297 utf16Error:
3298 outpos = p - PyUnicode_AS_UNICODE(unicode);
3299 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003300 errors,
3301 &errorHandler,
3302 "utf16", errmsg,
3303 &starts,
3304 (const char **)&e,
3305 &startinpos,
3306 &endinpos,
3307 &exc,
3308 (const char **)&q,
3309 &unicode,
3310 &outpos,
3311 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003314 /* remaining byte at the end? (size should be even) */
3315 if (e == q) {
3316 if (!consumed) {
3317 errmsg = "truncated data";
3318 startinpos = ((const char *)q) - starts;
3319 endinpos = ((const char *)e) + 1 - starts;
3320 outpos = p - PyUnicode_AS_UNICODE(unicode);
3321 if (unicode_decode_call_errorhandler(
3322 errors,
3323 &errorHandler,
3324 "utf16", errmsg,
3325 &starts,
3326 (const char **)&e,
3327 &startinpos,
3328 &endinpos,
3329 &exc,
3330 (const char **)&q,
3331 &unicode,
3332 &outpos,
3333 &p))
3334 goto onError;
3335 /* The remaining input chars are ignored if the callback
3336 chooses to skip the input */
3337 }
3338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339
3340 if (byteorder)
3341 *byteorder = bo;
3342
Walter Dörwald69652032004-09-07 20:24:22 +00003343 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003344 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003345
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003347 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 goto onError;
3349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003350 Py_XDECREF(errorHandler);
3351 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 return (PyObject *)unicode;
3353
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003356 Py_XDECREF(errorHandler);
3357 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 return NULL;
3359}
3360
Antoine Pitrouab868312009-01-10 15:40:25 +00003361#undef FAST_CHAR_MASK
3362#undef SWAPPED_FAST_CHAR_MASK
3363
Tim Peters772747b2001-08-09 22:21:55 +00003364PyObject *
3365PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003366 Py_ssize_t size,
3367 const char *errors,
3368 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003370 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003371 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003372 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003373#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003374 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003375#else
3376 const int pairs = 0;
3377#endif
Tim Peters772747b2001-08-09 22:21:55 +00003378 /* Offsets from p for storing byte pairs in the right order. */
3379#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3380 int ihi = 1, ilo = 0;
3381#else
3382 int ihi = 0, ilo = 1;
3383#endif
3384
Benjamin Peterson29060642009-01-31 22:14:21 +00003385#define STORECHAR(CH) \
3386 do { \
3387 p[ihi] = ((CH) >> 8) & 0xff; \
3388 p[ilo] = (CH) & 0xff; \
3389 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003390 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003392#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003393 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 if (s[i] >= 0x10000)
3395 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003396#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003397 /* 2 * (size + pairs + (byteorder == 0)) */
3398 if (size > PY_SSIZE_T_MAX ||
3399 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003400 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003401 nsize = size + pairs + (byteorder == 0);
3402 bytesize = nsize * 2;
3403 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003405 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406 if (v == NULL)
3407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003409 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003412 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003413 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003414
3415 if (byteorder == -1) {
3416 /* force LE */
3417 ihi = 1;
3418 ilo = 0;
3419 }
3420 else if (byteorder == 1) {
3421 /* force BE */
3422 ihi = 0;
3423 ilo = 1;
3424 }
3425
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003426 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 Py_UNICODE ch = *s++;
3428 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003429#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003430 if (ch >= 0x10000) {
3431 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3432 ch = 0xD800 | ((ch-0x10000) >> 10);
3433 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003434#endif
Tim Peters772747b2001-08-09 22:21:55 +00003435 STORECHAR(ch);
3436 if (ch2)
3437 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003438 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003439
3440 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003441 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003442#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443}
3444
3445PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3446{
3447 if (!PyUnicode_Check(unicode)) {
3448 PyErr_BadArgument();
3449 return NULL;
3450 }
3451 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 PyUnicode_GET_SIZE(unicode),
3453 NULL,
3454 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455}
3456
3457/* --- Unicode Escape Codec ----------------------------------------------- */
3458
Fredrik Lundh06d12682001-01-24 07:59:11 +00003459static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003460
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003462 Py_ssize_t size,
3463 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003465 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003466 Py_ssize_t startinpos;
3467 Py_ssize_t endinpos;
3468 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003473 char* message;
3474 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 PyObject *errorHandler = NULL;
3476 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003477
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 /* Escaped strings will always be longer than the resulting
3479 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 length after conversion to the true value.
3481 (but if the error callback returns a long replacement string
3482 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 v = _PyUnicode_New(size);
3484 if (v == NULL)
3485 goto onError;
3486 if (size == 0)
3487 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003488
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003491
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 while (s < end) {
3493 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003494 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496
3497 /* Non-escape characters are interpreted as Unicode ordinals */
3498 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003499 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 continue;
3501 }
3502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 /* \ - Escapes */
3505 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003506 c = *s++;
3507 if (s > end)
3508 c = '\0'; /* Invalid after \ */
3509 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510
Benjamin Peterson29060642009-01-31 22:14:21 +00003511 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512 case '\n': break;
3513 case '\\': *p++ = '\\'; break;
3514 case '\'': *p++ = '\''; break;
3515 case '\"': *p++ = '\"'; break;
3516 case 'b': *p++ = '\b'; break;
3517 case 'f': *p++ = '\014'; break; /* FF */
3518 case 't': *p++ = '\t'; break;
3519 case 'n': *p++ = '\n'; break;
3520 case 'r': *p++ = '\r'; break;
3521 case 'v': *p++ = '\013'; break; /* VT */
3522 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3523
Benjamin Peterson29060642009-01-31 22:14:21 +00003524 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 case '0': case '1': case '2': case '3':
3526 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003527 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003528 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003529 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003530 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003531 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003533 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 break;
3535
Benjamin Peterson29060642009-01-31 22:14:21 +00003536 /* hex escapes */
3537 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003539 digits = 2;
3540 message = "truncated \\xXX escape";
3541 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542
Benjamin Peterson29060642009-01-31 22:14:21 +00003543 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003545 digits = 4;
3546 message = "truncated \\uXXXX escape";
3547 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548
Benjamin Peterson29060642009-01-31 22:14:21 +00003549 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003550 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003551 digits = 8;
3552 message = "truncated \\UXXXXXXXX escape";
3553 hexescape:
3554 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 outpos = p-PyUnicode_AS_UNICODE(v);
3556 if (s+digits>end) {
3557 endinpos = size;
3558 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003559 errors, &errorHandler,
3560 "unicodeescape", "end of string in escape sequence",
3561 &starts, &end, &startinpos, &endinpos, &exc, &s,
3562 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 goto onError;
3564 goto nextByte;
3565 }
3566 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003567 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003568 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003569 endinpos = (s+i+1)-starts;
3570 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 errors, &errorHandler,
3572 "unicodeescape", message,
3573 &starts, &end, &startinpos, &endinpos, &exc, &s,
3574 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003575 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003577 }
3578 chr = (chr<<4) & ~0xF;
3579 if (c >= '0' && c <= '9')
3580 chr += c - '0';
3581 else if (c >= 'a' && c <= 'f')
3582 chr += 10 + c - 'a';
3583 else
3584 chr += 10 + c - 'A';
3585 }
3586 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003587 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 /* _decoding_error will have already written into the
3589 target buffer. */
3590 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003591 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003592 /* when we get here, chr is a 32-bit unicode character */
3593 if (chr <= 0xffff)
3594 /* UCS-2 character */
3595 *p++ = (Py_UNICODE) chr;
3596 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003597 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003598 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003599#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003600 *p++ = chr;
3601#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003602 chr -= 0x10000L;
3603 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003604 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003605#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003606 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 endinpos = s-starts;
3608 outpos = p-PyUnicode_AS_UNICODE(v);
3609 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 errors, &errorHandler,
3611 "unicodeescape", "illegal Unicode character",
3612 &starts, &end, &startinpos, &endinpos, &exc, &s,
3613 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003614 goto onError;
3615 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003616 break;
3617
Benjamin Peterson29060642009-01-31 22:14:21 +00003618 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003619 case 'N':
3620 message = "malformed \\N character escape";
3621 if (ucnhash_CAPI == NULL) {
3622 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003623 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003624 if (ucnhash_CAPI == NULL)
3625 goto ucnhashError;
3626 }
3627 if (*s == '{') {
3628 const char *start = s+1;
3629 /* look for the closing brace */
3630 while (*s != '}' && s < end)
3631 s++;
3632 if (s > start && s < end && *s == '}') {
3633 /* found a name. look it up in the unicode database */
3634 message = "unknown Unicode character name";
3635 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003636 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003637 goto store;
3638 }
3639 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 endinpos = s-starts;
3641 outpos = p-PyUnicode_AS_UNICODE(v);
3642 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003643 errors, &errorHandler,
3644 "unicodeescape", message,
3645 &starts, &end, &startinpos, &endinpos, &exc, &s,
3646 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003647 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003648 break;
3649
3650 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003651 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652 message = "\\ at end of string";
3653 s--;
3654 endinpos = s-starts;
3655 outpos = p-PyUnicode_AS_UNICODE(v);
3656 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 errors, &errorHandler,
3658 "unicodeescape", message,
3659 &starts, &end, &startinpos, &endinpos, &exc, &s,
3660 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003661 goto onError;
3662 }
3663 else {
3664 *p++ = '\\';
3665 *p++ = (unsigned char)s[-1];
3666 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003667 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003672 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003674 Py_XDECREF(errorHandler);
3675 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003677
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003679 PyErr_SetString(
3680 PyExc_UnicodeError,
3681 "\\N escapes not supported (can't load unicodedata module)"
3682 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003683 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 Py_XDECREF(errorHandler);
3685 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003686 return NULL;
3687
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 Py_XDECREF(errorHandler);
3691 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 return NULL;
3693}
3694
3695/* Return a Unicode-Escape string version of the Unicode object.
3696
3697 If quotes is true, the string is enclosed in u"" or u'' quotes as
3698 appropriate.
3699
3700*/
3701
Thomas Wouters477c8d52006-05-27 19:21:47 +00003702Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 Py_ssize_t size,
3704 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003705{
3706 /* like wcschr, but doesn't stop at NULL characters */
3707
3708 while (size-- > 0) {
3709 if (*s == ch)
3710 return s;
3711 s++;
3712 }
3713
3714 return NULL;
3715}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003716
Walter Dörwald79e913e2007-05-12 11:08:06 +00003717static const char *hexdigits = "0123456789abcdef";
3718
3719PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003720 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003722 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003725#ifdef Py_UNICODE_WIDE
3726 const Py_ssize_t expandsize = 10;
3727#else
3728 const Py_ssize_t expandsize = 6;
3729#endif
3730
Thomas Wouters89f507f2006-12-13 04:49:30 +00003731 /* XXX(nnorwitz): rather than over-allocating, it would be
3732 better to choose a different scheme. Perhaps scan the
3733 first N-chars of the string and allocate based on that size.
3734 */
3735 /* Initial allocation is based on the longest-possible unichr
3736 escape.
3737
3738 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3739 unichr, so in this case it's the longest unichr escape. In
3740 narrow (UTF-16) builds this is five chars per source unichr
3741 since there are two unichrs in the surrogate pair, so in narrow
3742 (UTF-16) builds it's not the longest unichr escape.
3743
3744 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3745 so in the narrow (UTF-16) build case it's the longest unichr
3746 escape.
3747 */
3748
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003749 if (size == 0)
3750 return PyBytes_FromStringAndSize(NULL, 0);
3751
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003752 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003753 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003754
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003755 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 2
3757 + expandsize*size
3758 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 if (repr == NULL)
3760 return NULL;
3761
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003762 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 while (size-- > 0) {
3765 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003766
Walter Dörwald79e913e2007-05-12 11:08:06 +00003767 /* Escape backslashes */
3768 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 *p++ = '\\';
3770 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003771 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003772 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003773
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003774#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003775 /* Map 21-bit characters to '\U00xxxxxx' */
3776 else if (ch >= 0x10000) {
3777 *p++ = '\\';
3778 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003779 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3780 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3781 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3782 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3783 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3784 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3785 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3786 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003788 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003789#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3791 else if (ch >= 0xD800 && ch < 0xDC00) {
3792 Py_UNICODE ch2;
3793 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003794
Benjamin Peterson29060642009-01-31 22:14:21 +00003795 ch2 = *s++;
3796 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00003797 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003798 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3799 *p++ = '\\';
3800 *p++ = 'U';
3801 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3802 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3803 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3804 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3805 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3806 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3807 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3808 *p++ = hexdigits[ucs & 0x0000000F];
3809 continue;
3810 }
3811 /* Fall through: isolated surrogates are copied as-is */
3812 s--;
3813 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003814 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00003815#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003816
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003818 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 *p++ = '\\';
3820 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003821 *p++ = hexdigits[(ch >> 12) & 0x000F];
3822 *p++ = hexdigits[(ch >> 8) & 0x000F];
3823 *p++ = hexdigits[(ch >> 4) & 0x000F];
3824 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003826
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003827 /* Map special whitespace to '\t', \n', '\r' */
3828 else if (ch == '\t') {
3829 *p++ = '\\';
3830 *p++ = 't';
3831 }
3832 else if (ch == '\n') {
3833 *p++ = '\\';
3834 *p++ = 'n';
3835 }
3836 else if (ch == '\r') {
3837 *p++ = '\\';
3838 *p++ = 'r';
3839 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003840
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003841 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003842 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003844 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00003845 *p++ = hexdigits[(ch >> 4) & 0x000F];
3846 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003847 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003848
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 /* Copy everything else as-is */
3850 else
3851 *p++ = (char) ch;
3852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003854 assert(p - PyBytes_AS_STRING(repr) > 0);
3855 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3856 return NULL;
3857 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858}
3859
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00003860PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003862 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 if (!PyUnicode_Check(unicode)) {
3864 PyErr_BadArgument();
3865 return NULL;
3866 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00003867 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3868 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00003869 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870}
3871
3872/* --- Raw Unicode Escape Codec ------------------------------------------- */
3873
3874PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003875 Py_ssize_t size,
3876 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003878 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003879 Py_ssize_t startinpos;
3880 Py_ssize_t endinpos;
3881 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 const char *end;
3885 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 PyObject *errorHandler = NULL;
3887 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003888
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 /* Escaped strings will always be longer than the resulting
3890 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003891 length after conversion to the true value. (But decoding error
3892 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893 v = _PyUnicode_New(size);
3894 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003895 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003897 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003899 end = s + size;
3900 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 unsigned char c;
3902 Py_UCS4 x;
3903 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003904 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905
Benjamin Peterson29060642009-01-31 22:14:21 +00003906 /* Non-escape characters are interpreted as Unicode ordinals */
3907 if (*s != '\\') {
3908 *p++ = (unsigned char)*s++;
3909 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003910 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 startinpos = s-starts;
3912
3913 /* \u-escapes are only interpreted iff the number of leading
3914 backslashes if odd */
3915 bs = s;
3916 for (;s < end;) {
3917 if (*s != '\\')
3918 break;
3919 *p++ = (unsigned char)*s++;
3920 }
3921 if (((s - bs) & 1) == 0 ||
3922 s >= end ||
3923 (*s != 'u' && *s != 'U')) {
3924 continue;
3925 }
3926 p--;
3927 count = *s=='u' ? 4 : 8;
3928 s++;
3929
3930 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3931 outpos = p-PyUnicode_AS_UNICODE(v);
3932 for (x = 0, i = 0; i < count; ++i, ++s) {
3933 c = (unsigned char)*s;
3934 if (!ISXDIGIT(c)) {
3935 endinpos = s-starts;
3936 if (unicode_decode_call_errorhandler(
3937 errors, &errorHandler,
3938 "rawunicodeescape", "truncated \\uXXXX",
3939 &starts, &end, &startinpos, &endinpos, &exc, &s,
3940 &v, &outpos, &p))
3941 goto onError;
3942 goto nextByte;
3943 }
3944 x = (x<<4) & ~0xF;
3945 if (c >= '0' && c <= '9')
3946 x += c - '0';
3947 else if (c >= 'a' && c <= 'f')
3948 x += 10 + c - 'a';
3949 else
3950 x += 10 + c - 'A';
3951 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00003952 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00003953 /* UCS-2 character */
3954 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003955 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003956 /* UCS-4 character. Either store directly, or as
3957 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00003958#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003959 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00003960#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003961 x -= 0x10000L;
3962 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3963 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00003964#endif
3965 } else {
3966 endinpos = s-starts;
3967 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003968 if (unicode_decode_call_errorhandler(
3969 errors, &errorHandler,
3970 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 &starts, &end, &startinpos, &endinpos, &exc, &s,
3972 &v, &outpos, &p))
3973 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003974 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 nextByte:
3976 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003978 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980 Py_XDECREF(errorHandler);
3981 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003983
Benjamin Peterson29060642009-01-31 22:14:21 +00003984 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003986 Py_XDECREF(errorHandler);
3987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 return NULL;
3989}
3990
3991PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003994 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 char *p;
3996 char *q;
3997
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003998#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003999 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004000#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004001 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004002#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004003
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004004 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004005 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004006
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004007 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 if (repr == NULL)
4009 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004010 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004011 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004013 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 while (size-- > 0) {
4015 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004016#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004017 /* Map 32-bit characters to '\Uxxxxxxxx' */
4018 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004019 *p++ = '\\';
4020 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004021 *p++ = hexdigits[(ch >> 28) & 0xf];
4022 *p++ = hexdigits[(ch >> 24) & 0xf];
4023 *p++ = hexdigits[(ch >> 20) & 0xf];
4024 *p++ = hexdigits[(ch >> 16) & 0xf];
4025 *p++ = hexdigits[(ch >> 12) & 0xf];
4026 *p++ = hexdigits[(ch >> 8) & 0xf];
4027 *p++ = hexdigits[(ch >> 4) & 0xf];
4028 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004029 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004030 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004031#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004032 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4033 if (ch >= 0xD800 && ch < 0xDC00) {
4034 Py_UNICODE ch2;
4035 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004036
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 ch2 = *s++;
4038 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004039 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4041 *p++ = '\\';
4042 *p++ = 'U';
4043 *p++ = hexdigits[(ucs >> 28) & 0xf];
4044 *p++ = hexdigits[(ucs >> 24) & 0xf];
4045 *p++ = hexdigits[(ucs >> 20) & 0xf];
4046 *p++ = hexdigits[(ucs >> 16) & 0xf];
4047 *p++ = hexdigits[(ucs >> 12) & 0xf];
4048 *p++ = hexdigits[(ucs >> 8) & 0xf];
4049 *p++ = hexdigits[(ucs >> 4) & 0xf];
4050 *p++ = hexdigits[ucs & 0xf];
4051 continue;
4052 }
4053 /* Fall through: isolated surrogates are copied as-is */
4054 s--;
4055 size++;
4056 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004057#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004058 /* Map 16-bit characters to '\uxxxx' */
4059 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 *p++ = '\\';
4061 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004062 *p++ = hexdigits[(ch >> 12) & 0xf];
4063 *p++ = hexdigits[(ch >> 8) & 0xf];
4064 *p++ = hexdigits[(ch >> 4) & 0xf];
4065 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 /* Copy everything else as-is */
4068 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 *p++ = (char) ch;
4070 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004071 size = p - q;
4072
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004073 assert(size > 0);
4074 if (_PyBytes_Resize(&repr, size) < 0)
4075 return NULL;
4076 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077}
4078
4079PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4080{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004081 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004083 PyErr_BadArgument();
4084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004086 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4087 PyUnicode_GET_SIZE(unicode));
4088
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004089 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090}
4091
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004092/* --- Unicode Internal Codec ------------------------------------------- */
4093
4094PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004095 Py_ssize_t size,
4096 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004097{
4098 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004099 Py_ssize_t startinpos;
4100 Py_ssize_t endinpos;
4101 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004102 PyUnicodeObject *v;
4103 Py_UNICODE *p;
4104 const char *end;
4105 const char *reason;
4106 PyObject *errorHandler = NULL;
4107 PyObject *exc = NULL;
4108
Neal Norwitzd43069c2006-01-08 01:12:10 +00004109#ifdef Py_UNICODE_WIDE
4110 Py_UNICODE unimax = PyUnicode_GetMax();
4111#endif
4112
Thomas Wouters89f507f2006-12-13 04:49:30 +00004113 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004114 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4115 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004117 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004119 p = PyUnicode_AS_UNICODE(v);
4120 end = s + size;
4121
4122 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004123 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004124 /* We have to sanity check the raw data, otherwise doom looms for
4125 some malformed UCS-4 data. */
4126 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004127#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004128 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004129#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004130 end-s < Py_UNICODE_SIZE
4131 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004133 startinpos = s - starts;
4134 if (end-s < Py_UNICODE_SIZE) {
4135 endinpos = end-starts;
4136 reason = "truncated input";
4137 }
4138 else {
4139 endinpos = s - starts + Py_UNICODE_SIZE;
4140 reason = "illegal code point (> 0x10FFFF)";
4141 }
4142 outpos = p - PyUnicode_AS_UNICODE(v);
4143 if (unicode_decode_call_errorhandler(
4144 errors, &errorHandler,
4145 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004146 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004147 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004148 goto onError;
4149 }
4150 }
4151 else {
4152 p++;
4153 s += Py_UNICODE_SIZE;
4154 }
4155 }
4156
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004157 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004158 goto onError;
4159 Py_XDECREF(errorHandler);
4160 Py_XDECREF(exc);
4161 return (PyObject *)v;
4162
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004164 Py_XDECREF(v);
4165 Py_XDECREF(errorHandler);
4166 Py_XDECREF(exc);
4167 return NULL;
4168}
4169
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170/* --- Latin-1 Codec ------------------------------------------------------ */
4171
4172PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 Py_ssize_t size,
4174 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175{
4176 PyUnicodeObject *v;
4177 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004178 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004179
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004181 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 Py_UNICODE r = *(unsigned char*)s;
4183 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004184 }
4185
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186 v = _PyUnicode_New(size);
4187 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004192 e = s + size;
4193 /* Unrolling the copy makes it much faster by reducing the looping
4194 overhead. This is similar to what many memcpy() implementations do. */
4195 unrolled_end = e - 4;
4196 while (s < unrolled_end) {
4197 p[0] = (unsigned char) s[0];
4198 p[1] = (unsigned char) s[1];
4199 p[2] = (unsigned char) s[2];
4200 p[3] = (unsigned char) s[3];
4201 s += 4;
4202 p += 4;
4203 }
4204 while (s < e)
4205 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004207
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 Py_XDECREF(v);
4210 return NULL;
4211}
4212
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213/* create or adjust a UnicodeEncodeError */
4214static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004215 const char *encoding,
4216 const Py_UNICODE *unicode, Py_ssize_t size,
4217 Py_ssize_t startpos, Py_ssize_t endpos,
4218 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 *exceptionObject = PyUnicodeEncodeError_Create(
4222 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 }
4224 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4226 goto onError;
4227 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4228 goto onError;
4229 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4230 goto onError;
4231 return;
4232 onError:
4233 Py_DECREF(*exceptionObject);
4234 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 }
4236}
4237
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238/* raises a UnicodeEncodeError */
4239static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 const char *encoding,
4241 const Py_UNICODE *unicode, Py_ssize_t size,
4242 Py_ssize_t startpos, Py_ssize_t endpos,
4243 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244{
4245 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004248 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004249}
4250
4251/* error handling callback helper:
4252 build arguments, call the callback and check the arguments,
4253 put the result into newpos and return the replacement string, which
4254 has to be freed by the caller */
4255static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 PyObject **errorHandler,
4257 const char *encoding, const char *reason,
4258 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4259 Py_ssize_t startpos, Py_ssize_t endpos,
4260 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004262 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263
4264 PyObject *restuple;
4265 PyObject *resunicode;
4266
4267 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004270 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 }
4272
4273 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277
4278 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004279 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004283 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 Py_DECREF(restuple);
4285 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004287 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 &resunicode, newpos)) {
4289 Py_DECREF(restuple);
4290 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004292 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4293 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4294 Py_DECREF(restuple);
4295 return NULL;
4296 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004298 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004299 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4301 Py_DECREF(restuple);
4302 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004303 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 Py_INCREF(resunicode);
4305 Py_DECREF(restuple);
4306 return resunicode;
4307}
4308
4309static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004310 Py_ssize_t size,
4311 const char *errors,
4312 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004313{
4314 /* output object */
4315 PyObject *res;
4316 /* pointers to the beginning and end+1 of input */
4317 const Py_UNICODE *startp = p;
4318 const Py_UNICODE *endp = p + size;
4319 /* pointer to the beginning of the unencodable characters */
4320 /* const Py_UNICODE *badp = NULL; */
4321 /* pointer into the output */
4322 char *str;
4323 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004324 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004325 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4326 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 PyObject *errorHandler = NULL;
4328 PyObject *exc = NULL;
4329 /* the following variable is used for caching string comparisons
4330 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4331 int known_errorHandler = -1;
4332
4333 /* allocate enough for a simple encoding without
4334 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004335 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004336 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004337 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004339 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004340 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 ressize = size;
4342
4343 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345
Benjamin Peterson29060642009-01-31 22:14:21 +00004346 /* can we encode this? */
4347 if (c<limit) {
4348 /* no overflow check, because we know that the space is enough */
4349 *str++ = (char)c;
4350 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004351 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 else {
4353 Py_ssize_t unicodepos = p-startp;
4354 Py_ssize_t requiredsize;
4355 PyObject *repunicode;
4356 Py_ssize_t repsize;
4357 Py_ssize_t newpos;
4358 Py_ssize_t respos;
4359 Py_UNICODE *uni2;
4360 /* startpos for collecting unencodable chars */
4361 const Py_UNICODE *collstart = p;
4362 const Py_UNICODE *collend = p;
4363 /* find all unecodable characters */
4364 while ((collend < endp) && ((*collend)>=limit))
4365 ++collend;
4366 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4367 if (known_errorHandler==-1) {
4368 if ((errors==NULL) || (!strcmp(errors, "strict")))
4369 known_errorHandler = 1;
4370 else if (!strcmp(errors, "replace"))
4371 known_errorHandler = 2;
4372 else if (!strcmp(errors, "ignore"))
4373 known_errorHandler = 3;
4374 else if (!strcmp(errors, "xmlcharrefreplace"))
4375 known_errorHandler = 4;
4376 else
4377 known_errorHandler = 0;
4378 }
4379 switch (known_errorHandler) {
4380 case 1: /* strict */
4381 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4382 goto onError;
4383 case 2: /* replace */
4384 while (collstart++<collend)
4385 *str++ = '?'; /* fall through */
4386 case 3: /* ignore */
4387 p = collend;
4388 break;
4389 case 4: /* xmlcharrefreplace */
4390 respos = str - PyBytes_AS_STRING(res);
4391 /* determine replacement size (temporarily (mis)uses p) */
4392 for (p = collstart, repsize = 0; p < collend; ++p) {
4393 if (*p<10)
4394 repsize += 2+1+1;
4395 else if (*p<100)
4396 repsize += 2+2+1;
4397 else if (*p<1000)
4398 repsize += 2+3+1;
4399 else if (*p<10000)
4400 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004401#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 else
4403 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004404#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 else if (*p<100000)
4406 repsize += 2+5+1;
4407 else if (*p<1000000)
4408 repsize += 2+6+1;
4409 else
4410 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004411#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 }
4413 requiredsize = respos+repsize+(endp-collend);
4414 if (requiredsize > ressize) {
4415 if (requiredsize<2*ressize)
4416 requiredsize = 2*ressize;
4417 if (_PyBytes_Resize(&res, requiredsize))
4418 goto onError;
4419 str = PyBytes_AS_STRING(res) + respos;
4420 ressize = requiredsize;
4421 }
4422 /* generate replacement (temporarily (mis)uses p) */
4423 for (p = collstart; p < collend; ++p) {
4424 str += sprintf(str, "&#%d;", (int)*p);
4425 }
4426 p = collend;
4427 break;
4428 default:
4429 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4430 encoding, reason, startp, size, &exc,
4431 collstart-startp, collend-startp, &newpos);
4432 if (repunicode == NULL)
4433 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004434 if (PyBytes_Check(repunicode)) {
4435 /* Directly copy bytes result to output. */
4436 repsize = PyBytes_Size(repunicode);
4437 if (repsize > 1) {
4438 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004439 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004440 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4441 Py_DECREF(repunicode);
4442 goto onError;
4443 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004444 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004445 ressize += repsize-1;
4446 }
4447 memcpy(str, PyBytes_AsString(repunicode), repsize);
4448 str += repsize;
4449 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004450 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004451 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004452 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 /* need more space? (at least enough for what we
4454 have+the replacement+the rest of the string, so
4455 we won't have to check space for encodable characters) */
4456 respos = str - PyBytes_AS_STRING(res);
4457 repsize = PyUnicode_GET_SIZE(repunicode);
4458 requiredsize = respos+repsize+(endp-collend);
4459 if (requiredsize > ressize) {
4460 if (requiredsize<2*ressize)
4461 requiredsize = 2*ressize;
4462 if (_PyBytes_Resize(&res, requiredsize)) {
4463 Py_DECREF(repunicode);
4464 goto onError;
4465 }
4466 str = PyBytes_AS_STRING(res) + respos;
4467 ressize = requiredsize;
4468 }
4469 /* check if there is anything unencodable in the replacement
4470 and copy it to the output */
4471 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4472 c = *uni2;
4473 if (c >= limit) {
4474 raise_encode_exception(&exc, encoding, startp, size,
4475 unicodepos, unicodepos+1, reason);
4476 Py_DECREF(repunicode);
4477 goto onError;
4478 }
4479 *str = (char)c;
4480 }
4481 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004482 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004483 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004484 }
4485 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004486 /* Resize if we allocated too much */
4487 size = str - PyBytes_AS_STRING(res);
4488 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004489 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004490 if (_PyBytes_Resize(&res, size) < 0)
4491 goto onError;
4492 }
4493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 Py_XDECREF(errorHandler);
4495 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004496 return res;
4497
4498 onError:
4499 Py_XDECREF(res);
4500 Py_XDECREF(errorHandler);
4501 Py_XDECREF(exc);
4502 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503}
4504
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 Py_ssize_t size,
4507 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510}
4511
4512PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4513{
4514 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 PyErr_BadArgument();
4516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517 }
4518 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 PyUnicode_GET_SIZE(unicode),
4520 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521}
4522
4523/* --- 7-bit ASCII Codec -------------------------------------------------- */
4524
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 Py_ssize_t size,
4527 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 PyUnicodeObject *v;
4531 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004532 Py_ssize_t startinpos;
4533 Py_ssize_t endinpos;
4534 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 const char *e;
4536 PyObject *errorHandler = NULL;
4537 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004538
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004540 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 Py_UNICODE r = *(unsigned char*)s;
4542 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004543 }
Tim Petersced69f82003-09-16 20:30:58 +00004544
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 v = _PyUnicode_New(size);
4546 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 e = s + size;
4552 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 register unsigned char c = (unsigned char)*s;
4554 if (c < 128) {
4555 *p++ = c;
4556 ++s;
4557 }
4558 else {
4559 startinpos = s-starts;
4560 endinpos = startinpos + 1;
4561 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4562 if (unicode_decode_call_errorhandler(
4563 errors, &errorHandler,
4564 "ascii", "ordinal not in range(128)",
4565 &starts, &e, &startinpos, &endinpos, &exc, &s,
4566 &v, &outpos, &p))
4567 goto onError;
4568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004570 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004571 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4572 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 Py_XDECREF(errorHandler);
4574 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004576
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 Py_XDECREF(errorHandler);
4580 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581 return NULL;
4582}
4583
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 Py_ssize_t size,
4586 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589}
4590
4591PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4592{
4593 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 PyErr_BadArgument();
4595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 }
4597 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 PyUnicode_GET_SIZE(unicode),
4599 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600}
4601
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004602#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004603
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004604/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004605
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004606#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004607#define NEED_RETRY
4608#endif
4609
4610/* XXX This code is limited to "true" double-byte encodings, as
4611 a) it assumes an incomplete character consists of a single byte, and
4612 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004614
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   A byte that IsDBCSLeadByte() rejects is never a lead byte.  One that
   it accepts counts as a lead byte when any of the following holds for
   the position `prev` reported by CharPrev(s, curr):
     - prev == curr,
     - the byte at prev is not itself a lead byte,
     - prev lies exactly two bytes before curr. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
4625
4626/*
4627 * Decode MBCS string into unicode object. If 'final' is set, converts
4628 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4629 */
4630static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 const char *s, /* MBCS string */
4632 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004633 int final,
4634 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004635{
4636 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004637 Py_ssize_t n;
4638 DWORD usize;
4639 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004640
4641 assert(size >= 0);
4642
Victor Stinner554f3f02010-06-16 23:33:54 +00004643 /* check and handle 'errors' arg */
4644 if (errors==NULL || strcmp(errors, "strict")==0)
4645 flags = MB_ERR_INVALID_CHARS;
4646 else if (strcmp(errors, "ignore")==0)
4647 flags = 0;
4648 else {
4649 PyErr_Format(PyExc_ValueError,
4650 "mbcs encoding does not support errors='%s'",
4651 errors);
4652 return -1;
4653 }
4654
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004655 /* Skip trailing lead-byte unless 'final' is set */
4656 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004658
4659 /* First get the size of the result */
4660 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004661 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4662 if (usize==0)
4663 goto mbcs_decode_error;
4664 } else
4665 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004666
4667 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 /* Create unicode object */
4669 *v = _PyUnicode_New(usize);
4670 if (*v == NULL)
4671 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004672 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004673 }
4674 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 /* Extend unicode object */
4676 n = PyUnicode_GET_SIZE(*v);
4677 if (_PyUnicode_Resize(v, n + usize) < 0)
4678 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004679 }
4680
4681 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004682 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004683 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004684 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4685 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004687 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004688 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004689
4690mbcs_decode_error:
4691 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4692 we raise a UnicodeDecodeError - else it is a 'generic'
4693 windows error
4694 */
4695 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4696 /* Ideally, we should get reason from FormatMessage - this
4697 is the Windows 2000 English version of the message
4698 */
4699 PyObject *exc = NULL;
4700 const char *reason = "No mapping for the Unicode character exists "
4701 "in the target multi-byte code page.";
4702 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4703 if (exc != NULL) {
4704 PyCodec_StrictErrors(exc);
4705 Py_DECREF(exc);
4706 }
4707 } else {
4708 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4709 }
4710 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004711}
4712
4713PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004714 Py_ssize_t size,
4715 const char *errors,
4716 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004717{
4718 PyUnicodeObject *v = NULL;
4719 int done;
4720
4721 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004723
4724#ifdef NEED_RETRY
4725 retry:
4726 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004727 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004728 else
4729#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004730 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004731
4732 if (done < 0) {
4733 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004735 }
4736
4737 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004739
4740#ifdef NEED_RETRY
4741 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 s += done;
4743 size -= done;
4744 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004745 }
4746#endif
4747
4748 return (PyObject *)v;
4749}
4750
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004751PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004752 Py_ssize_t size,
4753 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004754{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004755 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4756}
4757
4758/*
4759 * Convert unicode into string object (MBCS).
4760 * Returns 0 if succeed, -1 otherwise.
4761 */
4762static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004763 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004764 int size, /* size of unicode */
4765 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004766{
Victor Stinner554f3f02010-06-16 23:33:54 +00004767 BOOL usedDefaultChar = FALSE;
4768 BOOL *pusedDefaultChar;
4769 int mbcssize;
4770 Py_ssize_t n;
4771 PyObject *exc = NULL;
4772 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004773
4774 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004775
Victor Stinner554f3f02010-06-16 23:33:54 +00004776 /* check and handle 'errors' arg */
4777 if (errors==NULL || strcmp(errors, "strict")==0) {
4778 flags = WC_NO_BEST_FIT_CHARS;
4779 pusedDefaultChar = &usedDefaultChar;
4780 } else if (strcmp(errors, "replace")==0) {
4781 flags = 0;
4782 pusedDefaultChar = NULL;
4783 } else {
4784 PyErr_Format(PyExc_ValueError,
4785 "mbcs encoding does not support errors='%s'",
4786 errors);
4787 return -1;
4788 }
4789
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004790 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004791 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004792 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4793 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 if (mbcssize == 0) {
4795 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4796 return -1;
4797 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004798 /* If we used a default char, then we failed! */
4799 if (pusedDefaultChar && *pusedDefaultChar)
4800 goto mbcs_encode_error;
4801 } else {
4802 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004803 }
4804
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004805 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004806 /* Create string object */
4807 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4808 if (*repr == NULL)
4809 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004810 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004811 }
4812 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 /* Extend string object */
4814 n = PyBytes_Size(*repr);
4815 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4816 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004817 }
4818
4819 /* Do the conversion */
4820 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004822 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4823 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4825 return -1;
4826 }
Victor Stinner554f3f02010-06-16 23:33:54 +00004827 if (pusedDefaultChar && *pusedDefaultChar)
4828 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004829 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004830 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00004831
4832mbcs_encode_error:
4833 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4834 Py_XDECREF(exc);
4835 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004836}
4837
4838PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 Py_ssize_t size,
4840 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004841{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004842 PyObject *repr = NULL;
4843 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004844
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004845#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004847 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004848 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004849 else
4850#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004851 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004852
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004853 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 Py_XDECREF(repr);
4855 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004856 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004857
4858#ifdef NEED_RETRY
4859 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 p += INT_MAX;
4861 size -= INT_MAX;
4862 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004863 }
4864#endif
4865
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004866 return repr;
4867}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004868
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004869PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4870{
4871 if (!PyUnicode_Check(unicode)) {
4872 PyErr_BadArgument();
4873 return NULL;
4874 }
4875 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 PyUnicode_GET_SIZE(unicode),
4877 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004878}
4879
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004880#undef NEED_RETRY
4881
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004882#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004883
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884/* --- Character Mapping Codec -------------------------------------------- */
4885
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 Py_ssize_t size,
4888 PyObject *mapping,
4889 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004892 Py_ssize_t startinpos;
4893 Py_ssize_t endinpos;
4894 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004895 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 PyUnicodeObject *v;
4897 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004898 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 PyObject *errorHandler = NULL;
4900 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004901 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004902 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004903
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 /* Default to Latin-1 */
4905 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907
4908 v = _PyUnicode_New(size);
4909 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004915 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 mapstring = PyUnicode_AS_UNICODE(mapping);
4917 maplen = PyUnicode_GET_SIZE(mapping);
4918 while (s < e) {
4919 unsigned char ch = *s;
4920 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 if (ch < maplen)
4923 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 if (x == 0xfffe) {
4926 /* undefined mapping */
4927 outpos = p-PyUnicode_AS_UNICODE(v);
4928 startinpos = s-starts;
4929 endinpos = startinpos+1;
4930 if (unicode_decode_call_errorhandler(
4931 errors, &errorHandler,
4932 "charmap", "character maps to <undefined>",
4933 &starts, &e, &startinpos, &endinpos, &exc, &s,
4934 &v, &outpos, &p)) {
4935 goto onError;
4936 }
4937 continue;
4938 }
4939 *p++ = x;
4940 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004941 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004942 }
4943 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 while (s < e) {
4945 unsigned char ch = *s;
4946 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004947
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4949 w = PyLong_FromLong((long)ch);
4950 if (w == NULL)
4951 goto onError;
4952 x = PyObject_GetItem(mapping, w);
4953 Py_DECREF(w);
4954 if (x == NULL) {
4955 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4956 /* No mapping found means: mapping is undefined. */
4957 PyErr_Clear();
4958 x = Py_None;
4959 Py_INCREF(x);
4960 } else
4961 goto onError;
4962 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004963
Benjamin Peterson29060642009-01-31 22:14:21 +00004964 /* Apply mapping */
4965 if (PyLong_Check(x)) {
4966 long value = PyLong_AS_LONG(x);
4967 if (value < 0 || value > 65535) {
4968 PyErr_SetString(PyExc_TypeError,
4969 "character mapping must be in range(65536)");
4970 Py_DECREF(x);
4971 goto onError;
4972 }
4973 *p++ = (Py_UNICODE)value;
4974 }
4975 else if (x == Py_None) {
4976 /* undefined mapping */
4977 outpos = p-PyUnicode_AS_UNICODE(v);
4978 startinpos = s-starts;
4979 endinpos = startinpos+1;
4980 if (unicode_decode_call_errorhandler(
4981 errors, &errorHandler,
4982 "charmap", "character maps to <undefined>",
4983 &starts, &e, &startinpos, &endinpos, &exc, &s,
4984 &v, &outpos, &p)) {
4985 Py_DECREF(x);
4986 goto onError;
4987 }
4988 Py_DECREF(x);
4989 continue;
4990 }
4991 else if (PyUnicode_Check(x)) {
4992 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004993
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 if (targetsize == 1)
4995 /* 1-1 mapping */
4996 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004997
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 else if (targetsize > 1) {
4999 /* 1-n mapping */
5000 if (targetsize > extrachars) {
5001 /* resize first */
5002 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5003 Py_ssize_t needed = (targetsize - extrachars) + \
5004 (targetsize << 2);
5005 extrachars += needed;
5006 /* XXX overflow detection missing */
5007 if (_PyUnicode_Resize(&v,
5008 PyUnicode_GET_SIZE(v) + needed) < 0) {
5009 Py_DECREF(x);
5010 goto onError;
5011 }
5012 p = PyUnicode_AS_UNICODE(v) + oldpos;
5013 }
5014 Py_UNICODE_COPY(p,
5015 PyUnicode_AS_UNICODE(x),
5016 targetsize);
5017 p += targetsize;
5018 extrachars -= targetsize;
5019 }
5020 /* 1-0 mapping: skip the character */
5021 }
5022 else {
5023 /* wrong return value */
5024 PyErr_SetString(PyExc_TypeError,
5025 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005026 Py_DECREF(x);
5027 goto onError;
5028 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 Py_DECREF(x);
5030 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 }
5033 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5035 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005036 Py_XDECREF(errorHandler);
5037 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005039
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005041 Py_XDECREF(errorHandler);
5042 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 Py_XDECREF(v);
5044 return NULL;
5045}
5046
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005047/* Charmap encoding: the lookup table */
5048
5049struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 PyObject_HEAD
5051 unsigned char level1[32];
5052 int count2, count3;
5053 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005054};
5055
5056static PyObject*
5057encoding_map_size(PyObject *obj, PyObject* args)
5058{
5059 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005060 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005062}
5063
5064static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005065 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 PyDoc_STR("Return the size (in bytes) of this object") },
5067 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005068};
5069
5070static void
5071encoding_map_dealloc(PyObject* o)
5072{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005073 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005074}
5075
5076static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005077 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 "EncodingMap", /*tp_name*/
5079 sizeof(struct encoding_map), /*tp_basicsize*/
5080 0, /*tp_itemsize*/
5081 /* methods */
5082 encoding_map_dealloc, /*tp_dealloc*/
5083 0, /*tp_print*/
5084 0, /*tp_getattr*/
5085 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005086 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 0, /*tp_repr*/
5088 0, /*tp_as_number*/
5089 0, /*tp_as_sequence*/
5090 0, /*tp_as_mapping*/
5091 0, /*tp_hash*/
5092 0, /*tp_call*/
5093 0, /*tp_str*/
5094 0, /*tp_getattro*/
5095 0, /*tp_setattro*/
5096 0, /*tp_as_buffer*/
5097 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5098 0, /*tp_doc*/
5099 0, /*tp_traverse*/
5100 0, /*tp_clear*/
5101 0, /*tp_richcompare*/
5102 0, /*tp_weaklistoffset*/
5103 0, /*tp_iter*/
5104 0, /*tp_iternext*/
5105 encoding_map_methods, /*tp_methods*/
5106 0, /*tp_members*/
5107 0, /*tp_getset*/
5108 0, /*tp_base*/
5109 0, /*tp_dict*/
5110 0, /*tp_descr_get*/
5111 0, /*tp_descr_set*/
5112 0, /*tp_dictoffset*/
5113 0, /*tp_init*/
5114 0, /*tp_alloc*/
5115 0, /*tp_new*/
5116 0, /*tp_free*/
5117 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005118};
5119
5120PyObject*
5121PyUnicode_BuildEncodingMap(PyObject* string)
5122{
5123 Py_UNICODE *decode;
5124 PyObject *result;
5125 struct encoding_map *mresult;
5126 int i;
5127 int need_dict = 0;
5128 unsigned char level1[32];
5129 unsigned char level2[512];
5130 unsigned char *mlevel1, *mlevel2, *mlevel3;
5131 int count2 = 0, count3 = 0;
5132
5133 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5134 PyErr_BadArgument();
5135 return NULL;
5136 }
5137 decode = PyUnicode_AS_UNICODE(string);
5138 memset(level1, 0xFF, sizeof level1);
5139 memset(level2, 0xFF, sizeof level2);
5140
5141 /* If there isn't a one-to-one mapping of NULL to \0,
5142 or if there are non-BMP characters, we need to use
5143 a mapping dictionary. */
5144 if (decode[0] != 0)
5145 need_dict = 1;
5146 for (i = 1; i < 256; i++) {
5147 int l1, l2;
5148 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005149#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005150 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005151#endif
5152 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005153 need_dict = 1;
5154 break;
5155 }
5156 if (decode[i] == 0xFFFE)
5157 /* unmapped character */
5158 continue;
5159 l1 = decode[i] >> 11;
5160 l2 = decode[i] >> 7;
5161 if (level1[l1] == 0xFF)
5162 level1[l1] = count2++;
5163 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005164 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005165 }
5166
5167 if (count2 >= 0xFF || count3 >= 0xFF)
5168 need_dict = 1;
5169
5170 if (need_dict) {
5171 PyObject *result = PyDict_New();
5172 PyObject *key, *value;
5173 if (!result)
5174 return NULL;
5175 for (i = 0; i < 256; i++) {
5176 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005177 key = PyLong_FromLong(decode[i]);
5178 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005179 if (!key || !value)
5180 goto failed1;
5181 if (PyDict_SetItem(result, key, value) == -1)
5182 goto failed1;
5183 Py_DECREF(key);
5184 Py_DECREF(value);
5185 }
5186 return result;
5187 failed1:
5188 Py_XDECREF(key);
5189 Py_XDECREF(value);
5190 Py_DECREF(result);
5191 return NULL;
5192 }
5193
5194 /* Create a three-level trie */
5195 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5196 16*count2 + 128*count3 - 1);
5197 if (!result)
5198 return PyErr_NoMemory();
5199 PyObject_Init(result, &EncodingMapType);
5200 mresult = (struct encoding_map*)result;
5201 mresult->count2 = count2;
5202 mresult->count3 = count3;
5203 mlevel1 = mresult->level1;
5204 mlevel2 = mresult->level23;
5205 mlevel3 = mresult->level23 + 16*count2;
5206 memcpy(mlevel1, level1, 32);
5207 memset(mlevel2, 0xFF, 16*count2);
5208 memset(mlevel3, 0, 128*count3);
5209 count3 = 0;
5210 for (i = 1; i < 256; i++) {
5211 int o1, o2, o3, i2, i3;
5212 if (decode[i] == 0xFFFE)
5213 /* unmapped character */
5214 continue;
5215 o1 = decode[i]>>11;
5216 o2 = (decode[i]>>7) & 0xF;
5217 i2 = 16*mlevel1[o1] + o2;
5218 if (mlevel2[i2] == 0xFF)
5219 mlevel2[i2] = count3++;
5220 o3 = decode[i] & 0x7F;
5221 i3 = 128*mlevel2[i2] + o3;
5222 mlevel3[i3] = i;
5223 }
5224 return result;
5225}
5226
5227static int
5228encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5229{
5230 struct encoding_map *map = (struct encoding_map*)mapping;
5231 int l1 = c>>11;
5232 int l2 = (c>>7) & 0xF;
5233 int l3 = c & 0x7F;
5234 int i;
5235
5236#ifdef Py_UNICODE_WIDE
5237 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005239 }
5240#endif
5241 if (c == 0)
5242 return 0;
5243 /* level 1*/
5244 i = map->level1[l1];
5245 if (i == 0xFF) {
5246 return -1;
5247 }
5248 /* level 2*/
5249 i = map->level23[16*i+l2];
5250 if (i == 0xFF) {
5251 return -1;
5252 }
5253 /* level 3 */
5254 i = map->level23[16*map->count2 + 128*i + l3];
5255 if (i == 0) {
5256 return -1;
5257 }
5258 return i;
5259}
5260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005261/* Lookup the character ch in the mapping. If the character
5262 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005263 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265{
Christian Heimes217cfd12007-12-02 14:31:20 +00005266 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005267 PyObject *x;
5268
5269 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271 x = PyObject_GetItem(mapping, w);
5272 Py_DECREF(w);
5273 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5275 /* No mapping found means: mapping is undefined. */
5276 PyErr_Clear();
5277 x = Py_None;
5278 Py_INCREF(x);
5279 return x;
5280 } else
5281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005283 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005285 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 long value = PyLong_AS_LONG(x);
5287 if (value < 0 || value > 255) {
5288 PyErr_SetString(PyExc_TypeError,
5289 "character mapping must be in range(256)");
5290 Py_DECREF(x);
5291 return NULL;
5292 }
5293 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005295 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 /* wrong return value */
5299 PyErr_Format(PyExc_TypeError,
5300 "character mapping must return integer, bytes or None, not %.400s",
5301 x->ob_type->tp_name);
5302 Py_DECREF(x);
5303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 }
5305}
5306
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005307static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005308charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005309{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005310 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5311 /* exponentially overallocate to minimize reallocations */
5312 if (requiredsize < 2*outsize)
5313 requiredsize = 2*outsize;
5314 if (_PyBytes_Resize(outobj, requiredsize))
5315 return -1;
5316 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005317}
5318
/* Outcome of trying to emit one character via charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005322/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005323 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 space is available. Return a new reference to the object that
5325 was put in the output buffer, or Py_None, if the mapping was undefined
5326 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005327 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005328static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005329charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005331{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005332 PyObject *rep;
5333 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005334 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005335
Christian Heimes90aa7642007-12-19 02:45:37 +00005336 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005337 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005339 if (res == -1)
5340 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 if (outsize<requiredsize)
5342 if (charmapencode_resize(outobj, outpos, requiredsize))
5343 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005344 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 outstart[(*outpos)++] = (char)res;
5346 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005347 }
5348
5349 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005352 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 Py_DECREF(rep);
5354 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005355 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 if (PyLong_Check(rep)) {
5357 Py_ssize_t requiredsize = *outpos+1;
5358 if (outsize<requiredsize)
5359 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5360 Py_DECREF(rep);
5361 return enc_EXCEPTION;
5362 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005363 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005365 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 else {
5367 const char *repchars = PyBytes_AS_STRING(rep);
5368 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5369 Py_ssize_t requiredsize = *outpos+repsize;
5370 if (outsize<requiredsize)
5371 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5372 Py_DECREF(rep);
5373 return enc_EXCEPTION;
5374 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005375 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 memcpy(outstart + *outpos, repchars, repsize);
5377 *outpos += repsize;
5378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005379 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005380 Py_DECREF(rep);
5381 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005382}
5383
5384/* handle an error in PyUnicode_EncodeCharmap
5385 Return 0 on success, -1 on error */
5386static
5387int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005388 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005390 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005391 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005392{
5393 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005394 Py_ssize_t repsize;
5395 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005396 Py_UNICODE *uni2;
5397 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005398 Py_ssize_t collstartpos = *inpos;
5399 Py_ssize_t collendpos = *inpos+1;
5400 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005401 char *encoding = "charmap";
5402 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005403 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005404
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005405 /* find all unencodable characters */
5406 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005407 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005408 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 int res = encoding_map_lookup(p[collendpos], mapping);
5410 if (res != -1)
5411 break;
5412 ++collendpos;
5413 continue;
5414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005415
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 rep = charmapencode_lookup(p[collendpos], mapping);
5417 if (rep==NULL)
5418 return -1;
5419 else if (rep!=Py_None) {
5420 Py_DECREF(rep);
5421 break;
5422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005423 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425 }
5426 /* cache callback name lookup
5427 * (if not done yet, i.e. it's the first error) */
5428 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 if ((errors==NULL) || (!strcmp(errors, "strict")))
5430 *known_errorHandler = 1;
5431 else if (!strcmp(errors, "replace"))
5432 *known_errorHandler = 2;
5433 else if (!strcmp(errors, "ignore"))
5434 *known_errorHandler = 3;
5435 else if (!strcmp(errors, "xmlcharrefreplace"))
5436 *known_errorHandler = 4;
5437 else
5438 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439 }
5440 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005441 case 1: /* strict */
5442 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5443 return -1;
5444 case 2: /* replace */
5445 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 x = charmapencode_output('?', mapping, res, respos);
5447 if (x==enc_EXCEPTION) {
5448 return -1;
5449 }
5450 else if (x==enc_FAILED) {
5451 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5452 return -1;
5453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005454 }
5455 /* fall through */
5456 case 3: /* ignore */
5457 *inpos = collendpos;
5458 break;
5459 case 4: /* xmlcharrefreplace */
5460 /* generate replacement (temporarily (mis)uses p) */
5461 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 char buffer[2+29+1+1];
5463 char *cp;
5464 sprintf(buffer, "&#%d;", (int)p[collpos]);
5465 for (cp = buffer; *cp; ++cp) {
5466 x = charmapencode_output(*cp, mapping, res, respos);
5467 if (x==enc_EXCEPTION)
5468 return -1;
5469 else if (x==enc_FAILED) {
5470 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5471 return -1;
5472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005473 }
5474 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005475 *inpos = collendpos;
5476 break;
5477 default:
5478 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 encoding, reason, p, size, exceptionObject,
5480 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005481 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005483 if (PyBytes_Check(repunicode)) {
5484 /* Directly copy bytes result to output. */
5485 Py_ssize_t outsize = PyBytes_Size(*res);
5486 Py_ssize_t requiredsize;
5487 repsize = PyBytes_Size(repunicode);
5488 requiredsize = *respos + repsize;
5489 if (requiredsize > outsize)
5490 /* Make room for all additional bytes. */
5491 if (charmapencode_resize(res, respos, requiredsize)) {
5492 Py_DECREF(repunicode);
5493 return -1;
5494 }
5495 memcpy(PyBytes_AsString(*res) + *respos,
5496 PyBytes_AsString(repunicode), repsize);
5497 *respos += repsize;
5498 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005499 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005500 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005501 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005502 /* generate replacement */
5503 repsize = PyUnicode_GET_SIZE(repunicode);
5504 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 x = charmapencode_output(*uni2, mapping, res, respos);
5506 if (x==enc_EXCEPTION) {
5507 return -1;
5508 }
5509 else if (x==enc_FAILED) {
5510 Py_DECREF(repunicode);
5511 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5512 return -1;
5513 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005514 }
5515 *inpos = newpos;
5516 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 }
5518 return 0;
5519}
5520
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 Py_ssize_t size,
5523 PyObject *mapping,
5524 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 /* output object */
5527 PyObject *res = NULL;
5528 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005529 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005530 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005531 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005532 PyObject *errorHandler = NULL;
5533 PyObject *exc = NULL;
5534 /* the following variable is used for caching string comparisons
5535 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5536 * 3=ignore, 4=xmlcharrefreplace */
5537 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
5539 /* Default to Latin-1 */
5540 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 /* allocate enough for a simple encoding without
5544 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005545 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005546 if (res == NULL)
5547 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005548 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 /* try to encode it */
5553 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5554 if (x==enc_EXCEPTION) /* error */
5555 goto onError;
5556 if (x==enc_FAILED) { /* unencodable character */
5557 if (charmap_encoding_error(p, size, &inpos, mapping,
5558 &exc,
5559 &known_errorHandler, &errorHandler, errors,
5560 &res, &respos)) {
5561 goto onError;
5562 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005563 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 else
5565 /* done with this character => adjust input position */
5566 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005570 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005571 if (_PyBytes_Resize(&res, respos) < 0)
5572 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 Py_XDECREF(exc);
5575 Py_XDECREF(errorHandler);
5576 return res;
5577
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005579 Py_XDECREF(res);
5580 Py_XDECREF(exc);
5581 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 return NULL;
5583}
5584
5585PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587{
5588 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 PyErr_BadArgument();
5590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 }
5592 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 PyUnicode_GET_SIZE(unicode),
5594 mapping,
5595 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596}
5597
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598/* create or adjust a UnicodeTranslateError */
5599static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 const Py_UNICODE *unicode, Py_ssize_t size,
5601 Py_ssize_t startpos, Py_ssize_t endpos,
5602 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005604 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005605 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 }
5608 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5610 goto onError;
5611 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5612 goto onError;
5613 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5614 goto onError;
5615 return;
5616 onError:
5617 Py_DECREF(*exceptionObject);
5618 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 }
5620}
5621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622/* raises a UnicodeTranslateError */
5623static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 const Py_UNICODE *unicode, Py_ssize_t size,
5625 Py_ssize_t startpos, Py_ssize_t endpos,
5626 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627{
5628 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632}
5633
5634/* error handling callback helper:
5635 build arguments, call the callback and check the arguments,
5636 put the result into newpos and return the replacement string, which
5637 has to be freed by the caller */
5638static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 PyObject **errorHandler,
5640 const char *reason,
5641 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5642 Py_ssize_t startpos, Py_ssize_t endpos,
5643 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005644{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005645 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005647 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 PyObject *restuple;
5649 PyObject *resunicode;
5650
5651 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 }
5656
5657 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005659 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661
5662 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005666 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005667 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 Py_DECREF(restuple);
5669 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005670 }
5671 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 &resunicode, &i_newpos)) {
5673 Py_DECREF(restuple);
5674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005676 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005678 else
5679 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005680 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5682 Py_DECREF(restuple);
5683 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005684 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 Py_INCREF(resunicode);
5686 Py_DECREF(restuple);
5687 return resunicode;
5688}
5689
5690/* Lookup the character ch in the mapping and put the result in result,
5691 which must be decrefed by the caller.
5692 Return 0 on success, -1 on error */
5693static
5694int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5695{
Christian Heimes217cfd12007-12-02 14:31:20 +00005696 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 PyObject *x;
5698
5699 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 x = PyObject_GetItem(mapping, w);
5702 Py_DECREF(w);
5703 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5705 /* No mapping found means: use 1:1 mapping. */
5706 PyErr_Clear();
5707 *result = NULL;
5708 return 0;
5709 } else
5710 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 }
5712 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 *result = x;
5714 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005716 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 long value = PyLong_AS_LONG(x);
5718 long max = PyUnicode_GetMax();
5719 if (value < 0 || value > max) {
5720 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005721 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 Py_DECREF(x);
5723 return -1;
5724 }
5725 *result = x;
5726 return 0;
5727 }
5728 else if (PyUnicode_Check(x)) {
5729 *result = x;
5730 return 0;
5731 }
5732 else {
5733 /* wrong return value */
5734 PyErr_SetString(PyExc_TypeError,
5735 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005736 Py_DECREF(x);
5737 return -1;
5738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739}
5740/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 if not reallocate and adjust various state variables.
5742 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743static
Walter Dörwald4894c302003-10-24 14:25:28 +00005744int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005747 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005748 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 /* remember old output position */
5750 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5751 /* exponentially overallocate to minimize reallocations */
5752 if (requiredsize < 2 * oldsize)
5753 requiredsize = 2 * oldsize;
5754 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5755 return -1;
5756 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 }
5758 return 0;
5759}
5760/* lookup the character, put the result in the output string and adjust
5761 various state variables. Return a new reference to the object that
5762 was put in the output buffer in *result, or Py_None, if the mapping was
5763 undefined (in which case no character was written).
5764 The called must decref result.
5765 Return 0 on success, -1 on error. */
5766static
Walter Dörwald4894c302003-10-24 14:25:28 +00005767int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5769 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770{
Walter Dörwald4894c302003-10-24 14:25:28 +00005771 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 /* not found => default to 1:1 mapping */
5775 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 }
5777 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00005779 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* no overflow check, because we know that the space is enough */
5781 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 }
5783 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5785 if (repsize==1) {
5786 /* no overflow check, because we know that the space is enough */
5787 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5788 }
5789 else if (repsize!=0) {
5790 /* more than one character */
5791 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5792 (insize - (curinp-startinp)) +
5793 repsize - 1;
5794 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5795 return -1;
5796 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5797 *outp += repsize;
5798 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799 }
5800 else
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 return 0;
5803}
5804
5805PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 Py_ssize_t size,
5807 PyObject *mapping,
5808 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 /* output object */
5811 PyObject *res = NULL;
5812 /* pointers to the beginning and end+1 of input */
5813 const Py_UNICODE *startp = p;
5814 const Py_UNICODE *endp = p + size;
5815 /* pointer into the output */
5816 Py_UNICODE *str;
5817 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005818 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005819 char *reason = "character maps to <undefined>";
5820 PyObject *errorHandler = NULL;
5821 PyObject *exc = NULL;
5822 /* the following variable is used for caching string comparisons
5823 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5824 * 3=ignore, 4=xmlcharrefreplace */
5825 int known_errorHandler = -1;
5826
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 PyErr_BadArgument();
5829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831
5832 /* allocate enough for a simple 1:1 translation without
5833 replacements, if we need more, we'll resize */
5834 res = PyUnicode_FromUnicode(NULL, size);
5835 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005841 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 /* try to encode it */
5843 PyObject *x = NULL;
5844 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5845 Py_XDECREF(x);
5846 goto onError;
5847 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005848 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 if (x!=Py_None) /* it worked => adjust input pointer */
5850 ++p;
5851 else { /* untranslatable character */
5852 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5853 Py_ssize_t repsize;
5854 Py_ssize_t newpos;
5855 Py_UNICODE *uni2;
5856 /* startpos for collecting untranslatable chars */
5857 const Py_UNICODE *collstart = p;
5858 const Py_UNICODE *collend = p+1;
5859 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 /* find all untranslatable characters */
5862 while (collend < endp) {
5863 if (charmaptranslate_lookup(*collend, mapping, &x))
5864 goto onError;
5865 Py_XDECREF(x);
5866 if (x!=Py_None)
5867 break;
5868 ++collend;
5869 }
5870 /* cache callback name lookup
5871 * (if not done yet, i.e. it's the first error) */
5872 if (known_errorHandler==-1) {
5873 if ((errors==NULL) || (!strcmp(errors, "strict")))
5874 known_errorHandler = 1;
5875 else if (!strcmp(errors, "replace"))
5876 known_errorHandler = 2;
5877 else if (!strcmp(errors, "ignore"))
5878 known_errorHandler = 3;
5879 else if (!strcmp(errors, "xmlcharrefreplace"))
5880 known_errorHandler = 4;
5881 else
5882 known_errorHandler = 0;
5883 }
5884 switch (known_errorHandler) {
5885 case 1: /* strict */
5886 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005887 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 case 2: /* replace */
5889 /* No need to check for space, this is a 1:1 replacement */
5890 for (coll = collstart; coll<collend; ++coll)
5891 *str++ = '?';
5892 /* fall through */
5893 case 3: /* ignore */
5894 p = collend;
5895 break;
5896 case 4: /* xmlcharrefreplace */
5897 /* generate replacement (temporarily (mis)uses p) */
5898 for (p = collstart; p < collend; ++p) {
5899 char buffer[2+29+1+1];
5900 char *cp;
5901 sprintf(buffer, "&#%d;", (int)*p);
5902 if (charmaptranslate_makespace(&res, &str,
5903 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5904 goto onError;
5905 for (cp = buffer; *cp; ++cp)
5906 *str++ = *cp;
5907 }
5908 p = collend;
5909 break;
5910 default:
5911 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5912 reason, startp, size, &exc,
5913 collstart-startp, collend-startp, &newpos);
5914 if (repunicode == NULL)
5915 goto onError;
5916 /* generate replacement */
5917 repsize = PyUnicode_GET_SIZE(repunicode);
5918 if (charmaptranslate_makespace(&res, &str,
5919 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5920 Py_DECREF(repunicode);
5921 goto onError;
5922 }
5923 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5924 *str++ = *uni2;
5925 p = startp + newpos;
5926 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005928 }
5929 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 /* Resize if we allocated to much */
5931 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005932 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 if (PyUnicode_Resize(&res, respos) < 0)
5934 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 }
5936 Py_XDECREF(exc);
5937 Py_XDECREF(errorHandler);
5938 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941 Py_XDECREF(res);
5942 Py_XDECREF(exc);
5943 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 return NULL;
5945}
5946
5947PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 PyObject *mapping,
5949 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950{
5951 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005952
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 str = PyUnicode_FromObject(str);
5954 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 PyUnicode_GET_SIZE(str),
5958 mapping,
5959 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 Py_DECREF(str);
5961 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005962
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 Py_XDECREF(str);
5965 return NULL;
5966}
Tim Petersced69f82003-09-16 20:30:58 +00005967
Guido van Rossum9e896b32000-04-05 20:11:21 +00005968/* --- Decimal Encoder ---------------------------------------------------- */
5969
5970int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 Py_ssize_t length,
5972 char *output,
5973 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005974{
5975 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976 PyObject *errorHandler = NULL;
5977 PyObject *exc = NULL;
5978 const char *encoding = "decimal";
5979 const char *reason = "invalid decimal Unicode string";
5980 /* the following variable is used for caching string comparisons
5981 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5982 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005983
5984 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 PyErr_BadArgument();
5986 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005987 }
5988
5989 p = s;
5990 end = s + length;
5991 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 register Py_UNICODE ch = *p;
5993 int decimal;
5994 PyObject *repunicode;
5995 Py_ssize_t repsize;
5996 Py_ssize_t newpos;
5997 Py_UNICODE *uni2;
5998 Py_UNICODE *collstart;
5999 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006000
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006002 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 ++p;
6004 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006005 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 decimal = Py_UNICODE_TODECIMAL(ch);
6007 if (decimal >= 0) {
6008 *output++ = '0' + decimal;
6009 ++p;
6010 continue;
6011 }
6012 if (0 < ch && ch < 256) {
6013 *output++ = (char)ch;
6014 ++p;
6015 continue;
6016 }
6017 /* All other characters are considered unencodable */
6018 collstart = p;
6019 collend = p+1;
6020 while (collend < end) {
6021 if ((0 < *collend && *collend < 256) ||
6022 !Py_UNICODE_ISSPACE(*collend) ||
6023 Py_UNICODE_TODECIMAL(*collend))
6024 break;
6025 }
6026 /* cache callback name lookup
6027 * (if not done yet, i.e. it's the first error) */
6028 if (known_errorHandler==-1) {
6029 if ((errors==NULL) || (!strcmp(errors, "strict")))
6030 known_errorHandler = 1;
6031 else if (!strcmp(errors, "replace"))
6032 known_errorHandler = 2;
6033 else if (!strcmp(errors, "ignore"))
6034 known_errorHandler = 3;
6035 else if (!strcmp(errors, "xmlcharrefreplace"))
6036 known_errorHandler = 4;
6037 else
6038 known_errorHandler = 0;
6039 }
6040 switch (known_errorHandler) {
6041 case 1: /* strict */
6042 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6043 goto onError;
6044 case 2: /* replace */
6045 for (p = collstart; p < collend; ++p)
6046 *output++ = '?';
6047 /* fall through */
6048 case 3: /* ignore */
6049 p = collend;
6050 break;
6051 case 4: /* xmlcharrefreplace */
6052 /* generate replacement (temporarily (mis)uses p) */
6053 for (p = collstart; p < collend; ++p)
6054 output += sprintf(output, "&#%d;", (int)*p);
6055 p = collend;
6056 break;
6057 default:
6058 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6059 encoding, reason, s, length, &exc,
6060 collstart-s, collend-s, &newpos);
6061 if (repunicode == NULL)
6062 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006063 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006064 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006065 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6066 Py_DECREF(repunicode);
6067 goto onError;
6068 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 /* generate replacement */
6070 repsize = PyUnicode_GET_SIZE(repunicode);
6071 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6072 Py_UNICODE ch = *uni2;
6073 if (Py_UNICODE_ISSPACE(ch))
6074 *output++ = ' ';
6075 else {
6076 decimal = Py_UNICODE_TODECIMAL(ch);
6077 if (decimal >= 0)
6078 *output++ = '0' + decimal;
6079 else if (0 < ch && ch < 256)
6080 *output++ = (char)ch;
6081 else {
6082 Py_DECREF(repunicode);
6083 raise_encode_exception(&exc, encoding,
6084 s, length, collstart-s, collend-s, reason);
6085 goto onError;
6086 }
6087 }
6088 }
6089 p = s + newpos;
6090 Py_DECREF(repunicode);
6091 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006092 }
6093 /* 0-terminate the output string */
6094 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095 Py_XDECREF(exc);
6096 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006097 return 0;
6098
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006100 Py_XDECREF(exc);
6101 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006102 return -1;
6103}
6104
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105/* --- Helpers ------------------------------------------------------------ */
6106
Eric Smith8c663262007-08-25 02:26:07 +00006107#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006108#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006109
Thomas Wouters477c8d52006-05-27 19:21:47 +00006110#include "stringlib/count.h"
6111#include "stringlib/find.h"
6112#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006113#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006114
Eric Smith5807c412008-05-11 21:00:57 +00006115#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006116#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006117#include "stringlib/localeutil.h"
6118
/* Helper macro to fix up start/end slice values in place for a sequence
   of length len:
     - an end greater than len is cut down to len;
     - a negative end or start has len added and is then floored at 0.
   start is not capped at len.

   start and end must be modifiable lvalues.  Every argument may be
   evaluated more than once, so pass expressions without side effects.
   The body is wrapped in do { } while (0) so that an invocation followed
   by ';' is exactly one statement, also inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006133
Martin v. Löwis18e16552006-02-15 17:27:45 +00006134Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006135 PyObject *substr,
6136 Py_ssize_t start,
6137 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006139 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006140 PyUnicodeObject* str_obj;
6141 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006142
Thomas Wouters477c8d52006-05-27 19:21:47 +00006143 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6144 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006146 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6147 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 Py_DECREF(str_obj);
6149 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 }
Tim Petersced69f82003-09-16 20:30:58 +00006151
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006152 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006153 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006154 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6155 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006156 );
6157
6158 Py_DECREF(sub_obj);
6159 Py_DECREF(str_obj);
6160
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 return result;
6162}
6163
Martin v. Löwis18e16552006-02-15 17:27:45 +00006164Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006165 PyObject *sub,
6166 Py_ssize_t start,
6167 Py_ssize_t end,
6168 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006170 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006171
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006173 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006175 sub = PyUnicode_FromObject(sub);
6176 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 Py_DECREF(str);
6178 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 }
Tim Petersced69f82003-09-16 20:30:58 +00006180
Thomas Wouters477c8d52006-05-27 19:21:47 +00006181 if (direction > 0)
6182 result = stringlib_find_slice(
6183 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6184 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6185 start, end
6186 );
6187 else
6188 result = stringlib_rfind_slice(
6189 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6190 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6191 start, end
6192 );
6193
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006195 Py_DECREF(sub);
6196
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 return result;
6198}
6199
Tim Petersced69f82003-09-16 20:30:58 +00006200static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 PyUnicodeObject *substring,
6203 Py_ssize_t start,
6204 Py_ssize_t end,
6205 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 if (substring->length == 0)
6208 return 1;
6209
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006210 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 end -= substring->length;
6212 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214
6215 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 if (Py_UNICODE_MATCH(self, end, substring))
6217 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 } else {
6219 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 }
6222
6223 return 0;
6224}
6225
Martin v. Löwis18e16552006-02-15 17:27:45 +00006226Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 PyObject *substr,
6228 Py_ssize_t start,
6229 Py_ssize_t end,
6230 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006232 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006233
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 str = PyUnicode_FromObject(str);
6235 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 substr = PyUnicode_FromObject(substr);
6238 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 Py_DECREF(str);
6240 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 }
Tim Petersced69f82003-09-16 20:30:58 +00006242
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 (PyUnicodeObject *)substr,
6245 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 Py_DECREF(str);
6247 Py_DECREF(substr);
6248 return result;
6249}
6250
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251/* Apply fixfct filter to the Unicode object self and return a
6252 reference to the modified object */
6253
Tim Petersced69f82003-09-16 20:30:58 +00006254static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257{
6258
6259 PyUnicodeObject *u;
6260
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006261 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006264
6265 Py_UNICODE_COPY(u->str, self->str, self->length);
6266
Tim Peters7a29bd52001-09-12 03:03:31 +00006267 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 /* fixfct should return TRUE if it modified the buffer. If
6269 FALSE, return a reference to the original buffer instead
6270 (to save space, not time) */
6271 Py_INCREF(self);
6272 Py_DECREF(u);
6273 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 }
6275 return (PyObject*) u;
6276}
6277
Tim Petersced69f82003-09-16 20:30:58 +00006278static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279int fixupper(PyUnicodeObject *self)
6280{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006281 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 Py_UNICODE *s = self->str;
6283 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006284
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006287
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 ch = Py_UNICODE_TOUPPER(*s);
6289 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 *s = ch;
6292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 s++;
6294 }
6295
6296 return status;
6297}
6298
Tim Petersced69f82003-09-16 20:30:58 +00006299static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300int fixlower(PyUnicodeObject *self)
6301{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006302 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 Py_UNICODE *s = self->str;
6304 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006308
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 ch = Py_UNICODE_TOLOWER(*s);
6310 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 *s = ch;
6313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 s++;
6315 }
6316
6317 return status;
6318}
6319
Tim Petersced69f82003-09-16 20:30:58 +00006320static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321int fixswapcase(PyUnicodeObject *self)
6322{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006323 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 Py_UNICODE *s = self->str;
6325 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006326
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 while (len-- > 0) {
6328 if (Py_UNICODE_ISUPPER(*s)) {
6329 *s = Py_UNICODE_TOLOWER(*s);
6330 status = 1;
6331 } else if (Py_UNICODE_ISLOWER(*s)) {
6332 *s = Py_UNICODE_TOUPPER(*s);
6333 status = 1;
6334 }
6335 s++;
6336 }
6337
6338 return status;
6339}
6340
Tim Petersced69f82003-09-16 20:30:58 +00006341static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342int fixcapitalize(PyUnicodeObject *self)
6343{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006344 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006345 Py_UNICODE *s = self->str;
6346 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006347
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006348 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006350 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 *s = Py_UNICODE_TOUPPER(*s);
6352 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006354 s++;
6355 while (--len > 0) {
6356 if (Py_UNICODE_ISUPPER(*s)) {
6357 *s = Py_UNICODE_TOLOWER(*s);
6358 status = 1;
6359 }
6360 s++;
6361 }
6362 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363}
6364
6365static
6366int fixtitle(PyUnicodeObject *self)
6367{
6368 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6369 register Py_UNICODE *e;
6370 int previous_is_cased;
6371
6372 /* Shortcut for single character strings */
6373 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6375 if (*p != ch) {
6376 *p = ch;
6377 return 1;
6378 }
6379 else
6380 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 }
Tim Petersced69f82003-09-16 20:30:58 +00006382
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 e = p + PyUnicode_GET_SIZE(self);
6384 previous_is_cased = 0;
6385 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006387
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 if (previous_is_cased)
6389 *p = Py_UNICODE_TOLOWER(ch);
6390 else
6391 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006392
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 if (Py_UNICODE_ISLOWER(ch) ||
6394 Py_UNICODE_ISUPPER(ch) ||
6395 Py_UNICODE_ISTITLE(ch))
6396 previous_is_cased = 1;
6397 else
6398 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 }
6400 return 1;
6401}
6402
Tim Peters8ce9f162004-08-27 01:49:32 +00006403PyObject *
6404PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405{
Skip Montanaro6543b452004-09-16 03:28:13 +00006406 const Py_UNICODE blank = ' ';
6407 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006408 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006409 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006410 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6411 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006412 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6413 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006414 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006415 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416
Tim Peters05eba1f2004-08-27 21:32:02 +00006417 fseq = PySequence_Fast(seq, "");
6418 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006419 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006420 }
6421
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006422 /* NOTE: the following code can't call back into Python code,
6423 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006424 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006425
Tim Peters05eba1f2004-08-27 21:32:02 +00006426 seqlen = PySequence_Fast_GET_SIZE(fseq);
6427 /* If empty sequence, return u"". */
6428 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006429 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6430 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006431 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006432 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006433 /* If singleton sequence with an exact Unicode, return that. */
6434 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 item = items[0];
6436 if (PyUnicode_CheckExact(item)) {
6437 Py_INCREF(item);
6438 res = (PyUnicodeObject *)item;
6439 goto Done;
6440 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006441 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006442 else {
6443 /* Set up sep and seplen */
6444 if (separator == NULL) {
6445 sep = &blank;
6446 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006447 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006448 else {
6449 if (!PyUnicode_Check(separator)) {
6450 PyErr_Format(PyExc_TypeError,
6451 "separator: expected str instance,"
6452 " %.80s found",
6453 Py_TYPE(separator)->tp_name);
6454 goto onError;
6455 }
6456 sep = PyUnicode_AS_UNICODE(separator);
6457 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006458 }
6459 }
6460
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006461 /* There are at least two things to join, or else we have a subclass
6462 * of str in the sequence.
6463 * Do a pre-pass to figure out the total amount of space we'll
6464 * need (sz), and see whether all argument are strings.
6465 */
6466 sz = 0;
6467 for (i = 0; i < seqlen; i++) {
6468 const Py_ssize_t old_sz = sz;
6469 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 if (!PyUnicode_Check(item)) {
6471 PyErr_Format(PyExc_TypeError,
6472 "sequence item %zd: expected str instance,"
6473 " %.80s found",
6474 i, Py_TYPE(item)->tp_name);
6475 goto onError;
6476 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006477 sz += PyUnicode_GET_SIZE(item);
6478 if (i != 0)
6479 sz += seplen;
6480 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6481 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006483 goto onError;
6484 }
6485 }
Tim Petersced69f82003-09-16 20:30:58 +00006486
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006487 res = _PyUnicode_New(sz);
6488 if (res == NULL)
6489 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006490
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006491 /* Catenate everything. */
6492 res_p = PyUnicode_AS_UNICODE(res);
6493 for (i = 0; i < seqlen; ++i) {
6494 Py_ssize_t itemlen;
6495 item = items[i];
6496 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 /* Copy item, and maybe the separator. */
6498 if (i) {
6499 Py_UNICODE_COPY(res_p, sep, seplen);
6500 res_p += seplen;
6501 }
6502 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6503 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006504 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006505
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006507 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 return (PyObject *)res;
6509
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006511 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006512 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 return NULL;
6514}
6515
Tim Petersced69f82003-09-16 20:30:58 +00006516static
6517PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 Py_ssize_t left,
6519 Py_ssize_t right,
6520 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521{
6522 PyUnicodeObject *u;
6523
6524 if (left < 0)
6525 left = 0;
6526 if (right < 0)
6527 right = 0;
6528
Tim Peters7a29bd52001-09-12 03:03:31 +00006529 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 Py_INCREF(self);
6531 return self;
6532 }
6533
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006534 if (left > PY_SSIZE_T_MAX - self->length ||
6535 right > PY_SSIZE_T_MAX - (left + self->length)) {
6536 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6537 return NULL;
6538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 u = _PyUnicode_New(left + self->length + right);
6540 if (u) {
6541 if (left)
6542 Py_UNICODE_FILL(u->str, fill, left);
6543 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6544 if (right)
6545 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6546 }
6547
6548 return u;
6549}
6550
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006551PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554
6555 string = PyUnicode_FromObject(string);
6556 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006559 list = stringlib_splitlines(
6560 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6561 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562
6563 Py_DECREF(string);
6564 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565}
6566
Tim Petersced69f82003-09-16 20:30:58 +00006567static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 PyUnicodeObject *substring,
6570 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006573 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006576 return stringlib_split_whitespace(
6577 (PyObject*) self, self->str, self->length, maxcount
6578 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006580 return stringlib_split(
6581 (PyObject*) self, self->str, self->length,
6582 substring->str, substring->length,
6583 maxcount
6584 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585}
6586
Tim Petersced69f82003-09-16 20:30:58 +00006587static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006588PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 PyUnicodeObject *substring,
6590 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006591{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006592 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006593 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006594
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006595 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006596 return stringlib_rsplit_whitespace(
6597 (PyObject*) self, self->str, self->length, maxcount
6598 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006599
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006600 return stringlib_rsplit(
6601 (PyObject*) self, self->str, self->length,
6602 substring->str, substring->length,
6603 maxcount
6604 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006605}
6606
6607static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 PyUnicodeObject *str1,
6610 PyUnicodeObject *str2,
6611 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612{
6613 PyUnicodeObject *u;
6614
6615 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006617 else if (maxcount == 0 || self->length == 0)
6618 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619
Thomas Wouters477c8d52006-05-27 19:21:47 +00006620 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006621 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006622 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006623 if (str1->length == 0)
6624 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006625 if (str1->length == 1) {
6626 /* replace characters */
6627 Py_UNICODE u1, u2;
6628 if (!findchar(self->str, self->length, str1->str[0]))
6629 goto nothing;
6630 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6631 if (!u)
6632 return NULL;
6633 Py_UNICODE_COPY(u->str, self->str, self->length);
6634 u1 = str1->str[0];
6635 u2 = str2->str[0];
6636 for (i = 0; i < u->length; i++)
6637 if (u->str[i] == u1) {
6638 if (--maxcount < 0)
6639 break;
6640 u->str[i] = u2;
6641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006643 i = stringlib_find(
6644 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006646 if (i < 0)
6647 goto nothing;
6648 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6649 if (!u)
6650 return NULL;
6651 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006652
6653 /* change everything in-place, starting with this one */
6654 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6655 i += str1->length;
6656
6657 while ( --maxcount > 0) {
6658 i = stringlib_find(self->str+i, self->length-i,
6659 str1->str, str1->length,
6660 i);
6661 if (i == -1)
6662 break;
6663 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6664 i += str1->length;
6665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006668
6669 Py_ssize_t n, i, j, e;
6670 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 Py_UNICODE *p;
6672
6673 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006674 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6675 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006676 if (n == 0)
6677 goto nothing;
6678 /* new_size = self->length + n * (str2->length - str1->length)); */
6679 delta = (str2->length - str1->length);
6680 if (delta == 0) {
6681 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006683 product = n * (str2->length - str1->length);
6684 if ((product / (str2->length - str1->length)) != n) {
6685 PyErr_SetString(PyExc_OverflowError,
6686 "replace string is too long");
6687 return NULL;
6688 }
6689 new_size = self->length + product;
6690 if (new_size < 0) {
6691 PyErr_SetString(PyExc_OverflowError,
6692 "replace string is too long");
6693 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 }
6695 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006696 u = _PyUnicode_New(new_size);
6697 if (!u)
6698 return NULL;
6699 i = 0;
6700 p = u->str;
6701 e = self->length - str1->length;
6702 if (str1->length > 0) {
6703 while (n-- > 0) {
6704 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006705 j = stringlib_find(self->str+i, self->length-i,
6706 str1->str, str1->length,
6707 i);
6708 if (j == -1)
6709 break;
6710 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006711 /* copy unchanged part [i:j] */
6712 Py_UNICODE_COPY(p, self->str+i, j-i);
6713 p += j - i;
6714 }
6715 /* copy substitution string */
6716 if (str2->length > 0) {
6717 Py_UNICODE_COPY(p, str2->str, str2->length);
6718 p += str2->length;
6719 }
6720 i = j + str1->length;
6721 }
6722 if (i < self->length)
6723 /* copy tail [i:] */
6724 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6725 } else {
6726 /* interleave */
6727 while (n > 0) {
6728 Py_UNICODE_COPY(p, str2->str, str2->length);
6729 p += str2->length;
6730 if (--n <= 0)
6731 break;
6732 *p++ = self->str[i++];
6733 }
6734 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006738
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006740 /* nothing to replace; return original string (when possible) */
6741 if (PyUnicode_CheckExact(self)) {
6742 Py_INCREF(self);
6743 return (PyObject *) self;
6744 }
6745 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746}
6747
6748/* --- Unicode Object Methods --------------------------------------------- */
6749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006750PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752\n\
6753Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006754characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755
6756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006757unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 return fixup(self, fixtitle);
6760}
6761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006762PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764\n\
6765Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006766have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767
6768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006769unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 return fixup(self, fixcapitalize);
6772}
6773
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self with split(self, NULL, -1), replaces every list item by the
   result of fixup(item, fixcapitalize), and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL as soon as any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;          /* joined is still NULL here */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
6811
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006812/* Argument converter. Coerces to a single unicode character */
6813
6814static int
6815convert_uc(PyObject *obj, void *addr)
6816{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006817 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6818 PyObject *uniobj;
6819 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006820
Benjamin Peterson14339b62009-01-31 16:36:08 +00006821 uniobj = PyUnicode_FromObject(obj);
6822 if (uniobj == NULL) {
6823 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006825 return 0;
6826 }
6827 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6828 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006830 Py_DECREF(uniobj);
6831 return 0;
6832 }
6833 unistr = PyUnicode_AS_UNICODE(uniobj);
6834 *fillcharloc = unistr[0];
6835 Py_DECREF(uniobj);
6836 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006837}
6838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006839PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00006842Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006843done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
6845static PyObject *
6846unicode_center(PyUnicodeObject *self, PyObject *args)
6847{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006848 Py_ssize_t marg, left;
6849 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006850 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
Thomas Woutersde017742006-02-16 19:34:37 +00006852 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 return NULL;
6854
Tim Peters7a29bd52001-09-12 03:03:31 +00006855 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 Py_INCREF(self);
6857 return (PyObject*) self;
6858 }
6859
6860 marg = width - self->length;
6861 left = marg / 2 + (marg & width & 1);
6862
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006863 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864}
6865
Marc-André Lemburge5034372000-08-08 08:04:29 +00006866#if 0
6867
6868/* This code should go into some future Unicode collation support
6869 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00006870 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006871
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006872/* speedy UTF-16 code point order comparison */
6873/* gleaned from: */
6874/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6875
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006876static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006877{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006878 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006879 0, 0, 0, 0, 0, 0, 0, 0,
6880 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006881 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006882};
6883
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884static int
6885unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6886{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006887 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006888
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 Py_UNICODE *s1 = str1->str;
6890 Py_UNICODE *s2 = str2->str;
6891
6892 len1 = str1->length;
6893 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006894
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006896 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006897
6898 c1 = *s1++;
6899 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006900
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 if (c1 > (1<<11) * 26)
6902 c1 += utf16Fixup[c1>>11];
6903 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006904 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006905 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006906
6907 if (c1 != c2)
6908 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006909
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006910 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 }
6912
6913 return (len1 < len2) ? -1 : (len1 != len2);
6914}
6915
Marc-André Lemburge5034372000-08-08 08:04:29 +00006916#else
6917
6918static int
6919unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6920{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006921 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006922
6923 Py_UNICODE *s1 = str1->str;
6924 Py_UNICODE *s2 = str2->str;
6925
6926 len1 = str1->length;
6927 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006928
Marc-André Lemburge5034372000-08-08 08:04:29 +00006929 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006930 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006931
Fredrik Lundh45714e92001-06-26 16:39:36 +00006932 c1 = *s1++;
6933 c2 = *s2++;
6934
6935 if (c1 != c2)
6936 return (c1 < c2) ? -1 : 1;
6937
Marc-André Lemburge5034372000-08-08 08:04:29 +00006938 len1--; len2--;
6939 }
6940
6941 return (len1 < len2) ? -1 : (len1 != len2);
6942}
6943
6944#endif
6945
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006949 if (PyUnicode_Check(left) && PyUnicode_Check(right))
6950 return unicode_compare((PyUnicodeObject *)left,
6951 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00006952 PyErr_Format(PyExc_TypeError,
6953 "Can't compare %.100s and %.100s",
6954 left->ob_type->tp_name,
6955 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 return -1;
6957}
6958
Martin v. Löwis5b222132007-06-10 09:51:05 +00006959int
6960PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6961{
6962 int i;
6963 Py_UNICODE *id;
6964 assert(PyUnicode_Check(uni));
6965 id = PyUnicode_AS_UNICODE(uni);
6966 /* Compare Unicode string and source character set string */
6967 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 if (id[i] != str[i])
6969 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00006970 /* This check keeps Python strings that end in '\0' from comparing equal
6971 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00006972 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006974 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00006976 return 0;
6977}
6978
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006979
Benjamin Peterson29060642009-01-31 22:14:21 +00006980#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00006981 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006982
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00006983PyObject *PyUnicode_RichCompare(PyObject *left,
6984 PyObject *right,
6985 int op)
6986{
6987 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006988
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00006989 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6990 PyObject *v;
6991 if (((PyUnicodeObject *) left)->length !=
6992 ((PyUnicodeObject *) right)->length) {
6993 if (op == Py_EQ) {
6994 Py_INCREF(Py_False);
6995 return Py_False;
6996 }
6997 if (op == Py_NE) {
6998 Py_INCREF(Py_True);
6999 return Py_True;
7000 }
7001 }
7002 if (left == right)
7003 result = 0;
7004 else
7005 result = unicode_compare((PyUnicodeObject *)left,
7006 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007007
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007008 /* Convert the return value to a Boolean */
7009 switch (op) {
7010 case Py_EQ:
7011 v = TEST_COND(result == 0);
7012 break;
7013 case Py_NE:
7014 v = TEST_COND(result != 0);
7015 break;
7016 case Py_LE:
7017 v = TEST_COND(result <= 0);
7018 break;
7019 case Py_GE:
7020 v = TEST_COND(result >= 0);
7021 break;
7022 case Py_LT:
7023 v = TEST_COND(result == -1);
7024 break;
7025 case Py_GT:
7026 v = TEST_COND(result == 1);
7027 break;
7028 default:
7029 PyErr_BadArgument();
7030 return NULL;
7031 }
7032 Py_INCREF(v);
7033 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007035
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007036 Py_INCREF(Py_NotImplemented);
7037 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007038}
7039
Guido van Rossum403d68b2000-03-13 15:55:09 +00007040int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007042{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007043 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007044 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007045
7046 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007047 sub = PyUnicode_FromObject(element);
7048 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 PyErr_Format(PyExc_TypeError,
7050 "'in <string>' requires string as left operand, not %s",
7051 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007052 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007053 }
7054
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055 str = PyUnicode_FromObject(container);
7056 if (!str) {
7057 Py_DECREF(sub);
7058 return -1;
7059 }
7060
7061 result = stringlib_contains_obj(str, sub);
7062
7063 Py_DECREF(str);
7064 Py_DECREF(sub);
7065
Guido van Rossum403d68b2000-03-13 15:55:09 +00007066 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007067}
7068
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069/* Concat to string or Unicode object giving a new Unicode object. */
7070
7071PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073{
7074 PyUnicodeObject *u = NULL, *v = NULL, *w;
7075
7076 /* Coerce the two arguments */
7077 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7078 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7081 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007082 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083
7084 /* Shortcuts */
7085 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 Py_DECREF(v);
7087 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 }
7089 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 Py_DECREF(u);
7091 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 }
7093
7094 /* Concat the two Unicode strings */
7095 w = _PyUnicode_New(u->length + v->length);
7096 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 Py_UNICODE_COPY(w->str, u->str, u->length);
7099 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7100
7101 Py_DECREF(u);
7102 Py_DECREF(v);
7103 return (PyObject *)w;
7104
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 Py_XDECREF(u);
7107 Py_XDECREF(v);
7108 return NULL;
7109}
7110
Walter Dörwald1ab83302007-05-18 17:15:44 +00007111void
7112PyUnicode_Append(PyObject **pleft, PyObject *right)
7113{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007114 PyObject *new;
7115 if (*pleft == NULL)
7116 return;
7117 if (right == NULL || !PyUnicode_Check(*pleft)) {
7118 Py_DECREF(*pleft);
7119 *pleft = NULL;
7120 return;
7121 }
7122 new = PyUnicode_Concat(*pleft, right);
7123 Py_DECREF(*pleft);
7124 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007125}
7126
7127void
7128PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7129{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007130 PyUnicode_Append(pleft, right);
7131 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007132}
7133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007134PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007137Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007138string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007139interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140
7141static PyObject *
7142unicode_count(PyUnicodeObject *self, PyObject *args)
7143{
7144 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007145 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007146 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 PyObject *result;
7148
Guido van Rossumb8872e62000-05-09 14:14:27 +00007149 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151 return NULL;
7152
7153 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007154 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007157
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007158 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007159 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007160 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007161 substring->str, substring->length,
7162 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007163 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164
7165 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007166
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 return result;
7168}
7169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007170PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007173Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007174to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007175handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007176a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7177'xmlcharrefreplace' as well as any other name registered with\n\
7178codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179
7180static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007181unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007183 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 char *encoding = NULL;
7185 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007186 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007187
Benjamin Peterson308d6372009-09-18 21:42:35 +00007188 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7189 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007191 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007192 if (v == NULL)
7193 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007194 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007195 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007196 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007197 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007198 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007199 Py_DECREF(v);
7200 return NULL;
7201 }
7202 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007203
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007205 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007206}
7207
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007208PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210\n\
7211Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007212If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
7214static PyObject*
7215unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7216{
7217 Py_UNICODE *e;
7218 Py_UNICODE *p;
7219 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007220 Py_UNICODE *qe;
7221 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 PyUnicodeObject *u;
7223 int tabsize = 8;
7224
7225 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
Thomas Wouters7e474022000-07-16 12:04:32 +00007228 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007229 i = 0; /* chars up to and including most recent \n or \r */
7230 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7231 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 for (p = self->str; p < e; p++)
7233 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 if (tabsize > 0) {
7235 incr = tabsize - (j % tabsize); /* cannot overflow */
7236 if (j > PY_SSIZE_T_MAX - incr)
7237 goto overflow1;
7238 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007239 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 if (j > PY_SSIZE_T_MAX - 1)
7243 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244 j++;
7245 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 if (i > PY_SSIZE_T_MAX - j)
7247 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007249 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 }
7251 }
7252
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007253 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007255
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256 /* Second pass: create output string and fill it */
7257 u = _PyUnicode_New(i + j);
7258 if (!u)
7259 return NULL;
7260
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007261 j = 0; /* same as in first pass */
7262 q = u->str; /* next output char */
7263 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264
7265 for (p = self->str; p < e; p++)
7266 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 if (tabsize > 0) {
7268 i = tabsize - (j % tabsize);
7269 j += i;
7270 while (i--) {
7271 if (q >= qe)
7272 goto overflow2;
7273 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007274 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007275 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007276 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 else {
7278 if (q >= qe)
7279 goto overflow2;
7280 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007281 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 if (*p == '\n' || *p == '\r')
7283 j = 0;
7284 }
7285
7286 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007287
7288 overflow2:
7289 Py_DECREF(u);
7290 overflow1:
7291 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293}
7294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007295PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297\n\
7298Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007299such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300arguments start and end are interpreted as in slice notation.\n\
7301\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007302Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303
7304static PyObject *
7305unicode_find(PyUnicodeObject *self, PyObject *args)
7306{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007307 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007308 Py_ssize_t start;
7309 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007310 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311
Christian Heimes9cd17752007-11-18 19:35:23 +00007312 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314
Thomas Wouters477c8d52006-05-27 19:21:47 +00007315 result = stringlib_find_slice(
7316 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7317 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7318 start, end
7319 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320
7321 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007322
Christian Heimes217cfd12007-12-02 14:31:20 +00007323 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324}
7325
7326static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007327unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328{
7329 if (index < 0 || index >= self->length) {
7330 PyErr_SetString(PyExc_IndexError, "string index out of range");
7331 return NULL;
7332 }
7333
7334 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7335}
7336
Guido van Rossumc2504932007-09-18 19:42:40 +00007337/* Believe it or not, this produces the same value for ASCII strings
7338 as string_hash(). */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339static long
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007340unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341{
Guido van Rossumc2504932007-09-18 19:42:40 +00007342 Py_ssize_t len;
7343 Py_UNICODE *p;
7344 long x;
7345
7346 if (self->hash != -1)
7347 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007348 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007349 p = self->str;
7350 x = *p << 7;
7351 while (--len >= 0)
7352 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007353 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007354 if (x == -1)
7355 x = -2;
7356 self->hash = x;
7357 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358}
7359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007360PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007363Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364
7365static PyObject *
7366unicode_index(PyUnicodeObject *self, PyObject *args)
7367{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007368 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007369 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007370 Py_ssize_t start;
7371 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372
Christian Heimes9cd17752007-11-18 19:35:23 +00007373 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375
Thomas Wouters477c8d52006-05-27 19:21:47 +00007376 result = stringlib_find_slice(
7377 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7378 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7379 start, end
7380 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381
7382 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007383
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 if (result < 0) {
7385 PyErr_SetString(PyExc_ValueError, "substring not found");
7386 return NULL;
7387 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007388
Christian Heimes217cfd12007-12-02 14:31:20 +00007389 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390}
7391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007392PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007395Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007396at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397
7398static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007399unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400{
7401 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7402 register const Py_UNICODE *e;
7403 int cased;
7404
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405 /* Shortcut for single character strings */
7406 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007409 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007410 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007412
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413 e = p + PyUnicode_GET_SIZE(self);
7414 cased = 0;
7415 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007417
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7419 return PyBool_FromLong(0);
7420 else if (!cased && Py_UNICODE_ISLOWER(ch))
7421 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007423 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424}
7425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007426PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007429Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007430at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431
7432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007433unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434{
7435 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7436 register const Py_UNICODE *e;
7437 int cased;
7438
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 /* Shortcut for single character strings */
7440 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007443 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007444 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007446
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 e = p + PyUnicode_GET_SIZE(self);
7448 cased = 0;
7449 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007451
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7453 return PyBool_FromLong(0);
7454 else if (!cased && Py_UNICODE_ISUPPER(ch))
7455 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007457 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458}
7459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007460PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007463Return True if S is a titlecased string and there is at least one\n\
7464character in S, i.e. upper- and titlecase characters may only\n\
7465follow uncased characters and lowercase characters only cased ones.\n\
7466Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
7468static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007469unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470{
7471 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7472 register const Py_UNICODE *e;
7473 int cased, previous_is_cased;
7474
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 /* Shortcut for single character strings */
7476 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7478 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007480 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007481 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007483
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484 e = p + PyUnicode_GET_SIZE(self);
7485 cased = 0;
7486 previous_is_cased = 0;
7487 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007489
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7491 if (previous_is_cased)
7492 return PyBool_FromLong(0);
7493 previous_is_cased = 1;
7494 cased = 1;
7495 }
7496 else if (Py_UNICODE_ISLOWER(ch)) {
7497 if (!previous_is_cased)
7498 return PyBool_FromLong(0);
7499 previous_is_cased = 1;
7500 cased = 1;
7501 }
7502 else
7503 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007505 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506}
7507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007508PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007511Return True if all characters in S are whitespace\n\
7512and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513
7514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007515unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516{
7517 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7518 register const Py_UNICODE *e;
7519
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 /* Shortcut for single character strings */
7521 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 Py_UNICODE_ISSPACE(*p))
7523 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007525 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007526 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007528
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 e = p + PyUnicode_GET_SIZE(self);
7530 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 if (!Py_UNICODE_ISSPACE(*p))
7532 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007534 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535}
7536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007537PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007539\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007540Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007541and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007542
7543static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007544unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007545{
7546 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7547 register const Py_UNICODE *e;
7548
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007549 /* Shortcut for single character strings */
7550 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 Py_UNICODE_ISALPHA(*p))
7552 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007553
7554 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007555 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007557
7558 e = p + PyUnicode_GET_SIZE(self);
7559 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 if (!Py_UNICODE_ISALPHA(*p))
7561 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007562 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007563 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007564}
7565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007568\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007569Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007570and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007571
7572static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007573unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007574{
7575 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7576 register const Py_UNICODE *e;
7577
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007578 /* Shortcut for single character strings */
7579 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 Py_UNICODE_ISALNUM(*p))
7581 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007582
7583 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007584 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007586
7587 e = p + PyUnicode_GET_SIZE(self);
7588 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 if (!Py_UNICODE_ISALNUM(*p))
7590 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007591 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007592 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007593}
7594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007595PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007598Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007599False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600
7601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007602unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603{
7604 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7605 register const Py_UNICODE *e;
7606
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 /* Shortcut for single character strings */
7608 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 Py_UNICODE_ISDECIMAL(*p))
7610 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007612 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007613 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007615
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 e = p + PyUnicode_GET_SIZE(self);
7617 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 if (!Py_UNICODE_ISDECIMAL(*p))
7619 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007621 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622}
7623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007624PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007627Return True if all characters in S are digits\n\
7628and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629
7630static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007631unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632{
7633 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7634 register const Py_UNICODE *e;
7635
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 /* Shortcut for single character strings */
7637 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 Py_UNICODE_ISDIGIT(*p))
7639 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007641 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007642 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007644
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 e = p + PyUnicode_GET_SIZE(self);
7646 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 if (!Py_UNICODE_ISDIGIT(*p))
7648 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007650 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651}
7652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007653PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007656Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007657False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658
7659static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007660unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661{
7662 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7663 register const Py_UNICODE *e;
7664
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665 /* Shortcut for single character strings */
7666 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 Py_UNICODE_ISNUMERIC(*p))
7668 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007670 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007671 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007673
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 e = p + PyUnicode_GET_SIZE(self);
7675 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 if (!Py_UNICODE_ISNUMERIC(*p))
7677 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007679 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680}
7681
Martin v. Löwis47383402007-08-15 07:32:56 +00007682int
7683PyUnicode_IsIdentifier(PyObject *self)
7684{
7685 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7686 register const Py_UNICODE *e;
7687
7688 /* Special case for empty strings */
7689 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007691
7692 /* PEP 3131 says that the first character must be in
7693 XID_Start and subsequent characters in XID_Continue,
7694 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007695 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007696 letters, digits, underscore). However, given the current
7697 definition of XID_Start and XID_Continue, it is sufficient
7698 to check just for these, except that _ must be allowed
7699 as starting an identifier. */
7700 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7701 return 0;
7702
7703 e = p + PyUnicode_GET_SIZE(self);
7704 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 if (!_PyUnicode_IsXidContinue(*p))
7706 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007707 }
7708 return 1;
7709}
7710
7711PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007713\n\
7714Return True if S is a valid identifier according\n\
7715to the language definition.");
7716
7717static PyObject*
7718unicode_isidentifier(PyObject *self)
7719{
7720 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7721}
7722
Georg Brandl559e5d72008-06-11 18:37:52 +00007723PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007725\n\
7726Return True if all characters in S are considered\n\
7727printable in repr() or S is empty, False otherwise.");
7728
7729static PyObject*
7730unicode_isprintable(PyObject *self)
7731{
7732 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7733 register const Py_UNICODE *e;
7734
7735 /* Shortcut for single character strings */
7736 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7737 Py_RETURN_TRUE;
7738 }
7739
7740 e = p + PyUnicode_GET_SIZE(self);
7741 for (; p < e; p++) {
7742 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7743 Py_RETURN_FALSE;
7744 }
7745 }
7746 Py_RETURN_TRUE;
7747}
7748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007749PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007750 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751\n\
7752Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007753iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754
7755static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007756unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007758 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759}
7760
Martin v. Löwis18e16552006-02-15 17:27:45 +00007761static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762unicode_length(PyUnicodeObject *self)
7763{
7764 return self->length;
7765}
7766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007767PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007770Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007771done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772
7773static PyObject *
7774unicode_ljust(PyUnicodeObject *self, PyObject *args)
7775{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007776 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007777 Py_UNICODE fillchar = ' ';
7778
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007779 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 return NULL;
7781
Tim Peters7a29bd52001-09-12 03:03:31 +00007782 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 Py_INCREF(self);
7784 return (PyObject*) self;
7785 }
7786
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007787 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788}
7789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007790PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007793Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794
7795static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007796unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 return fixup(self, fixlower);
7799}
7800
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings for lstrip/rstrip/strip.  The table is
   never modified, so both the characters and the pointers are const. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7809
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007810/* externally visible for str.strip(unicode) */
7811PyObject *
7812_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7813{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007814 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7815 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7816 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7817 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7818 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007819
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007821
Benjamin Peterson14339b62009-01-31 16:36:08 +00007822 i = 0;
7823 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7825 i++;
7826 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007827 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007828
Benjamin Peterson14339b62009-01-31 16:36:08 +00007829 j = len;
7830 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 do {
7832 j--;
7833 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7834 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007835 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007836
Benjamin Peterson14339b62009-01-31 16:36:08 +00007837 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 Py_INCREF(self);
7839 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007840 }
7841 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007843}
7844
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845
7846static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007847do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007849 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7850 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007851
Benjamin Peterson14339b62009-01-31 16:36:08 +00007852 i = 0;
7853 if (striptype != RIGHTSTRIP) {
7854 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7855 i++;
7856 }
7857 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007858
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 j = len;
7860 if (striptype != LEFTSTRIP) {
7861 do {
7862 j--;
7863 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7864 j++;
7865 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007866
Benjamin Peterson14339b62009-01-31 16:36:08 +00007867 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7868 Py_INCREF(self);
7869 return (PyObject*)self;
7870 }
7871 else
7872 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873}
7874
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007875
7876static PyObject *
7877do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7878{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007879 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007880
Benjamin Peterson14339b62009-01-31 16:36:08 +00007881 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7882 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007883
Benjamin Peterson14339b62009-01-31 16:36:08 +00007884 if (sep != NULL && sep != Py_None) {
7885 if (PyUnicode_Check(sep))
7886 return _PyUnicode_XStrip(self, striptype, sep);
7887 else {
7888 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 "%s arg must be None or str",
7890 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00007891 return NULL;
7892 }
7893 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007894
Benjamin Peterson14339b62009-01-31 16:36:08 +00007895 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007896}
7897
7898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007899PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007901\n\
7902Return a copy of the string S with leading and trailing\n\
7903whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007904If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007905
7906static PyObject *
7907unicode_strip(PyUnicodeObject *self, PyObject *args)
7908{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007909 if (PyTuple_GET_SIZE(args) == 0)
7910 return do_strip(self, BOTHSTRIP); /* Common case */
7911 else
7912 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007913}
7914
7915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007916PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007918\n\
7919Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007920If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007921
7922static PyObject *
7923unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7924{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007925 if (PyTuple_GET_SIZE(args) == 0)
7926 return do_strip(self, LEFTSTRIP); /* Common case */
7927 else
7928 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007929}
7930
7931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007932PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007934\n\
7935Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007936If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007937
7938static PyObject *
7939unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007941 if (PyTuple_GET_SIZE(args) == 0)
7942 return do_strip(self, RIGHTSTRIP); /* Common case */
7943 else
7944 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007945}
7946
7947
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007949unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950{
7951 PyUnicodeObject *u;
7952 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007953 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007954 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955
Georg Brandl222de0f2009-04-12 12:01:50 +00007956 if (len < 1) {
7957 Py_INCREF(unicode_empty);
7958 return (PyObject *)unicode_empty;
7959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960
Tim Peters7a29bd52001-09-12 03:03:31 +00007961 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 /* no repeat, return original string */
7963 Py_INCREF(str);
7964 return (PyObject*) str;
7965 }
Tim Peters8f422462000-09-09 06:13:41 +00007966
7967 /* ensure # of chars needed doesn't overflow int and # of bytes
7968 * needed doesn't overflow size_t
7969 */
7970 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00007971 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00007972 PyErr_SetString(PyExc_OverflowError,
7973 "repeated string is too long");
7974 return NULL;
7975 }
7976 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7977 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7978 PyErr_SetString(PyExc_OverflowError,
7979 "repeated string is too long");
7980 return NULL;
7981 }
7982 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 if (!u)
7984 return NULL;
7985
7986 p = u->str;
7987
Georg Brandl222de0f2009-04-12 12:01:50 +00007988 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007989 Py_UNICODE_FILL(p, str->str[0], len);
7990 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00007991 Py_ssize_t done = str->length; /* number of characters copied this far */
7992 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00007994 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007995 Py_UNICODE_COPY(p+done, p, n);
7996 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 }
7999
8000 return (PyObject*) u;
8001}
8002
8003PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 PyObject *subobj,
8005 PyObject *replobj,
8006 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007{
8008 PyObject *self;
8009 PyObject *str1;
8010 PyObject *str2;
8011 PyObject *result;
8012
8013 self = PyUnicode_FromObject(obj);
8014 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 str1 = PyUnicode_FromObject(subobj);
8017 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 Py_DECREF(self);
8019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 }
8021 str2 = PyUnicode_FromObject(replobj);
8022 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 Py_DECREF(self);
8024 Py_DECREF(str1);
8025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 }
Tim Petersced69f82003-09-16 20:30:58 +00008027 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 (PyUnicodeObject *)str1,
8029 (PyUnicodeObject *)str2,
8030 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 Py_DECREF(self);
8032 Py_DECREF(str1);
8033 Py_DECREF(str2);
8034 return result;
8035}
8036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008037PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008038 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039\n\
8040Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008041old replaced by new. If the optional argument count is\n\
8042given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043
8044static PyObject*
8045unicode_replace(PyUnicodeObject *self, PyObject *args)
8046{
8047 PyUnicodeObject *str1;
8048 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008049 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 PyObject *result;
8051
Martin v. Löwis18e16552006-02-15 17:27:45 +00008052 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 return NULL;
8054 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8055 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008058 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 Py_DECREF(str1);
8060 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008061 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062
8063 result = replace(self, str1, str2, maxcount);
8064
8065 Py_DECREF(str1);
8066 Py_DECREF(str2);
8067 return result;
8068}
8069
8070static
8071PyObject *unicode_repr(PyObject *unicode)
8072{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008073 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008074 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008075 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8076 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8077
8078 /* XXX(nnorwitz): rather than over-allocating, it would be
8079 better to choose a different scheme. Perhaps scan the
8080 first N-chars of the string and allocate based on that size.
8081 */
8082 /* Initial allocation is based on the longest-possible unichr
8083 escape.
8084
8085 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8086 unichr, so in this case it's the longest unichr escape. In
8087 narrow (UTF-16) builds this is five chars per source unichr
8088 since there are two unichrs in the surrogate pair, so in narrow
8089 (UTF-16) builds it's not the longest unichr escape.
8090
8091 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8092 so in the narrow (UTF-16) build case it's the longest unichr
8093 escape.
8094 */
8095
Walter Dörwald1ab83302007-05-18 17:15:44 +00008096 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008098#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008100#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008102#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008104 if (repr == NULL)
8105 return NULL;
8106
Walter Dörwald1ab83302007-05-18 17:15:44 +00008107 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008108
8109 /* Add quote */
8110 *p++ = (findchar(s, size, '\'') &&
8111 !findchar(s, size, '"')) ? '"' : '\'';
8112 while (size-- > 0) {
8113 Py_UNICODE ch = *s++;
8114
8115 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008116 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008117 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008118 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008119 continue;
8120 }
8121
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008123 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008124 *p++ = '\\';
8125 *p++ = 't';
8126 }
8127 else if (ch == '\n') {
8128 *p++ = '\\';
8129 *p++ = 'n';
8130 }
8131 else if (ch == '\r') {
8132 *p++ = '\\';
8133 *p++ = 'r';
8134 }
8135
8136 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008137 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008138 *p++ = '\\';
8139 *p++ = 'x';
8140 *p++ = hexdigits[(ch >> 4) & 0x000F];
8141 *p++ = hexdigits[ch & 0x000F];
8142 }
8143
Georg Brandl559e5d72008-06-11 18:37:52 +00008144 /* Copy ASCII characters as-is */
8145 else if (ch < 0x7F) {
8146 *p++ = ch;
8147 }
8148
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008150 else {
8151 Py_UCS4 ucs = ch;
8152
8153#ifndef Py_UNICODE_WIDE
8154 Py_UNICODE ch2 = 0;
8155 /* Get code point from surrogate pair */
8156 if (size > 0) {
8157 ch2 = *s;
8158 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008162 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008163 size--;
8164 }
8165 }
8166#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008167 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008168 (categories Z* and C* except ASCII space)
8169 */
8170 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8171 /* Map 8-bit characters to '\xhh' */
8172 if (ucs <= 0xff) {
8173 *p++ = '\\';
8174 *p++ = 'x';
8175 *p++ = hexdigits[(ch >> 4) & 0x000F];
8176 *p++ = hexdigits[ch & 0x000F];
8177 }
8178 /* Map 21-bit characters to '\U00xxxxxx' */
8179 else if (ucs >= 0x10000) {
8180 *p++ = '\\';
8181 *p++ = 'U';
8182 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8183 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8184 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8185 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8186 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8187 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8188 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8189 *p++ = hexdigits[ucs & 0x0000000F];
8190 }
8191 /* Map 16-bit characters to '\uxxxx' */
8192 else {
8193 *p++ = '\\';
8194 *p++ = 'u';
8195 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8196 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8197 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8198 *p++ = hexdigits[ucs & 0x000F];
8199 }
8200 }
8201 /* Copy characters as-is */
8202 else {
8203 *p++ = ch;
8204#ifndef Py_UNICODE_WIDE
8205 if (ucs >= 0x10000)
8206 *p++ = ch2;
8207#endif
8208 }
8209 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008210 }
8211 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008212 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008213
8214 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008215 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008216 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217}
8218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008219PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221\n\
8222Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008223such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224arguments start and end are interpreted as in slice notation.\n\
8225\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008226Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227
8228static PyObject *
8229unicode_rfind(PyUnicodeObject *self, PyObject *args)
8230{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008231 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008232 Py_ssize_t start;
8233 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008234 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235
Christian Heimes9cd17752007-11-18 19:35:23 +00008236 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238
Thomas Wouters477c8d52006-05-27 19:21:47 +00008239 result = stringlib_rfind_slice(
8240 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8241 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8242 start, end
8243 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244
8245 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008246
Christian Heimes217cfd12007-12-02 14:31:20 +00008247 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248}
8249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008250PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008253Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254
8255static PyObject *
8256unicode_rindex(PyUnicodeObject *self, PyObject *args)
8257{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008258 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008259 Py_ssize_t start;
8260 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008261 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262
Christian Heimes9cd17752007-11-18 19:35:23 +00008263 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265
Thomas Wouters477c8d52006-05-27 19:21:47 +00008266 result = stringlib_rfind_slice(
8267 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8268 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8269 start, end
8270 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271
8272 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008273
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 if (result < 0) {
8275 PyErr_SetString(PyExc_ValueError, "substring not found");
8276 return NULL;
8277 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008278 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279}
8280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008281PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008284Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008285done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286
8287static PyObject *
8288unicode_rjust(PyUnicodeObject *self, PyObject *args)
8289{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008290 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008291 Py_UNICODE fillchar = ' ';
8292
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008293 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 return NULL;
8295
Tim Peters7a29bd52001-09-12 03:03:31 +00008296 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 Py_INCREF(self);
8298 return (PyObject*) self;
8299 }
8300
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008301 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302}
8303
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 PyObject *sep,
8306 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307{
8308 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008309
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310 s = PyUnicode_FromObject(s);
8311 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008312 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 if (sep != NULL) {
8314 sep = PyUnicode_FromObject(sep);
8315 if (sep == NULL) {
8316 Py_DECREF(s);
8317 return NULL;
8318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 }
8320
8321 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8322
8323 Py_DECREF(s);
8324 Py_XDECREF(sep);
8325 return result;
8326}
8327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008328PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330\n\
8331Return a list of the words in S, using sep as the\n\
8332delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008333splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008334whitespace string is a separator and empty strings are\n\
8335removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336
8337static PyObject*
8338unicode_split(PyUnicodeObject *self, PyObject *args)
8339{
8340 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008341 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342
Martin v. Löwis18e16552006-02-15 17:27:45 +00008343 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 return NULL;
8345
8346 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352}
8353
Thomas Wouters477c8d52006-05-27 19:21:47 +00008354PyObject *
8355PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8356{
8357 PyObject* str_obj;
8358 PyObject* sep_obj;
8359 PyObject* out;
8360
8361 str_obj = PyUnicode_FromObject(str_in);
8362 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008364 sep_obj = PyUnicode_FromObject(sep_in);
8365 if (!sep_obj) {
8366 Py_DECREF(str_obj);
8367 return NULL;
8368 }
8369
8370 out = stringlib_partition(
8371 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8372 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8373 );
8374
8375 Py_DECREF(sep_obj);
8376 Py_DECREF(str_obj);
8377
8378 return out;
8379}
8380
8381
8382PyObject *
8383PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8384{
8385 PyObject* str_obj;
8386 PyObject* sep_obj;
8387 PyObject* out;
8388
8389 str_obj = PyUnicode_FromObject(str_in);
8390 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008392 sep_obj = PyUnicode_FromObject(sep_in);
8393 if (!sep_obj) {
8394 Py_DECREF(str_obj);
8395 return NULL;
8396 }
8397
8398 out = stringlib_rpartition(
8399 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8400 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8401 );
8402
8403 Py_DECREF(sep_obj);
8404 Py_DECREF(str_obj);
8405
8406 return out;
8407}
8408
8409PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008411\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008412Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008413the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008414found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008415
8416static PyObject*
8417unicode_partition(PyUnicodeObject *self, PyObject *separator)
8418{
8419 return PyUnicode_Partition((PyObject *)self, separator);
8420}
8421
8422PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008423 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008424\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008425Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008426the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008427separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008428
8429static PyObject*
8430unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8431{
8432 return PyUnicode_RPartition((PyObject *)self, separator);
8433}
8434
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008435PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 PyObject *sep,
8437 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008438{
8439 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008440
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008441 s = PyUnicode_FromObject(s);
8442 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 if (sep != NULL) {
8445 sep = PyUnicode_FromObject(sep);
8446 if (sep == NULL) {
8447 Py_DECREF(s);
8448 return NULL;
8449 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008450 }
8451
8452 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8453
8454 Py_DECREF(s);
8455 Py_XDECREF(sep);
8456 return result;
8457}
8458
8459PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008461\n\
8462Return a list of the words in S, using sep as the\n\
8463delimiter string, starting at the end of the string and\n\
8464working to the front. If maxsplit is given, at most maxsplit\n\
8465splits are done. If sep is not specified, any whitespace string\n\
8466is a separator.");
8467
8468static PyObject*
8469unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8470{
8471 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008472 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008473
Martin v. Löwis18e16552006-02-15 17:27:45 +00008474 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008475 return NULL;
8476
8477 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008479 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008481 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008483}
8484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008485PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487\n\
8488Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008489Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008490is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491
8492static PyObject*
8493unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8494{
Guido van Rossum86662912000-04-11 15:38:46 +00008495 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496
Guido van Rossum86662912000-04-11 15:38:46 +00008497 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 return NULL;
8499
Guido van Rossum86662912000-04-11 15:38:46 +00008500 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501}
8502
8503static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008504PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505{
Walter Dörwald346737f2007-05-31 10:44:43 +00008506 if (PyUnicode_CheckExact(self)) {
8507 Py_INCREF(self);
8508 return self;
8509 } else
8510 /* Subtype -- return genuine unicode string with the same value. */
8511 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8512 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513}
8514
PyDoc_STRVAR(swapcase__doc__,
             "S.swapcase() -> str\n\
\n\
Return a copy of S with uppercase characters converted to lowercase\n\
and vice versa.");

/* Method implementation for S.swapcase(): runs the generic fixup() driver
   with the fixswapcase callback and returns its result. */
static PyObject*
unicode_swapcase(PyUnicodeObject *self)
{
    return fixup(self, fixswapcase);
}
8526
Georg Brandlceee0772007-11-27 23:48:05 +00008527PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008529\n\
8530Return a translation table usable for str.translate().\n\
8531If there is only one argument, it must be a dictionary mapping Unicode\n\
8532ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008533Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008534If there are two arguments, they must be strings of equal length, and\n\
8535in the resulting dictionary, each character in x will be mapped to the\n\
8536character at the same position in y. If there is a third argument, it\n\
8537must be a string, whose characters will be mapped to None in the result.");
8538
8539static PyObject*
8540unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8541{
8542 PyObject *x, *y = NULL, *z = NULL;
8543 PyObject *new = NULL, *key, *value;
8544 Py_ssize_t i = 0;
8545 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008546
Georg Brandlceee0772007-11-27 23:48:05 +00008547 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8548 return NULL;
8549 new = PyDict_New();
8550 if (!new)
8551 return NULL;
8552 if (y != NULL) {
8553 /* x must be a string too, of equal length */
8554 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8555 if (!PyUnicode_Check(x)) {
8556 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8557 "be a string if there is a second argument");
8558 goto err;
8559 }
8560 if (PyUnicode_GET_SIZE(x) != ylen) {
8561 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8562 "arguments must have equal length");
8563 goto err;
8564 }
8565 /* create entries for translating chars in x to those in y */
8566 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008567 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8568 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008569 if (!key || !value)
8570 goto err;
8571 res = PyDict_SetItem(new, key, value);
8572 Py_DECREF(key);
8573 Py_DECREF(value);
8574 if (res < 0)
8575 goto err;
8576 }
8577 /* create entries for deleting chars in z */
8578 if (z != NULL) {
8579 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008580 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008581 if (!key)
8582 goto err;
8583 res = PyDict_SetItem(new, key, Py_None);
8584 Py_DECREF(key);
8585 if (res < 0)
8586 goto err;
8587 }
8588 }
8589 } else {
8590 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008591 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008592 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8593 "to maketrans it must be a dict");
8594 goto err;
8595 }
8596 /* copy entries into the new dict, converting string keys to int keys */
8597 while (PyDict_Next(x, &i, &key, &value)) {
8598 if (PyUnicode_Check(key)) {
8599 /* convert string keys to integer keys */
8600 PyObject *newkey;
8601 if (PyUnicode_GET_SIZE(key) != 1) {
8602 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8603 "table must be of length 1");
8604 goto err;
8605 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008606 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008607 if (!newkey)
8608 goto err;
8609 res = PyDict_SetItem(new, newkey, value);
8610 Py_DECREF(newkey);
8611 if (res < 0)
8612 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008613 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008614 /* just keep integer keys */
8615 if (PyDict_SetItem(new, key, value) < 0)
8616 goto err;
8617 } else {
8618 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8619 "be strings or integers");
8620 goto err;
8621 }
8622 }
8623 }
8624 return new;
8625 err:
8626 Py_DECREF(new);
8627 return NULL;
8628}
8629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008630PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632\n\
8633Return a copy of the string S, where all characters have been mapped\n\
8634through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008635Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008636Unmapped characters are left untouched. Characters mapped to None\n\
8637are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638
8639static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008640unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641{
Georg Brandlceee0772007-11-27 23:48:05 +00008642 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643}
8644
PyDoc_STRVAR(upper__doc__,
             "S.upper() -> str\n\
\n\
Return a copy of S converted to uppercase.");

/* Method implementation for S.upper(): runs the generic fixup() driver
   with the fixupper callback and returns its result. */
static PyObject*
unicode_upper(PyUnicodeObject *self)
{
    return fixup(self, fixupper);
}
8655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008656PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008659Pad a numeric string S with zeros on the left, to fill a field\n\
8660of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661
8662static PyObject *
8663unicode_zfill(PyUnicodeObject *self, PyObject *args)
8664{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008665 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 PyUnicodeObject *u;
8667
Martin v. Löwis18e16552006-02-15 17:27:45 +00008668 Py_ssize_t width;
8669 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670 return NULL;
8671
8672 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008673 if (PyUnicode_CheckExact(self)) {
8674 Py_INCREF(self);
8675 return (PyObject*) self;
8676 }
8677 else
8678 return PyUnicode_FromUnicode(
8679 PyUnicode_AS_UNICODE(self),
8680 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 }
8683
8684 fill = width - self->length;
8685
8686 u = pad(self, fill, 0, '0');
8687
Walter Dörwald068325e2002-04-15 13:36:47 +00008688 if (u == NULL)
8689 return NULL;
8690
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 if (u->str[fill] == '+' || u->str[fill] == '-') {
8692 /* move sign to beginning of string */
8693 u->str[0] = u->str[fill];
8694 u->str[fill] = '0';
8695 }
8696
8697 return (PyObject*) u;
8698}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699
#if 0
/* Debugging aid (compiled out): expose the current value of numfree. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008708PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008711Return True if S starts with the specified prefix, False otherwise.\n\
8712With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008713With optional end, stop comparing S at that position.\n\
8714prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715
8716static PyObject *
8717unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008720 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008722 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008723 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008724 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008726 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8728 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008729 if (PyTuple_Check(subobj)) {
8730 Py_ssize_t i;
8731 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8732 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008734 if (substring == NULL)
8735 return NULL;
8736 result = tailmatch(self, substring, start, end, -1);
8737 Py_DECREF(substring);
8738 if (result) {
8739 Py_RETURN_TRUE;
8740 }
8741 }
8742 /* nothing matched */
8743 Py_RETURN_FALSE;
8744 }
8745 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008748 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008750 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751}
8752
8753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008754PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008757Return True if S ends with the specified suffix, False otherwise.\n\
8758With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008759With optional end, stop comparing S at that position.\n\
8760suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761
8762static PyObject *
8763unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008766 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008768 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008769 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008770 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008772 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8774 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008775 if (PyTuple_Check(subobj)) {
8776 Py_ssize_t i;
8777 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8778 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008780 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008782 result = tailmatch(self, substring, start, end, +1);
8783 Py_DECREF(substring);
8784 if (result) {
8785 Py_RETURN_TRUE;
8786 }
8787 }
8788 Py_RETURN_FALSE;
8789 }
8790 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008794 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008796 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797}
8798
Eric Smith8c663262007-08-25 02:26:07 +00008799#include "stringlib/string_format.h"
8800
8801PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008803\n\
8804");
8805
Eric Smith4a7d76d2008-05-30 18:10:19 +00008806static PyObject *
8807unicode__format__(PyObject* self, PyObject* args)
8808{
8809 PyObject *format_spec;
8810
8811 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8812 return NULL;
8813
8814 return _PyUnicode_FormatAdvanced(self,
8815 PyUnicode_AS_UNICODE(format_spec),
8816 PyUnicode_GET_SIZE(format_spec));
8817}
8818
Eric Smith8c663262007-08-25 02:26:07 +00008819PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00008821\n\
8822");
8823
8824static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008825unicode__sizeof__(PyUnicodeObject *v)
8826{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00008827 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8828 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008829}
8830
8831PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008833
8834static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008835unicode_getnewargs(PyUnicodeObject *v)
8836{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008837 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008838}
8839
8840
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841static PyMethodDef unicode_methods[] = {
8842
8843 /* Order is according to common usage: often used methods should
8844 appear first, since lookup is done sequentially. */
8845
Benjamin Peterson308d6372009-09-18 21:42:35 +00008846 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008847 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8848 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008849 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008850 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8851 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8852 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8853 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8854 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8855 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8856 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008857 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008858 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8859 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8860 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008861 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008862 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8863 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8864 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008865 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00008866 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008867 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008868 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008869 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8870 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8871 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8872 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8873 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8874 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8875 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8876 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8877 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8878 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8879 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8880 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8881 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8882 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00008883 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00008884 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008885 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00008886 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00008887 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Eric Smithf6db4092007-08-27 23:52:26 +00008888 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8889 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Georg Brandlceee0772007-11-27 23:48:05 +00008890 {"maketrans", (PyCFunction) unicode_maketrans,
8891 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00008892 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008893#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008894 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895#endif
8896
8897#if 0
8898 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008899 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900#endif
8901
Benjamin Peterson14339b62009-01-31 16:36:08 +00008902 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 {NULL, NULL}
8904};
8905
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008906static PyObject *
8907unicode_mod(PyObject *v, PyObject *w)
8908{
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 if (!PyUnicode_Check(v)) {
8910 Py_INCREF(Py_NotImplemented);
8911 return Py_NotImplemented;
8912 }
8913 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008914}
8915
8916static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008917 0, /*nb_add*/
8918 0, /*nb_subtract*/
8919 0, /*nb_multiply*/
8920 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008921};
8922
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008924 (lenfunc) unicode_length, /* sq_length */
8925 PyUnicode_Concat, /* sq_concat */
8926 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8927 (ssizeargfunc) unicode_getitem, /* sq_item */
8928 0, /* sq_slice */
8929 0, /* sq_ass_item */
8930 0, /* sq_ass_slice */
8931 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932};
8933
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008934static PyObject*
8935unicode_subscript(PyUnicodeObject* self, PyObject* item)
8936{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00008937 if (PyIndex_Check(item)) {
8938 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008939 if (i == -1 && PyErr_Occurred())
8940 return NULL;
8941 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008942 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008943 return unicode_getitem(self, i);
8944 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008945 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008946 Py_UNICODE* source_buf;
8947 Py_UNICODE* result_buf;
8948 PyObject* result;
8949
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008950 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008952 return NULL;
8953 }
8954
8955 if (slicelength <= 0) {
8956 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00008957 } else if (start == 0 && step == 1 && slicelength == self->length &&
8958 PyUnicode_CheckExact(self)) {
8959 Py_INCREF(self);
8960 return (PyObject *)self;
8961 } else if (step == 1) {
8962 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008963 } else {
8964 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00008965 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8966 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008967
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 if (result_buf == NULL)
8969 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008970
8971 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8972 result_buf[i] = source_buf[cur];
8973 }
Tim Petersced69f82003-09-16 20:30:58 +00008974
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008975 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00008976 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008977 return result;
8978 }
8979 } else {
8980 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8981 return NULL;
8982 }
8983}
8984
8985static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008986 (lenfunc)unicode_length, /* mp_length */
8987 (binaryfunc)unicode_subscript, /* mp_subscript */
8988 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008989};
8990
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992/* Helpers for PyUnicode_Format() */
8993
8994static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008995getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008997 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 (*p_argidx)++;
9000 if (arglen < 0)
9001 return args;
9002 else
9003 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 }
9005 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007 return NULL;
9008}
9009
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009010/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009012static PyObject *
9013formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009015 char *p;
9016 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009018
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 x = PyFloat_AsDouble(v);
9020 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009021 return NULL;
9022
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009024 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009025
Eric Smith0923d1d2009-04-16 20:16:10 +00009026 p = PyOS_double_to_string(x, type, prec,
9027 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009028 if (p == NULL)
9029 return NULL;
9030 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009031 PyMem_Free(p);
9032 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033}
9034
Tim Peters38fd5b62000-09-21 05:43:11 +00009035static PyObject*
9036formatlong(PyObject *val, int flags, int prec, int type)
9037{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009038 char *buf;
9039 int len;
9040 PyObject *str; /* temporary string object. */
9041 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009042
Benjamin Peterson14339b62009-01-31 16:36:08 +00009043 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9044 if (!str)
9045 return NULL;
9046 result = PyUnicode_FromStringAndSize(buf, len);
9047 Py_DECREF(str);
9048 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009049}
9050
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051static int
9052formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009053 size_t buflen,
9054 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009056 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009057 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009058 if (PyUnicode_GET_SIZE(v) == 1) {
9059 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9060 buf[1] = '\0';
9061 return 1;
9062 }
9063#ifndef Py_UNICODE_WIDE
9064 if (PyUnicode_GET_SIZE(v) == 2) {
9065 /* Decode a valid surrogate pair */
9066 int c0 = PyUnicode_AS_UNICODE(v)[0];
9067 int c1 = PyUnicode_AS_UNICODE(v)[1];
9068 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9069 0xDC00 <= c1 && c1 <= 0xDFFF) {
9070 buf[0] = c0;
9071 buf[1] = c1;
9072 buf[2] = '\0';
9073 return 2;
9074 }
9075 }
9076#endif
9077 goto onError;
9078 }
9079 else {
9080 /* Integer input truncated to a character */
9081 long x;
9082 x = PyLong_AsLong(v);
9083 if (x == -1 && PyErr_Occurred())
9084 goto onError;
9085
9086 if (x < 0 || x > 0x10ffff) {
9087 PyErr_SetString(PyExc_OverflowError,
9088 "%c arg not in range(0x110000)");
9089 return -1;
9090 }
9091
9092#ifndef Py_UNICODE_WIDE
9093 if (x > 0xffff) {
9094 x -= 0x10000;
9095 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9096 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9097 return 2;
9098 }
9099#endif
9100 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009101 buf[1] = '\0';
9102 return 1;
9103 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009104
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009106 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009108 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109}
9110
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009111/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009112 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009113*/
/* The replacement list is fully parenthesized so the cast cannot bind
   to neighbouring tokens at the expansion site (e.g. `sizeof FORMATBUFLEN`). */
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009115
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118{
9119 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009120 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121 int args_owned = 0;
9122 PyUnicodeObject *result = NULL;
9123 PyObject *dict = NULL;
9124 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009125
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 PyErr_BadInternalCall();
9128 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 }
9130 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009131 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133 fmt = PyUnicode_AS_UNICODE(uformat);
9134 fmtcnt = PyUnicode_GET_SIZE(uformat);
9135
9136 reslen = rescnt = fmtcnt + 100;
9137 result = _PyUnicode_New(reslen);
9138 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 res = PyUnicode_AS_UNICODE(result);
9141
9142 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 arglen = PyTuple_Size(args);
9144 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 }
9146 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 arglen = -1;
9148 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009150 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009151 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153
9154 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 if (*fmt != '%') {
9156 if (--rescnt < 0) {
9157 rescnt = fmtcnt + 100;
9158 reslen += rescnt;
9159 if (_PyUnicode_Resize(&result, reslen) < 0)
9160 goto onError;
9161 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9162 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009163 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009165 }
9166 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 /* Got a format specifier */
9168 int flags = 0;
9169 Py_ssize_t width = -1;
9170 int prec = -1;
9171 Py_UNICODE c = '\0';
9172 Py_UNICODE fill;
9173 int isnumok;
9174 PyObject *v = NULL;
9175 PyObject *temp = NULL;
9176 Py_UNICODE *pbuf;
9177 Py_UNICODE sign;
9178 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009179 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 fmt++;
9182 if (*fmt == '(') {
9183 Py_UNICODE *keystart;
9184 Py_ssize_t keylen;
9185 PyObject *key;
9186 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009187
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 if (dict == NULL) {
9189 PyErr_SetString(PyExc_TypeError,
9190 "format requires a mapping");
9191 goto onError;
9192 }
9193 ++fmt;
9194 --fmtcnt;
9195 keystart = fmt;
9196 /* Skip over balanced parentheses */
9197 while (pcount > 0 && --fmtcnt >= 0) {
9198 if (*fmt == ')')
9199 --pcount;
9200 else if (*fmt == '(')
9201 ++pcount;
9202 fmt++;
9203 }
9204 keylen = fmt - keystart - 1;
9205 if (fmtcnt < 0 || pcount > 0) {
9206 PyErr_SetString(PyExc_ValueError,
9207 "incomplete format key");
9208 goto onError;
9209 }
9210#if 0
9211 /* keys are converted to strings using UTF-8 and
9212 then looked up since Python uses strings to hold
9213 variables names etc. in its namespaces and we
9214 wouldn't want to break common idioms. */
9215 key = PyUnicode_EncodeUTF8(keystart,
9216 keylen,
9217 NULL);
9218#else
9219 key = PyUnicode_FromUnicode(keystart, keylen);
9220#endif
9221 if (key == NULL)
9222 goto onError;
9223 if (args_owned) {
9224 Py_DECREF(args);
9225 args_owned = 0;
9226 }
9227 args = PyObject_GetItem(dict, key);
9228 Py_DECREF(key);
9229 if (args == NULL) {
9230 goto onError;
9231 }
9232 args_owned = 1;
9233 arglen = -1;
9234 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009235 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009236 while (--fmtcnt >= 0) {
9237 switch (c = *fmt++) {
9238 case '-': flags |= F_LJUST; continue;
9239 case '+': flags |= F_SIGN; continue;
9240 case ' ': flags |= F_BLANK; continue;
9241 case '#': flags |= F_ALT; continue;
9242 case '0': flags |= F_ZERO; continue;
9243 }
9244 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009245 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 if (c == '*') {
9247 v = getnextarg(args, arglen, &argidx);
9248 if (v == NULL)
9249 goto onError;
9250 if (!PyLong_Check(v)) {
9251 PyErr_SetString(PyExc_TypeError,
9252 "* wants int");
9253 goto onError;
9254 }
9255 width = PyLong_AsLong(v);
9256 if (width == -1 && PyErr_Occurred())
9257 goto onError;
9258 if (width < 0) {
9259 flags |= F_LJUST;
9260 width = -width;
9261 }
9262 if (--fmtcnt >= 0)
9263 c = *fmt++;
9264 }
9265 else if (c >= '0' && c <= '9') {
9266 width = c - '0';
9267 while (--fmtcnt >= 0) {
9268 c = *fmt++;
9269 if (c < '0' || c > '9')
9270 break;
9271 if ((width*10) / 10 != width) {
9272 PyErr_SetString(PyExc_ValueError,
9273 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009274 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009275 }
9276 width = width*10 + (c - '0');
9277 }
9278 }
9279 if (c == '.') {
9280 prec = 0;
9281 if (--fmtcnt >= 0)
9282 c = *fmt++;
9283 if (c == '*') {
9284 v = getnextarg(args, arglen, &argidx);
9285 if (v == NULL)
9286 goto onError;
9287 if (!PyLong_Check(v)) {
9288 PyErr_SetString(PyExc_TypeError,
9289 "* wants int");
9290 goto onError;
9291 }
9292 prec = PyLong_AsLong(v);
9293 if (prec == -1 && PyErr_Occurred())
9294 goto onError;
9295 if (prec < 0)
9296 prec = 0;
9297 if (--fmtcnt >= 0)
9298 c = *fmt++;
9299 }
9300 else if (c >= '0' && c <= '9') {
9301 prec = c - '0';
9302 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009303 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 if (c < '0' || c > '9')
9305 break;
9306 if ((prec*10) / 10 != prec) {
9307 PyErr_SetString(PyExc_ValueError,
9308 "prec too big");
9309 goto onError;
9310 }
9311 prec = prec*10 + (c - '0');
9312 }
9313 }
9314 } /* prec */
9315 if (fmtcnt >= 0) {
9316 if (c == 'h' || c == 'l' || c == 'L') {
9317 if (--fmtcnt >= 0)
9318 c = *fmt++;
9319 }
9320 }
9321 if (fmtcnt < 0) {
9322 PyErr_SetString(PyExc_ValueError,
9323 "incomplete format");
9324 goto onError;
9325 }
9326 if (c != '%') {
9327 v = getnextarg(args, arglen, &argidx);
9328 if (v == NULL)
9329 goto onError;
9330 }
9331 sign = 0;
9332 fill = ' ';
9333 switch (c) {
9334
9335 case '%':
9336 pbuf = formatbuf;
9337 /* presume that buffer length is at least 1 */
9338 pbuf[0] = '%';
9339 len = 1;
9340 break;
9341
9342 case 's':
9343 case 'r':
9344 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009345 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 temp = v;
9347 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009348 }
9349 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009350 if (c == 's')
9351 temp = PyObject_Str(v);
9352 else if (c == 'r')
9353 temp = PyObject_Repr(v);
9354 else
9355 temp = PyObject_ASCII(v);
9356 if (temp == NULL)
9357 goto onError;
9358 if (PyUnicode_Check(temp))
9359 /* nothing to do */;
9360 else {
9361 Py_DECREF(temp);
9362 PyErr_SetString(PyExc_TypeError,
9363 "%s argument has non-string str()");
9364 goto onError;
9365 }
9366 }
9367 pbuf = PyUnicode_AS_UNICODE(temp);
9368 len = PyUnicode_GET_SIZE(temp);
9369 if (prec >= 0 && len > prec)
9370 len = prec;
9371 break;
9372
9373 case 'i':
9374 case 'd':
9375 case 'u':
9376 case 'o':
9377 case 'x':
9378 case 'X':
9379 if (c == 'i')
9380 c = 'd';
9381 isnumok = 0;
9382 if (PyNumber_Check(v)) {
9383 PyObject *iobj=NULL;
9384
9385 if (PyLong_Check(v)) {
9386 iobj = v;
9387 Py_INCREF(iobj);
9388 }
9389 else {
9390 iobj = PyNumber_Long(v);
9391 }
9392 if (iobj!=NULL) {
9393 if (PyLong_Check(iobj)) {
9394 isnumok = 1;
9395 temp = formatlong(iobj, flags, prec, c);
9396 Py_DECREF(iobj);
9397 if (!temp)
9398 goto onError;
9399 pbuf = PyUnicode_AS_UNICODE(temp);
9400 len = PyUnicode_GET_SIZE(temp);
9401 sign = 1;
9402 }
9403 else {
9404 Py_DECREF(iobj);
9405 }
9406 }
9407 }
9408 if (!isnumok) {
9409 PyErr_Format(PyExc_TypeError,
9410 "%%%c format: a number is required, "
9411 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9412 goto onError;
9413 }
9414 if (flags & F_ZERO)
9415 fill = '0';
9416 break;
9417
9418 case 'e':
9419 case 'E':
9420 case 'f':
9421 case 'F':
9422 case 'g':
9423 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009424 temp = formatfloat(v, flags, prec, c);
9425 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009426 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009427 pbuf = PyUnicode_AS_UNICODE(temp);
9428 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 sign = 1;
9430 if (flags & F_ZERO)
9431 fill = '0';
9432 break;
9433
9434 case 'c':
9435 pbuf = formatbuf;
9436 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9437 if (len < 0)
9438 goto onError;
9439 break;
9440
9441 default:
9442 PyErr_Format(PyExc_ValueError,
9443 "unsupported format character '%c' (0x%x) "
9444 "at index %zd",
9445 (31<=c && c<=126) ? (char)c : '?',
9446 (int)c,
9447 (Py_ssize_t)(fmt - 1 -
9448 PyUnicode_AS_UNICODE(uformat)));
9449 goto onError;
9450 }
9451 if (sign) {
9452 if (*pbuf == '-' || *pbuf == '+') {
9453 sign = *pbuf++;
9454 len--;
9455 }
9456 else if (flags & F_SIGN)
9457 sign = '+';
9458 else if (flags & F_BLANK)
9459 sign = ' ';
9460 else
9461 sign = 0;
9462 }
9463 if (width < len)
9464 width = len;
9465 if (rescnt - (sign != 0) < width) {
9466 reslen -= rescnt;
9467 rescnt = width + fmtcnt + 100;
9468 reslen += rescnt;
9469 if (reslen < 0) {
9470 Py_XDECREF(temp);
9471 PyErr_NoMemory();
9472 goto onError;
9473 }
9474 if (_PyUnicode_Resize(&result, reslen) < 0) {
9475 Py_XDECREF(temp);
9476 goto onError;
9477 }
9478 res = PyUnicode_AS_UNICODE(result)
9479 + reslen - rescnt;
9480 }
9481 if (sign) {
9482 if (fill != ' ')
9483 *res++ = sign;
9484 rescnt--;
9485 if (width > len)
9486 width--;
9487 }
9488 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9489 assert(pbuf[0] == '0');
9490 assert(pbuf[1] == c);
9491 if (fill != ' ') {
9492 *res++ = *pbuf++;
9493 *res++ = *pbuf++;
9494 }
9495 rescnt -= 2;
9496 width -= 2;
9497 if (width < 0)
9498 width = 0;
9499 len -= 2;
9500 }
9501 if (width > len && !(flags & F_LJUST)) {
9502 do {
9503 --rescnt;
9504 *res++ = fill;
9505 } while (--width > len);
9506 }
9507 if (fill == ' ') {
9508 if (sign)
9509 *res++ = sign;
9510 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9511 assert(pbuf[0] == '0');
9512 assert(pbuf[1] == c);
9513 *res++ = *pbuf++;
9514 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009515 }
9516 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 Py_UNICODE_COPY(res, pbuf, len);
9518 res += len;
9519 rescnt -= len;
9520 while (--width >= len) {
9521 --rescnt;
9522 *res++ = ' ';
9523 }
9524 if (dict && (argidx < arglen) && c != '%') {
9525 PyErr_SetString(PyExc_TypeError,
9526 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009527 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 goto onError;
9529 }
9530 Py_XDECREF(temp);
9531 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 } /* until end */
9533 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 PyErr_SetString(PyExc_TypeError,
9535 "not all arguments converted during string formatting");
9536 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 }
9538
Thomas Woutersa96affe2006-03-12 00:29:36 +00009539 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 }
9544 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545 return (PyObject *)result;
9546
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548 Py_XDECREF(result);
9549 Py_DECREF(uformat);
9550 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552 }
9553 return NULL;
9554}
9555
Jeremy Hylton938ace62002-07-17 16:30:39 +00009556static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009557unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9558
Tim Peters6d6c1a32001-08-02 04:15:00 +00009559static PyObject *
9560unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9561{
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009563 static char *kwlist[] = {"object", "encoding", "errors", 0};
9564 char *encoding = NULL;
9565 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009566
Benjamin Peterson14339b62009-01-31 16:36:08 +00009567 if (type != &PyUnicode_Type)
9568 return unicode_subtype_new(type, args, kwds);
9569 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009571 return NULL;
9572 if (x == NULL)
9573 return (PyObject *)_PyUnicode_New(0);
9574 if (encoding == NULL && errors == NULL)
9575 return PyObject_Str(x);
9576 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009578}
9579
Guido van Rossume023fe02001-08-30 03:12:59 +00009580static PyObject *
9581unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9582{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009583 PyUnicodeObject *tmp, *pnew;
9584 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009585
Benjamin Peterson14339b62009-01-31 16:36:08 +00009586 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9587 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9588 if (tmp == NULL)
9589 return NULL;
9590 assert(PyUnicode_Check(tmp));
9591 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9592 if (pnew == NULL) {
9593 Py_DECREF(tmp);
9594 return NULL;
9595 }
9596 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9597 if (pnew->str == NULL) {
9598 _Py_ForgetReference((PyObject *)pnew);
9599 PyObject_Del(pnew);
9600 Py_DECREF(tmp);
9601 return PyErr_NoMemory();
9602 }
9603 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9604 pnew->length = n;
9605 pnew->hash = tmp->hash;
9606 Py_DECREF(tmp);
9607 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009608}
9609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009610PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009612\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009613Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009614encoding defaults to the current default string encoding.\n\
9615errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009616
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009617static PyObject *unicode_iter(PyObject *seq);
9618
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009620 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009621 "str", /* tp_name */
9622 sizeof(PyUnicodeObject), /* tp_size */
9623 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009625 (destructor)unicode_dealloc, /* tp_dealloc */
9626 0, /* tp_print */
9627 0, /* tp_getattr */
9628 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009629 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009630 unicode_repr, /* tp_repr */
9631 &unicode_as_number, /* tp_as_number */
9632 &unicode_as_sequence, /* tp_as_sequence */
9633 &unicode_as_mapping, /* tp_as_mapping */
9634 (hashfunc) unicode_hash, /* tp_hash*/
9635 0, /* tp_call*/
9636 (reprfunc) unicode_str, /* tp_str */
9637 PyObject_GenericGetAttr, /* tp_getattro */
9638 0, /* tp_setattro */
9639 0, /* tp_as_buffer */
9640 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009642 unicode_doc, /* tp_doc */
9643 0, /* tp_traverse */
9644 0, /* tp_clear */
9645 PyUnicode_RichCompare, /* tp_richcompare */
9646 0, /* tp_weaklistoffset */
9647 unicode_iter, /* tp_iter */
9648 0, /* tp_iternext */
9649 unicode_methods, /* tp_methods */
9650 0, /* tp_members */
9651 0, /* tp_getset */
9652 &PyBaseObject_Type, /* tp_base */
9653 0, /* tp_dict */
9654 0, /* tp_descr_get */
9655 0, /* tp_descr_set */
9656 0, /* tp_dictoffset */
9657 0, /* tp_init */
9658 0, /* tp_alloc */
9659 unicode_new, /* tp_new */
9660 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661};
9662
9663/* Initialize the Unicode implementation */
9664
Thomas Wouters78890102000-07-22 19:25:51 +00009665void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009667 int i;
9668
Thomas Wouters477c8d52006-05-27 19:21:47 +00009669 /* XXX - move this array to unicodectype.c ? */
9670 Py_UNICODE linebreak[] = {
9671 0x000A, /* LINE FEED */
9672 0x000D, /* CARRIAGE RETURN */
9673 0x001C, /* FILE SEPARATOR */
9674 0x001D, /* GROUP SEPARATOR */
9675 0x001E, /* RECORD SEPARATOR */
9676 0x0085, /* NEXT LINE */
9677 0x2028, /* LINE SEPARATOR */
9678 0x2029, /* PARAGRAPH SEPARATOR */
9679 };
9680
Fred Drakee4315f52000-05-09 19:53:39 +00009681 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009682 free_list = NULL;
9683 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009685 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009686 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009687
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009688 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009689 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009690 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009691 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009692
9693 /* initialize the linebreak bloom filter */
9694 bloom_linebreak = make_bloom_mask(
9695 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9696 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009697
9698 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699}
9700
9701/* Finalize the Unicode implementation */
9702
Christian Heimesa156e092008-02-16 07:38:31 +00009703int
9704PyUnicode_ClearFreeList(void)
9705{
9706 int freelist_size = numfree;
9707 PyUnicodeObject *u;
9708
9709 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009710 PyUnicodeObject *v = u;
9711 u = *(PyUnicodeObject **)u;
9712 if (v->str)
9713 PyObject_DEL(v->str);
9714 Py_XDECREF(v->defenc);
9715 PyObject_Del(v);
9716 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009717 }
9718 free_list = NULL;
9719 assert(numfree == 0);
9720 return freelist_size;
9721}
9722
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723void
Thomas Wouters78890102000-07-22 19:25:51 +00009724_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009726 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009728 Py_XDECREF(unicode_empty);
9729 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009730
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009731 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 if (unicode_latin1[i]) {
9733 Py_DECREF(unicode_latin1[i]);
9734 unicode_latin1[i] = NULL;
9735 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009736 }
Christian Heimesa156e092008-02-16 07:38:31 +00009737 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009739
Walter Dörwald16807132007-05-25 13:52:07 +00009740void
9741PyUnicode_InternInPlace(PyObject **p)
9742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009743 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9744 PyObject *t;
9745 if (s == NULL || !PyUnicode_Check(s))
9746 Py_FatalError(
9747 "PyUnicode_InternInPlace: unicode strings only please!");
9748 /* If it's a subclass, we don't really know what putting
9749 it in the interned dict might do. */
9750 if (!PyUnicode_CheckExact(s))
9751 return;
9752 if (PyUnicode_CHECK_INTERNED(s))
9753 return;
9754 if (interned == NULL) {
9755 interned = PyDict_New();
9756 if (interned == NULL) {
9757 PyErr_Clear(); /* Don't leave an exception */
9758 return;
9759 }
9760 }
9761 /* It might be that the GetItem call fails even
9762 though the key is present in the dictionary,
9763 namely when this happens during a stack overflow. */
9764 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009765 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009766 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009767
Benjamin Peterson29060642009-01-31 22:14:21 +00009768 if (t) {
9769 Py_INCREF(t);
9770 Py_DECREF(*p);
9771 *p = t;
9772 return;
9773 }
Walter Dörwald16807132007-05-25 13:52:07 +00009774
Benjamin Peterson14339b62009-01-31 16:36:08 +00009775 PyThreadState_GET()->recursion_critical = 1;
9776 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9777 PyErr_Clear();
9778 PyThreadState_GET()->recursion_critical = 0;
9779 return;
9780 }
9781 PyThreadState_GET()->recursion_critical = 0;
9782 /* The two references in interned are not counted by refcnt.
9783 The deallocator will take care of this */
9784 Py_REFCNT(s) -= 2;
9785 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +00009786}
9787
9788void
9789PyUnicode_InternImmortal(PyObject **p)
9790{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009791 PyUnicode_InternInPlace(p);
9792 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9793 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9794 Py_INCREF(*p);
9795 }
Walter Dörwald16807132007-05-25 13:52:07 +00009796}
9797
9798PyObject *
9799PyUnicode_InternFromString(const char *cp)
9800{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009801 PyObject *s = PyUnicode_FromString(cp);
9802 if (s == NULL)
9803 return NULL;
9804 PyUnicode_InternInPlace(&s);
9805 return s;
Walter Dörwald16807132007-05-25 13:52:07 +00009806}
9807
9808void _Py_ReleaseInternedUnicodeStrings(void)
9809{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009810 PyObject *keys;
9811 PyUnicodeObject *s;
9812 Py_ssize_t i, n;
9813 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +00009814
Benjamin Peterson14339b62009-01-31 16:36:08 +00009815 if (interned == NULL || !PyDict_Check(interned))
9816 return;
9817 keys = PyDict_Keys(interned);
9818 if (keys == NULL || !PyList_Check(keys)) {
9819 PyErr_Clear();
9820 return;
9821 }
Walter Dörwald16807132007-05-25 13:52:07 +00009822
Benjamin Peterson14339b62009-01-31 16:36:08 +00009823 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9824 detector, interned unicode strings are not forcibly deallocated;
9825 rather, we give them their stolen references back, and then clear
9826 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +00009827
Benjamin Peterson14339b62009-01-31 16:36:08 +00009828 n = PyList_GET_SIZE(keys);
9829 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +00009830 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009831 for (i = 0; i < n; i++) {
9832 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9833 switch (s->state) {
9834 case SSTATE_NOT_INTERNED:
9835 /* XXX Shouldn't happen */
9836 break;
9837 case SSTATE_INTERNED_IMMORTAL:
9838 Py_REFCNT(s) += 1;
9839 immortal_size += s->length;
9840 break;
9841 case SSTATE_INTERNED_MORTAL:
9842 Py_REFCNT(s) += 2;
9843 mortal_size += s->length;
9844 break;
9845 default:
9846 Py_FatalError("Inconsistent interned string state.");
9847 }
9848 s->state = SSTATE_NOT_INTERNED;
9849 }
9850 fprintf(stderr, "total size of all interned strings: "
9851 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9852 "mortal/immortal\n", mortal_size, immortal_size);
9853 Py_DECREF(keys);
9854 PyDict_Clear(interned);
9855 Py_DECREF(interned);
9856 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +00009857}
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009858
9859
9860/********************* Unicode Iterator **************************/
9861
9862typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009863 PyObject_HEAD
9864 Py_ssize_t it_index;
9865 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009866} unicodeiterobject;
9867
9868static void
9869unicodeiter_dealloc(unicodeiterobject *it)
9870{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009871 _PyObject_GC_UNTRACK(it);
9872 Py_XDECREF(it->it_seq);
9873 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009874}
9875
9876static int
9877unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9878{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009879 Py_VISIT(it->it_seq);
9880 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009881}
9882
9883static PyObject *
9884unicodeiter_next(unicodeiterobject *it)
9885{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009886 PyUnicodeObject *seq;
9887 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009888
Benjamin Peterson14339b62009-01-31 16:36:08 +00009889 assert(it != NULL);
9890 seq = it->it_seq;
9891 if (seq == NULL)
9892 return NULL;
9893 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009894
Benjamin Peterson14339b62009-01-31 16:36:08 +00009895 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9896 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +00009897 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009898 if (item != NULL)
9899 ++it->it_index;
9900 return item;
9901 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009902
Benjamin Peterson14339b62009-01-31 16:36:08 +00009903 Py_DECREF(seq);
9904 it->it_seq = NULL;
9905 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009906}
9907
9908static PyObject *
9909unicodeiter_len(unicodeiterobject *it)
9910{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009911 Py_ssize_t len = 0;
9912 if (it->it_seq)
9913 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9914 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009915}
9916
9917PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9918
9919static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009920 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00009921 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +00009922 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009923};
9924
9925PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009926 PyVarObject_HEAD_INIT(&PyType_Type, 0)
9927 "str_iterator", /* tp_name */
9928 sizeof(unicodeiterobject), /* tp_basicsize */
9929 0, /* tp_itemsize */
9930 /* methods */
9931 (destructor)unicodeiter_dealloc, /* tp_dealloc */
9932 0, /* tp_print */
9933 0, /* tp_getattr */
9934 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009935 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009936 0, /* tp_repr */
9937 0, /* tp_as_number */
9938 0, /* tp_as_sequence */
9939 0, /* tp_as_mapping */
9940 0, /* tp_hash */
9941 0, /* tp_call */
9942 0, /* tp_str */
9943 PyObject_GenericGetAttr, /* tp_getattro */
9944 0, /* tp_setattro */
9945 0, /* tp_as_buffer */
9946 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9947 0, /* tp_doc */
9948 (traverseproc)unicodeiter_traverse, /* tp_traverse */
9949 0, /* tp_clear */
9950 0, /* tp_richcompare */
9951 0, /* tp_weaklistoffset */
9952 PyObject_SelfIter, /* tp_iter */
9953 (iternextfunc)unicodeiter_next, /* tp_iternext */
9954 unicodeiter_methods, /* tp_methods */
9955 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009956};
9957
9958static PyObject *
9959unicode_iter(PyObject *seq)
9960{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009961 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009962
Benjamin Peterson14339b62009-01-31 16:36:08 +00009963 if (!PyUnicode_Check(seq)) {
9964 PyErr_BadInternalCall();
9965 return NULL;
9966 }
9967 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9968 if (it == NULL)
9969 return NULL;
9970 it->it_index = 0;
9971 Py_INCREF(seq);
9972 it->it_seq = (PyUnicodeObject *)seq;
9973 _PyObject_GC_TRACK(it);
9974 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009975}
9976
Martin v. Löwis5b222132007-06-10 09:51:05 +00009977size_t
9978Py_UNICODE_strlen(const Py_UNICODE *u)
9979{
9980 int res = 0;
9981 while(*u++)
9982 res++;
9983 return res;
9984}
9985
9986Py_UNICODE*
9987Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9988{
9989 Py_UNICODE *u = s1;
9990 while ((*u++ = *s2++));
9991 return s1;
9992}
9993
9994Py_UNICODE*
9995Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9996{
9997 Py_UNICODE *u = s1;
9998 while ((*u++ = *s2++))
9999 if (n-- == 0)
10000 break;
10001 return s1;
10002}
10003
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010004Py_UNICODE*
10005Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10006{
10007 Py_UNICODE *u1 = s1;
10008 u1 += Py_UNICODE_strlen(u1);
10009 Py_UNICODE_strcpy(u1, s2);
10010 return s1;
10011}
10012
Martin v. Löwis5b222132007-06-10 09:51:05 +000010013int
10014Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10015{
10016 while (*s1 && *s2 && *s1 == *s2)
10017 s1++, s2++;
10018 if (*s1 && *s2)
10019 return (*s1 < *s2) ? -1 : +1;
10020 if (*s1)
10021 return 1;
10022 if (*s2)
10023 return -1;
10024 return 0;
10025}
10026
Victor Stinneref8d95c2010-08-16 22:03:11 +000010027int
10028Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10029{
10030 register Py_UNICODE u1, u2;
10031 for (; n != 0; n--) {
10032 u1 = *s1;
10033 u2 = *s2;
10034 if (u1 != u2)
10035 return (u1 < u2) ? -1 : +1;
10036 if (u1 == '\0')
10037 return 0;
10038 s1++;
10039 s2++;
10040 }
10041 return 0;
10042}
10043
Martin v. Löwis5b222132007-06-10 09:51:05 +000010044Py_UNICODE*
10045Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10046{
10047 const Py_UNICODE *p;
10048 for (p = s; *p; p++)
10049 if (*p == c)
10050 return (Py_UNICODE*)p;
10051 return NULL;
10052}
10053
Victor Stinner331ea922010-08-10 16:37:20 +000010054Py_UNICODE*
10055Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10056{
10057 const Py_UNICODE *p;
10058 p = s + Py_UNICODE_strlen(s);
10059 while (p != s) {
10060 p--;
10061 if (*p == c)
10062 return (Py_UNICODE*)p;
10063 }
10064 return NULL;
10065}
10066
Victor Stinner71133ff2010-09-01 23:43:53 +000010067Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010068PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010069{
10070 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10071 Py_UNICODE *copy;
10072 Py_ssize_t size;
10073
10074 /* Ensure we won't overflow the size. */
10075 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10076 PyErr_NoMemory();
10077 return NULL;
10078 }
10079 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10080 size *= sizeof(Py_UNICODE);
10081 copy = PyMem_Malloc(size);
10082 if (copy == NULL) {
10083 PyErr_NoMemory();
10084 return NULL;
10085 }
10086 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10087 return copy;
10088}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010090#ifdef __cplusplus
10091}
10092#endif