blob: f5c09dd7f8eab74f95a2b74dec0c821112355ec2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Guido van Rossumd57fd912000-03-10 22:53:23 +000046#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000047#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000049#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000050#include <windows.h>
51#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000052
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Walter Dörwald16807132007-05-25 13:52:07 +000096/* This dictionary holds all interned unicode strings. Note that references
97 to strings in this dictionary are *not* counted in the string's ob_refcnt.
98 When the interned string reaches a refcnt of 0 the string deallocation
99 function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000102 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000103*/
104static PyObject *interned;
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000107static PyUnicodeObject *free_list;
108static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000109
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114 shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Fast detection of the most frequent whitespace characters.
   Entry i is 1 when ASCII code point i is treated as whitespace and 0
   otherwise; the table covers code points 0x00 through 0x7F. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
147
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149 PyObject **errorHandler,const char *encoding, const char *reason,
150 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
Victor Stinner31be90b2010-04-22 19:38:16 +0000153static void raise_encode_exception(PyObject **exceptionObject,
154 const char *encoding,
155 const Py_UNICODE *unicode, Py_ssize_t size,
156 Py_ssize_t startpos, Py_ssize_t endpos,
157 const char *reason);
158
/* Same for linebreaks: entry i is 1 when ASCII code point i is treated
   as a line break and 0 otherwise (code points 0x00 through 0x7F). */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000188Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000189PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000191#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000193#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000197#endif
198}
199
Thomas Wouters477c8d52006-05-27 19:21:47 +0000200/* --- Bloom Filters ----------------------------------------------------- */
201
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
Antoine Pitrouf068f942010-01-13 14:19:12 +0000208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
Thomas Wouters477c8d52006-05-27 19:21:47 +0000218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
Antoine Pitrouf068f942010-01-13 14:19:12 +0000222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000224
Benjamin Peterson29060642009-01-31 22:14:21 +0000225#define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252}
253
/* True when `chr` is one of the `setlen` characters in `set`.  The cheap
   BLOOM() test on `mask` runs first and short-circuits the linear scan in
   unicode_member().  The whole expansion is parenthesized so the macro
   behaves as a single operand (e.g. under `!` or next to `||`). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257/* --- Unicode Object ----------------------------------------------------- */
258
259static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000260int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000261 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262{
263 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000264
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000267 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000268
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000272
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000278 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 return -1;
280 }
281
Thomas Wouters477c8d52006-05-27 19:21:47 +0000282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000288 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000289 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000290 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000291 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
Benjamin Peterson29060642009-01-31 22:14:21 +0000298 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000300 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000301 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 }
303 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305 return 0;
306}
307
308/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000309 Ux0000 terminated; some code (e.g. new_identifier)
310 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000313 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000314
315*/
316
317static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319{
320 register PyUnicodeObject *unicode;
321
Thomas Wouters477c8d52006-05-27 19:21:47 +0000322 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 if (length == 0 && unicode_empty != NULL) {
324 Py_INCREF(unicode_empty);
325 return unicode_empty;
326 }
327
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000328 /* Ensure we won't overflow the size. */
329 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330 return (PyUnicodeObject *)PyErr_NoMemory();
331 }
332
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000334 if (free_list) {
335 unicode = free_list;
336 free_list = *(PyUnicodeObject **)unicode;
337 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000338 if (unicode->str) {
339 /* Keep-Alive optimization: we only upsize the buffer,
340 never downsize it. */
341 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000342 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000343 PyObject_DEL(unicode->str);
344 unicode->str = NULL;
345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000346 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000348 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000350 }
351 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 }
353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000354 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000355 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000356 if (unicode == NULL)
357 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000358 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 }
361
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000363 PyErr_NoMemory();
364 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000365 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000366 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000367 * the caller fails before initializing str -- unicode_resize()
368 * reads str[0], and the Keep-Alive optimization can keep memory
369 * allocated for str alive across a call to unicode_dealloc(unicode).
370 * We don't want unicode_resize to read uninitialized memory in
371 * that case.
372 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000373 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000374 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000377 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000378 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000380
Benjamin Peterson29060642009-01-31 22:14:21 +0000381 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000382 /* XXX UNREF/NEWREF interface should be more symmetrical */
383 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000384 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000385 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387}
388
389static
Guido van Rossum9475a232001-10-05 20:51:39 +0000390void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000391{
Walter Dörwald16807132007-05-25 13:52:07 +0000392 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_NOT_INTERNED:
394 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000395
Benjamin Peterson29060642009-01-31 22:14:21 +0000396 case SSTATE_INTERNED_MORTAL:
397 /* revive dead object temporarily for DelItem */
398 Py_REFCNT(unicode) = 3;
399 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400 Py_FatalError(
401 "deletion of interned string failed");
402 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 case SSTATE_INTERNED_IMMORTAL:
405 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406
Benjamin Peterson29060642009-01-31 22:14:21 +0000407 default:
408 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000409 }
410
Guido van Rossum604ddf82001-12-06 20:03:56 +0000411 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000412 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000413 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000414 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415 PyObject_DEL(unicode->str);
416 unicode->str = NULL;
417 unicode->length = 0;
418 }
419 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000420 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000421 }
422 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000423 *(PyUnicodeObject **)unicode = free_list;
424 free_list = unicode;
425 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000426 }
427 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000428 PyObject_DEL(unicode->str);
429 Py_XDECREF(unicode->defenc);
430 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000431 }
432}
433
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000436{
437 register PyUnicodeObject *v;
438
439 /* Argument checks */
440 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000441 PyErr_BadInternalCall();
442 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000443 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000444 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000445 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000446 PyErr_BadInternalCall();
447 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000448 }
449
450 /* Resizing unicode_empty and single character objects is not
451 possible since these are being shared. We simply return a fresh
452 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000453 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000454 (v == unicode_empty || v->length == 1)) {
455 PyUnicodeObject *w = _PyUnicode_New(length);
456 if (w == NULL)
457 return -1;
458 Py_UNICODE_COPY(w->str, v->str,
459 length < v->length ? length : v->length);
460 Py_DECREF(*unicode);
461 *unicode = w;
462 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000463 }
464
465 /* Note that we don't have to modify *unicode for unshared Unicode
466 objects, since we can modify them in-place. */
467 return unicode_resize(v, length);
468}
469
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000474
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000476 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477{
478 PyUnicodeObject *unicode;
479
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects. */
482 if (u != NULL) {
483
Benjamin Peterson29060642009-01-31 22:14:21 +0000484 /* Optimization for empty strings */
485 if (size == 0 && unicode_empty != NULL) {
486 Py_INCREF(unicode_empty);
487 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000489
490 /* Single character Unicode objects in the Latin-1 range are
491 shared when using this constructor */
492 if (size == 1 && *u < 256) {
493 unicode = unicode_latin1[*u];
494 if (!unicode) {
495 unicode = _PyUnicode_New(1);
496 if (!unicode)
497 return NULL;
498 unicode->str[0] = *u;
499 unicode_latin1[*u] = unicode;
500 }
501 Py_INCREF(unicode);
502 return (PyObject *)unicode;
503 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Guido van Rossumd57fd912000-03-10 22:53:23 +0000506 unicode = _PyUnicode_New(size);
507 if (!unicode)
508 return NULL;
509
510 /* Copy the Unicode data into the new object */
511 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000512 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513
514 return (PyObject *)unicode;
515}
516
Walter Dörwaldd2034312007-05-18 16:29:38 +0000517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000518{
519 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000520
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 if (size < 0) {
522 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000523 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 return NULL;
525 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000526
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000527 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000528 some optimizations which share commonly used objects.
529 Also, this means the input must be UTF-8, so fall back to the
530 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000531 if (u != NULL) {
532
Benjamin Peterson29060642009-01-31 22:14:21 +0000533 /* Optimization for empty strings */
534 if (size == 0 && unicode_empty != NULL) {
535 Py_INCREF(unicode_empty);
536 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000537 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000538
539 /* Single characters are shared when using this constructor.
540 Restrict to ASCII, since the input must be UTF-8. */
541 if (size == 1 && Py_CHARMASK(*u) < 128) {
542 unicode = unicode_latin1[Py_CHARMASK(*u)];
543 if (!unicode) {
544 unicode = _PyUnicode_New(1);
545 if (!unicode)
546 return NULL;
547 unicode->str[0] = Py_CHARMASK(*u);
548 unicode_latin1[Py_CHARMASK(*u)] = unicode;
549 }
550 Py_INCREF(unicode);
551 return (PyObject *)unicode;
552 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000553
554 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 }
556
Walter Dörwald55507312007-05-18 13:12:10 +0000557 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 if (!unicode)
559 return NULL;
560
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 return (PyObject *)unicode;
562}
563
Walter Dörwaldd2034312007-05-18 16:29:38 +0000564PyObject *PyUnicode_FromString(const char *u)
565{
566 size_t size = strlen(u);
567 if (size > PY_SSIZE_T_MAX) {
568 PyErr_SetString(PyExc_OverflowError, "input too long");
569 return NULL;
570 }
571
572 return PyUnicode_FromStringAndSize(u, size);
573}
574
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575#ifdef HAVE_WCHAR_H
576
Mark Dickinson081dfee2009-03-18 14:47:41 +0000577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584 to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587 Py_ssize_t size)
588{
589 PyUnicodeObject *unicode;
590 register Py_ssize_t i;
591 Py_ssize_t alloc;
592 const wchar_t *orig_w;
593
594 if (w == NULL) {
595 if (size == 0)
596 return PyUnicode_FromStringAndSize(NULL, 0);
597 PyErr_BadInternalCall();
598 return NULL;
599 }
600
601 if (size == -1) {
602 size = wcslen(w);
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633}
634
635#else
636
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639{
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000643 if (size == 0)
644 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000645 PyErr_BadInternalCall();
646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000647 }
648
Martin v. Löwis790465f2008-04-05 20:41:37 +0000649 if (size == -1) {
650 size = wcslen(w);
651 }
652
Guido van Rossumd57fd912000-03-10 22:53:23 +0000653 unicode = _PyUnicode_New(size);
654 if (!unicode)
655 return NULL;
656
657 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000660#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000662 register Py_UNICODE *u;
663 register Py_ssize_t i;
664 u = PyUnicode_AS_UNICODE(unicode);
665 for (i = size; i > 0; i--)
666 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000667 }
668#endif
669
670 return (PyObject *)unicode;
671}
672
Mark Dickinson081dfee2009-03-18 14:47:41 +0000673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
Walter Dörwald346737f2007-05-31 10:44:43 +0000677static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000680{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000681 *fmt++ = '%';
682 if (width) {
683 if (zeropad)
684 *fmt++ = '0';
685 fmt += sprintf(fmt, "%d", width);
686 }
687 if (precision)
688 fmt += sprintf(fmt, ".%d", precision);
689 if (longflag)
690 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000691 else if (longlongflag) {
692 /* longlongflag should only ever be nonzero on machines with
693 HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695 char *f = PY_FORMAT_LONG_LONG;
696 while (*f)
697 *fmt++ = *f++;
698#else
699 /* we shouldn't ever get here */
700 assert(0);
701 *fmt++ = 'l';
702#endif
703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000704 else if (size_tflag) {
705 char *f = PY_FORMAT_SIZE_T;
706 while (*f)
707 *fmt++ = *f++;
708 }
709 *fmt++ = c;
710 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000711}
712
Walter Dörwaldd2034312007-05-18 16:29:38 +0000713#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
714
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000715/* size of fixed-size buffer for formatting single arguments */
716#define ITEM_BUFFER_LEN 21
717/* maximum number of characters required for output of %ld. 21 characters
718 allows for 64-bit integers (in decimal) and an optional sign. */
719#define MAX_LONG_CHARS 21
720/* maximum number of characters required for output of %lld.
721 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
722 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
Walter Dörwaldd2034312007-05-18 16:29:38 +0000725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000728 va_list count;
729 Py_ssize_t callcount = 0;
730 PyObject **callresults = NULL;
731 PyObject **callresult = NULL;
732 Py_ssize_t n = 0;
733 int width = 0;
734 int precision = 0;
735 int zeropad;
736 const char* f;
737 Py_UNICODE *s;
738 PyObject *string;
739 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000740 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000741 /* use abuffer instead of buffer, if we need more space
742 * (which can happen if there's a format specifier with width). */
743 char *abuffer = NULL;
744 char *realbuffer;
745 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000746 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000747 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000748
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000749 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000750 /* step 1: count the number of %S/%R/%A/%s format specifications
751 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000754 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000755 if (*f == '%') {
756 if (*(f+1)=='%')
757 continue;
758 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759 ++callcount;
760 while (ISDIGIT((unsigned)*f))
761 width = (width*10) + *f++ - '0';
762 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763 ;
764 if (*f == 's')
765 ++callcount;
766 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000767 else if (128 <= (unsigned char)*f) {
768 PyErr_Format(PyExc_ValueError,
769 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000770 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000771 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000772 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 }
775 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000776 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000777 if (callcount) {
778 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
779 if (!callresults) {
780 PyErr_NoMemory();
781 return NULL;
782 }
783 callresult = callresults;
784 }
785 /* step 3: figure out how large a buffer we need */
786 for (f = format; *f; f++) {
787 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000788#ifdef HAVE_LONG_LONG
789 int longlongflag = 0;
790#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 const char* p = f;
792 width = 0;
793 while (ISDIGIT((unsigned)*f))
794 width = (width*10) + *f++ - '0';
795 while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
796 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000797
Benjamin Peterson14339b62009-01-31 16:36:08 +0000798 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
799 * they don't affect the amount of space we reserve.
800 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000801 if (*f == 'l') {
802 if (f[1] == 'd' || f[1] == 'u') {
803 ++f;
804 }
805#ifdef HAVE_LONG_LONG
806 else if (f[1] == 'l' &&
807 (f[2] == 'd' || f[2] == 'u')) {
808 longlongflag = 1;
809 f += 2;
810 }
811#endif
812 }
813 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000814 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000816
Benjamin Peterson14339b62009-01-31 16:36:08 +0000817 switch (*f) {
818 case 'c':
819 (void)va_arg(count, int);
820 /* fall through... */
821 case '%':
822 n++;
823 break;
824 case 'd': case 'u': case 'i': case 'x':
825 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000826#ifdef HAVE_LONG_LONG
827 if (longlongflag) {
828 if (width < MAX_LONG_LONG_CHARS)
829 width = MAX_LONG_LONG_CHARS;
830 }
831 else
832#endif
833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
834 including sign. Decimal takes the most space. This
835 isn't enough for octal. If a width is specified we
836 need more (which we allocate later). */
837 if (width < MAX_LONG_CHARS)
838 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000839 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000840 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000841 if (abuffersize < width)
842 abuffersize = width;
843 break;
844 case 's':
845 {
846 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000847 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000848 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
849 if (!str)
850 goto fail;
851 n += PyUnicode_GET_SIZE(str);
852 /* Remember the str and switch to the next slot */
853 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000854 break;
855 }
856 case 'U':
857 {
858 PyObject *obj = va_arg(count, PyObject *);
859 assert(obj && PyUnicode_Check(obj));
860 n += PyUnicode_GET_SIZE(obj);
861 break;
862 }
863 case 'V':
864 {
865 PyObject *obj = va_arg(count, PyObject *);
866 const char *str = va_arg(count, const char *);
867 assert(obj || str);
868 assert(!obj || PyUnicode_Check(obj));
869 if (obj)
870 n += PyUnicode_GET_SIZE(obj);
871 else
872 n += strlen(str);
873 break;
874 }
875 case 'S':
876 {
877 PyObject *obj = va_arg(count, PyObject *);
878 PyObject *str;
879 assert(obj);
880 str = PyObject_Str(obj);
881 if (!str)
882 goto fail;
883 n += PyUnicode_GET_SIZE(str);
884 /* Remember the str and switch to the next slot */
885 *callresult++ = str;
886 break;
887 }
888 case 'R':
889 {
890 PyObject *obj = va_arg(count, PyObject *);
891 PyObject *repr;
892 assert(obj);
893 repr = PyObject_Repr(obj);
894 if (!repr)
895 goto fail;
896 n += PyUnicode_GET_SIZE(repr);
897 /* Remember the repr and switch to the next slot */
898 *callresult++ = repr;
899 break;
900 }
901 case 'A':
902 {
903 PyObject *obj = va_arg(count, PyObject *);
904 PyObject *ascii;
905 assert(obj);
906 ascii = PyObject_ASCII(obj);
907 if (!ascii)
908 goto fail;
909 n += PyUnicode_GET_SIZE(ascii);
910 /* Remember the repr and switch to the next slot */
911 *callresult++ = ascii;
912 break;
913 }
914 case 'p':
915 (void) va_arg(count, int);
916 /* maximum 64-bit pointer representation:
917 * 0xffffffffffffffff
918 * so 19 characters is enough.
919 * XXX I count 18 -- what's the extra for?
920 */
921 n += 19;
922 break;
923 default:
924 /* if we stumble upon an unknown
925 formatting code, copy the rest of
926 the format string to the output
927 string. (we cannot just skip the
928 code, since there's no way to know
929 what's in the argument list) */
930 n += strlen(p);
931 goto expand;
932 }
933 } else
934 n++;
935 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000936 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000937 if (abuffersize > ITEM_BUFFER_LEN) {
938 /* add 1 for sprintf's trailing null byte */
939 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000940 if (!abuffer) {
941 PyErr_NoMemory();
942 goto fail;
943 }
944 realbuffer = abuffer;
945 }
946 else
947 realbuffer = buffer;
948 /* step 4: fill the buffer */
949 /* Since we've analyzed how much space we need for the worst case,
950 we don't have to resize the string.
951 There can be no errors beyond this point. */
952 string = PyUnicode_FromUnicode(NULL, n);
953 if (!string)
954 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000955
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 s = PyUnicode_AS_UNICODE(string);
957 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000958
Benjamin Peterson14339b62009-01-31 16:36:08 +0000959 for (f = format; *f; f++) {
960 if (*f == '%') {
961 const char* p = f++;
962 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000963 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 int size_tflag = 0;
965 zeropad = (*f == '0');
966 /* parse the width.precision part */
967 width = 0;
968 while (ISDIGIT((unsigned)*f))
969 width = (width*10) + *f++ - '0';
970 precision = 0;
971 if (*f == '.') {
972 f++;
973 while (ISDIGIT((unsigned)*f))
974 precision = (precision*10) + *f++ - '0';
975 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000976 /* Handle %ld, %lu, %lld and %llu. */
977 if (*f == 'l') {
978 if (f[1] == 'd' || f[1] == 'u') {
979 longflag = 1;
980 ++f;
981 }
982#ifdef HAVE_LONG_LONG
983 else if (f[1] == 'l' &&
984 (f[2] == 'd' || f[2] == 'u')) {
985 longlongflag = 1;
986 f += 2;
987 }
988#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000989 }
990 /* handle the size_t flag. */
991 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
992 size_tflag = 1;
993 ++f;
994 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000995
Benjamin Peterson14339b62009-01-31 16:36:08 +0000996 switch (*f) {
997 case 'c':
998 *s++ = va_arg(vargs, int);
999 break;
1000 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001001 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1002 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001003 if (longflag)
1004 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001005#ifdef HAVE_LONG_LONG
1006 else if (longlongflag)
1007 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1008#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001009 else if (size_tflag)
1010 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1011 else
1012 sprintf(realbuffer, fmt, va_arg(vargs, int));
1013 appendstring(realbuffer);
1014 break;
1015 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001016 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1017 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 if (longflag)
1019 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001020#ifdef HAVE_LONG_LONG
1021 else if (longlongflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs,
1023 unsigned PY_LONG_LONG));
1024#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001025 else if (size_tflag)
1026 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1027 else
1028 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1029 appendstring(realbuffer);
1030 break;
1031 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001032 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 sprintf(realbuffer, fmt, va_arg(vargs, int));
1034 appendstring(realbuffer);
1035 break;
1036 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001037 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 's':
1042 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001043 /* unused, since we already have the result */
1044 (void) va_arg(vargs, char *);
1045 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1046 PyUnicode_GET_SIZE(*callresult));
1047 s += PyUnicode_GET_SIZE(*callresult);
1048 /* We're done with the unicode()/repr() => forget it */
1049 Py_DECREF(*callresult);
1050 /* switch to next unicode()/repr() result */
1051 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001052 break;
1053 }
1054 case 'U':
1055 {
1056 PyObject *obj = va_arg(vargs, PyObject *);
1057 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1058 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1059 s += size;
1060 break;
1061 }
1062 case 'V':
1063 {
1064 PyObject *obj = va_arg(vargs, PyObject *);
1065 const char *str = va_arg(vargs, const char *);
1066 if (obj) {
1067 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1068 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1069 s += size;
1070 } else {
1071 appendstring(str);
1072 }
1073 break;
1074 }
1075 case 'S':
1076 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001077 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 {
1079 Py_UNICODE *ucopy;
1080 Py_ssize_t usize;
1081 Py_ssize_t upos;
1082 /* unused, since we already have the result */
1083 (void) va_arg(vargs, PyObject *);
1084 ucopy = PyUnicode_AS_UNICODE(*callresult);
1085 usize = PyUnicode_GET_SIZE(*callresult);
1086 for (upos = 0; upos<usize;)
1087 *s++ = ucopy[upos++];
1088 /* We're done with the unicode()/repr() => forget it */
1089 Py_DECREF(*callresult);
1090 /* switch to next unicode()/repr() result */
1091 ++callresult;
1092 break;
1093 }
1094 case 'p':
1095 sprintf(buffer, "%p", va_arg(vargs, void*));
1096 /* %p is ill-defined: ensure leading 0x. */
1097 if (buffer[1] == 'X')
1098 buffer[1] = 'x';
1099 else if (buffer[1] != 'x') {
1100 memmove(buffer+2, buffer, strlen(buffer)+1);
1101 buffer[0] = '0';
1102 buffer[1] = 'x';
1103 }
1104 appendstring(buffer);
1105 break;
1106 case '%':
1107 *s++ = '%';
1108 break;
1109 default:
1110 appendstring(p);
1111 goto end;
1112 }
Victor Stinner1205f272010-09-11 00:54:47 +00001113 }
Victor Stinner1205f272010-09-11 00:54:47 +00001114 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001115 *s++ = *f;
1116 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001117
Benjamin Peterson29060642009-01-31 22:14:21 +00001118 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001119 if (callresults)
1120 PyObject_Free(callresults);
1121 if (abuffer)
1122 PyObject_Free(abuffer);
1123 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1124 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001125 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001126 if (callresults) {
1127 PyObject **callresult2 = callresults;
1128 while (callresult2 < callresult) {
1129 Py_DECREF(*callresult2);
1130 ++callresult2;
1131 }
1132 PyObject_Free(callresults);
1133 }
1134 if (abuffer)
1135 PyObject_Free(abuffer);
1136 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001137}
1138
1139#undef appendstring
1140
1141PyObject *
1142PyUnicode_FromFormat(const char *format, ...)
1143{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 PyObject* ret;
1145 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001146
1147#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001148 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001149#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001150 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001151#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001152 ret = PyUnicode_FromFormatV(format, vargs);
1153 va_end(vargs);
1154 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
Victor Stinner5593d8a2010-10-02 11:11:27 +00001157/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1158 convert a Unicode object to a wide character string.
1159
1160 - If w is NULL: return the number of wide characters (including the nul
1161 character) required to convert the unicode object. Ignore size argument.
1162
1163 - Otherwise: return the number of wide characters (excluding the nul
1164 character) written into w. Write at most size wide characters (including
1165 the nul character). */
1166static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001167unicode_aswidechar(PyUnicodeObject *unicode,
1168 wchar_t *w,
1169 Py_ssize_t size)
1170{
1171#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001172 Py_ssize_t res;
1173 if (w != NULL) {
1174 res = PyUnicode_GET_SIZE(unicode);
1175 if (size > res)
1176 size = res + 1;
1177 else
1178 res = size;
1179 memcpy(w, unicode->str, size * sizeof(wchar_t));
1180 return res;
1181 }
1182 else
1183 return PyUnicode_GET_SIZE(unicode) + 1;
1184#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1185 register const Py_UNICODE *u;
1186 const Py_UNICODE *uend;
1187 const wchar_t *worig, *wend;
1188 Py_ssize_t nchar;
1189
Victor Stinner137c34c2010-09-29 10:25:54 +00001190 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001191 uend = u + PyUnicode_GET_SIZE(unicode);
1192 if (w != NULL) {
1193 worig = w;
1194 wend = w + size;
1195 while (u != uend && w != wend) {
1196 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1197 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1198 {
1199 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1200 u += 2;
1201 }
1202 else {
1203 *w = *u;
1204 u++;
1205 }
1206 w++;
1207 }
1208 if (w != wend)
1209 *w = L'\0';
1210 return w - worig;
1211 }
1212 else {
1213 nchar = 1; /* nul character at the end */
1214 while (u != uend) {
1215 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1216 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1217 u += 2;
1218 else
1219 u++;
1220 nchar++;
1221 }
1222 }
1223 return nchar;
1224#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1225 register Py_UNICODE *u, *uend, ordinal;
1226 register Py_ssize_t i;
1227 wchar_t *worig, *wend;
1228 Py_ssize_t nchar;
1229
1230 u = PyUnicode_AS_UNICODE(unicode);
1231 uend = u + PyUnicode_GET_SIZE(u);
1232 if (w != NULL) {
1233 worig = w;
1234 wend = w + size;
1235 while (u != uend && w != wend) {
1236 ordinal = *u;
1237 if (ordinal > 0xffff) {
1238 ordinal -= 0x10000;
1239 *w++ = 0xD800 | (ordinal >> 10);
1240 *w++ = 0xDC00 | (ordinal & 0x3FF);
1241 }
1242 else
1243 *w++ = ordinal;
1244 u++;
1245 }
1246 if (w != wend)
1247 *w = 0;
1248 return w - worig;
1249 }
1250 else {
1251 nchar = 1; /* nul character */
1252 while (u != uend) {
1253 if (*u > 0xffff)
1254 nchar += 2;
1255 else
1256 nchar++;
1257 u++;
1258 }
1259 return nchar;
1260 }
1261#else
1262# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001263#endif
1264}
1265
1266Py_ssize_t
1267PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1268 wchar_t *w,
1269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001270{
1271 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001272 PyErr_BadInternalCall();
1273 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00001275 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276}
1277
Victor Stinner137c34c2010-09-29 10:25:54 +00001278wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001279PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001280 Py_ssize_t *size)
1281{
1282 wchar_t* buffer;
1283 Py_ssize_t buflen;
1284
1285 if (unicode == NULL) {
1286 PyErr_BadInternalCall();
1287 return NULL;
1288 }
1289
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001290 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001291 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001292 PyErr_NoMemory();
1293 return NULL;
1294 }
1295
Victor Stinner137c34c2010-09-29 10:25:54 +00001296 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1297 if (buffer == NULL) {
1298 PyErr_NoMemory();
1299 return NULL;
1300 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001301 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001302 if (size != NULL)
1303 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001304 return buffer;
1305}
1306
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307#endif
1308
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001309PyObject *PyUnicode_FromOrdinal(int ordinal)
1310{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001311 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001312
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001313 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001314 PyErr_SetString(PyExc_ValueError,
1315 "chr() arg not in range(0x110000)");
1316 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001317 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001318
1319#ifndef Py_UNICODE_WIDE
1320 if (ordinal > 0xffff) {
1321 ordinal -= 0x10000;
1322 s[0] = 0xD800 | (ordinal >> 10);
1323 s[1] = 0xDC00 | (ordinal & 0x3FF);
1324 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001325 }
1326#endif
1327
Hye-Shik Chang40574832004-04-06 07:24:51 +00001328 s[0] = (Py_UNICODE)ordinal;
1329 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001330}
1331
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332PyObject *PyUnicode_FromObject(register PyObject *obj)
1333{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001334 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001335 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001336 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001337 Py_INCREF(obj);
1338 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001339 }
1340 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001341 /* For a Unicode subtype that's not a Unicode object,
1342 return a true Unicode object with the same data. */
1343 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1344 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001345 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001346 PyErr_Format(PyExc_TypeError,
1347 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001348 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001349 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001350}
1351
1352PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 const char *encoding,
1354 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001355{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001356 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001357 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001358
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001360 PyErr_BadInternalCall();
1361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001363
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001364 /* Decoding bytes objects is the most common case and should be fast */
1365 if (PyBytes_Check(obj)) {
1366 if (PyBytes_GET_SIZE(obj) == 0) {
1367 Py_INCREF(unicode_empty);
1368 v = (PyObject *) unicode_empty;
1369 }
1370 else {
1371 v = PyUnicode_Decode(
1372 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1373 encoding, errors);
1374 }
1375 return v;
1376 }
1377
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001378 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001379 PyErr_SetString(PyExc_TypeError,
1380 "decoding str is not supported");
1381 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001382 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001383
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001384 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1385 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1386 PyErr_Format(PyExc_TypeError,
1387 "coercing to str: need bytes, bytearray "
1388 "or buffer-like object, %.80s found",
1389 Py_TYPE(obj)->tp_name);
1390 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001391 }
Tim Petersced69f82003-09-16 20:30:58 +00001392
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001393 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001395 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 }
Tim Petersced69f82003-09-16 20:30:58 +00001397 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001398 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001399
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001400 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001401 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001402}
1403
/* Copy encoding into lower, converting upper case letters to lower case
   and replacing '_' with '-' (so that e.g. "UTF_8" becomes "utf-8").

   lower has room for lower_len characters including the terminating nul.
   Return 0 if encoding does not fit (it is longer than lower_len-1; lower
   is then left unterminated), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating nul */
    char *limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        if (dst == limit)
            return 0;
        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1436
1437PyObject *PyUnicode_Decode(const char *s,
1438 Py_ssize_t size,
1439 const char *encoding,
1440 const char *errors)
1441{
1442 PyObject *buffer = NULL, *unicode;
1443 Py_buffer info;
1444 char lower[11]; /* Enough for any encoding shortcut */
1445
1446 if (encoding == NULL)
1447 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001448
1449 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001450 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1451 if (strcmp(lower, "utf-8") == 0)
1452 return PyUnicode_DecodeUTF8(s, size, errors);
1453 else if ((strcmp(lower, "latin-1") == 0) ||
1454 (strcmp(lower, "iso-8859-1") == 0))
1455 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001456#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001457 else if (strcmp(lower, "mbcs") == 0)
1458 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001459#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001460 else if (strcmp(lower, "ascii") == 0)
1461 return PyUnicode_DecodeASCII(s, size, errors);
1462 else if (strcmp(lower, "utf-16") == 0)
1463 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1464 else if (strcmp(lower, "utf-32") == 0)
1465 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467
1468 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001469 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001470 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001471 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001472 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473 if (buffer == NULL)
1474 goto onError;
1475 unicode = PyCodec_Decode(buffer, encoding, errors);
1476 if (unicode == NULL)
1477 goto onError;
1478 if (!PyUnicode_Check(unicode)) {
1479 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001480 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001481 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 Py_DECREF(unicode);
1483 goto onError;
1484 }
1485 Py_DECREF(buffer);
1486 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001487
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 Py_XDECREF(buffer);
1490 return NULL;
1491}
1492
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001493PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1494 const char *encoding,
1495 const char *errors)
1496{
1497 PyObject *v;
1498
1499 if (!PyUnicode_Check(unicode)) {
1500 PyErr_BadArgument();
1501 goto onError;
1502 }
1503
1504 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001505 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001506
1507 /* Decode via the codec registry */
1508 v = PyCodec_Decode(unicode, encoding, errors);
1509 if (v == NULL)
1510 goto onError;
1511 return v;
1512
Benjamin Peterson29060642009-01-31 22:14:21 +00001513 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001514 return NULL;
1515}
1516
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001517PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1518 const char *encoding,
1519 const char *errors)
1520{
1521 PyObject *v;
1522
1523 if (!PyUnicode_Check(unicode)) {
1524 PyErr_BadArgument();
1525 goto onError;
1526 }
1527
1528 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001529 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001530
1531 /* Decode via the codec registry */
1532 v = PyCodec_Decode(unicode, encoding, errors);
1533 if (v == NULL)
1534 goto onError;
1535 if (!PyUnicode_Check(v)) {
1536 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001537 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001538 Py_TYPE(v)->tp_name);
1539 Py_DECREF(v);
1540 goto onError;
1541 }
1542 return v;
1543
Benjamin Peterson29060642009-01-31 22:14:21 +00001544 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001545 return NULL;
1546}
1547
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001549 Py_ssize_t size,
1550 const char *encoding,
1551 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001552{
1553 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001554
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 unicode = PyUnicode_FromUnicode(s, size);
1556 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1559 Py_DECREF(unicode);
1560 return v;
1561}
1562
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001563PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1564 const char *encoding,
1565 const char *errors)
1566{
1567 PyObject *v;
1568
1569 if (!PyUnicode_Check(unicode)) {
1570 PyErr_BadArgument();
1571 goto onError;
1572 }
1573
1574 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001576
1577 /* Encode via the codec registry */
1578 v = PyCodec_Encode(unicode, encoding, errors);
1579 if (v == NULL)
1580 goto onError;
1581 return v;
1582
Benjamin Peterson29060642009-01-31 22:14:21 +00001583 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001584 return NULL;
1585}
1586
Victor Stinnerae6265f2010-05-15 16:27:27 +00001587PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1588{
Victor Stinner313a1202010-06-11 23:56:51 +00001589 if (Py_FileSystemDefaultEncoding) {
1590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1591 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1592 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1593 PyUnicode_GET_SIZE(unicode),
1594 NULL);
1595#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001596 return PyUnicode_AsEncodedString(unicode,
1597 Py_FileSystemDefaultEncoding,
1598 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001599 }
1600 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001601 /* locale encoding with surrogateescape */
1602 wchar_t *wchar;
1603 char *bytes;
1604 PyObject *bytes_obj;
1605
1606 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1607 if (wchar == NULL)
1608 return NULL;
1609 bytes = _Py_wchar2char(wchar);
1610 PyMem_Free(wchar);
1611 if (bytes == NULL)
1612 return NULL;
1613
1614 bytes_obj = PyBytes_FromString(bytes);
1615 PyMem_Free(bytes);
1616 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001617 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00001618}
1619
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1621 const char *encoding,
1622 const char *errors)
1623{
1624 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001625 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001626
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627 if (!PyUnicode_Check(unicode)) {
1628 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 }
Fred Drakee4315f52000-05-09 19:53:39 +00001631
Tim Petersced69f82003-09-16 20:30:58 +00001632 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001633 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001634
1635 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001636 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1637 if (strcmp(lower, "utf-8") == 0)
1638 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1639 PyUnicode_GET_SIZE(unicode),
1640 errors);
1641 else if ((strcmp(lower, "latin-1") == 0) ||
1642 (strcmp(lower, "iso-8859-1") == 0))
1643 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1644 PyUnicode_GET_SIZE(unicode),
1645 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001646#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001647 else if (strcmp(lower, "mbcs") == 0)
1648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001651#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001652 else if (strcmp(lower, "ascii") == 0)
1653 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1654 PyUnicode_GET_SIZE(unicode),
1655 errors);
1656 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001657 /* During bootstrap, we may need to find the encodings
1658 package, to load the file system encoding, and require the
1659 file system encoding in order to load the encodings
1660 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001661
Victor Stinner59e62db2010-05-15 13:14:32 +00001662 Break out of this dependency by assuming that the path to
1663 the encodings module is ASCII-only. XXX could try wcstombs
1664 instead, if the file system encoding is the locale's
1665 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001666 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001667 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1668 !PyThreadState_GET()->interp->codecs_initialized)
1669 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1670 PyUnicode_GET_SIZE(unicode),
1671 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672
1673 /* Encode via the codec registry */
1674 v = PyCodec_Encode(unicode, encoding, errors);
1675 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001676 return NULL;
1677
1678 /* The normal path */
1679 if (PyBytes_Check(v))
1680 return v;
1681
1682 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001683 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001684 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001685 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001686
1687 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1688 "encoder %s returned bytearray instead of bytes",
1689 encoding);
1690 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001691 Py_DECREF(v);
1692 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001693 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001694
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001695 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1696 Py_DECREF(v);
1697 return b;
1698 }
1699
1700 PyErr_Format(PyExc_TypeError,
1701 "encoder did not return a bytes object (type=%.400s)",
1702 Py_TYPE(v)->tp_name);
1703 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001704 return NULL;
1705}
1706
1707PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1708 const char *encoding,
1709 const char *errors)
1710{
1711 PyObject *v;
1712
1713 if (!PyUnicode_Check(unicode)) {
1714 PyErr_BadArgument();
1715 goto onError;
1716 }
1717
1718 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001719 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001720
1721 /* Encode via the codec registry */
1722 v = PyCodec_Encode(unicode, encoding, errors);
1723 if (v == NULL)
1724 goto onError;
1725 if (!PyUnicode_Check(v)) {
1726 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001727 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001728 Py_TYPE(v)->tp_name);
1729 Py_DECREF(v);
1730 goto onError;
1731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001733
Benjamin Peterson29060642009-01-31 22:14:21 +00001734 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 return NULL;
1736}
1737
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001738PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001739 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001740{
1741 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001742 if (v)
1743 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001744 if (errors != NULL)
1745 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001746 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001747 PyUnicode_GET_SIZE(unicode),
1748 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001749 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001750 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001751 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001752 return v;
1753}
1754
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001755PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001756PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001757 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001758 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1759}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001760
Christian Heimes5894ba72007-11-04 11:43:14 +00001761PyObject*
1762PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1763{
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001764 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1765 can be undefined. If it is case, decode using UTF-8. The following assumes
1766 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1767 bootstrapping process where the codecs aren't ready yet.
1768 */
1769 if (Py_FileSystemDefaultEncoding) {
1770#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Christian Heimes5894ba72007-11-04 11:43:14 +00001771 if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
Victor Stinner313a1202010-06-11 23:56:51 +00001772 return PyUnicode_DecodeMBCS(s, size, NULL);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001773 }
1774#elif defined(__APPLE__)
Christian Heimes5894ba72007-11-04 11:43:14 +00001775 if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001776 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001777 }
1778#endif
1779 return PyUnicode_Decode(s, size,
1780 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001781 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001782 }
1783 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001784 /* locale encoding with surrogateescape */
1785 wchar_t *wchar;
1786 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001787 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001788
1789 if (s[size] != '\0' || size != strlen(s)) {
1790 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1791 return NULL;
1792 }
1793
Victor Stinner168e1172010-10-16 23:16:16 +00001794 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001795 if (wchar == NULL)
1796 return NULL;
1797
Victor Stinner168e1172010-10-16 23:16:16 +00001798 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001799 PyMem_Free(wchar);
1800 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001801 }
1802}
1803
Martin v. Löwis011e8422009-05-05 04:43:17 +00001804
1805int
1806PyUnicode_FSConverter(PyObject* arg, void* addr)
1807{
1808 PyObject *output = NULL;
1809 Py_ssize_t size;
1810 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001811 if (arg == NULL) {
1812 Py_DECREF(*(PyObject**)addr);
1813 return 1;
1814 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001815 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001816 output = arg;
1817 Py_INCREF(output);
1818 }
1819 else {
1820 arg = PyUnicode_FromObject(arg);
1821 if (!arg)
1822 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001823 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001824 Py_DECREF(arg);
1825 if (!output)
1826 return 0;
1827 if (!PyBytes_Check(output)) {
1828 Py_DECREF(output);
1829 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1830 return 0;
1831 }
1832 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001833 size = PyBytes_GET_SIZE(output);
1834 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001835 if (size != strlen(data)) {
1836 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1837 Py_DECREF(output);
1838 return 0;
1839 }
1840 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001841 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001842}
1843
1844
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001845int
1846PyUnicode_FSDecoder(PyObject* arg, void* addr)
1847{
1848 PyObject *output = NULL;
1849 Py_ssize_t size;
1850 void *data;
1851 if (arg == NULL) {
1852 Py_DECREF(*(PyObject**)addr);
1853 return 1;
1854 }
1855 if (PyUnicode_Check(arg)) {
1856 output = arg;
1857 Py_INCREF(output);
1858 }
1859 else {
1860 arg = PyBytes_FromObject(arg);
1861 if (!arg)
1862 return 0;
1863 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1864 PyBytes_GET_SIZE(arg));
1865 Py_DECREF(arg);
1866 if (!output)
1867 return 0;
1868 if (!PyUnicode_Check(output)) {
1869 Py_DECREF(output);
1870 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1871 return 0;
1872 }
1873 }
1874 size = PyUnicode_GET_SIZE(output);
1875 data = PyUnicode_AS_UNICODE(output);
1876 if (size != Py_UNICODE_strlen(data)) {
1877 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1878 Py_DECREF(output);
1879 return 0;
1880 }
1881 *(PyObject**)addr = output;
1882 return Py_CLEANUP_SUPPORTED;
1883}
1884
1885
Martin v. Löwis5b222132007-06-10 09:51:05 +00001886char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001887_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001888{
Christian Heimesf3863112007-11-22 07:46:41 +00001889 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001890 if (!PyUnicode_Check(unicode)) {
1891 PyErr_BadArgument();
1892 return NULL;
1893 }
Christian Heimesf3863112007-11-22 07:46:41 +00001894 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1895 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001896 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001897 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001898 *psize = PyBytes_GET_SIZE(bytes);
1899 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001900}
1901
1902char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001903_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001904{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001905 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001906}
1907
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1909{
1910 if (!PyUnicode_Check(unicode)) {
1911 PyErr_BadArgument();
1912 goto onError;
1913 }
1914 return PyUnicode_AS_UNICODE(unicode);
1915
Benjamin Peterson29060642009-01-31 22:14:21 +00001916 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 return NULL;
1918}
1919
Martin v. Löwis18e16552006-02-15 17:27:45 +00001920Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921{
1922 if (!PyUnicode_Check(unicode)) {
1923 PyErr_BadArgument();
1924 goto onError;
1925 }
1926 return PyUnicode_GET_SIZE(unicode);
1927
Benjamin Peterson29060642009-01-31 22:14:21 +00001928 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 return -1;
1930}
1931
/* Name of the encoding used whenever a caller passes encoding == NULL.
   The value is fixed: "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1936
Victor Stinner554f3f02010-06-16 23:33:54 +00001937/* create or adjust a UnicodeDecodeError */
1938static void
1939make_decode_exception(PyObject **exceptionObject,
1940 const char *encoding,
1941 const char *input, Py_ssize_t length,
1942 Py_ssize_t startpos, Py_ssize_t endpos,
1943 const char *reason)
1944{
1945 if (*exceptionObject == NULL) {
1946 *exceptionObject = PyUnicodeDecodeError_Create(
1947 encoding, input, length, startpos, endpos, reason);
1948 }
1949 else {
1950 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1951 goto onError;
1952 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1953 goto onError;
1954 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1955 goto onError;
1956 }
1957 return;
1958
1959onError:
1960 Py_DECREF(*exceptionObject);
1961 *exceptionObject = NULL;
1962}
1963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001964/* error handling callback helper:
1965 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001966 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001967 and adjust various state variables.
1968 return 0 on success, -1 on error
1969*/
1970
1971static
1972int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001973 const char *encoding, const char *reason,
1974 const char **input, const char **inend, Py_ssize_t *startinpos,
1975 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1976 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001977{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001978 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001979
1980 PyObject *restuple = NULL;
1981 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001982 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00001983 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001984 Py_ssize_t requiredsize;
1985 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001986 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00001987 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001988 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 int res = -1;
1990
1991 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001992 *errorHandler = PyCodec_LookupError(errors);
1993 if (*errorHandler == NULL)
1994 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001995 }
1996
Victor Stinner554f3f02010-06-16 23:33:54 +00001997 make_decode_exception(exceptionObject,
1998 encoding,
1999 *input, *inend - *input,
2000 *startinpos, *endinpos,
2001 reason);
2002 if (*exceptionObject == NULL)
2003 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004
2005 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2006 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002007 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002009 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002010 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 }
2012 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002013 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002014
2015 /* Copy back the bytes variables, which might have been modified by the
2016 callback */
2017 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2018 if (!inputobj)
2019 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002020 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002021 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002022 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002023 *input = PyBytes_AS_STRING(inputobj);
2024 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002025 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002026 /* we can DECREF safely, as the exception has another reference,
2027 so the object won't go away. */
2028 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002032 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002033 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2034 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002036
2037 /* need more space? (at least enough for what we
2038 have+the replacement+the rest of the string (starting
2039 at the new input position), so we won't have to check space
2040 when there are no errors in the rest of the string) */
2041 repptr = PyUnicode_AS_UNICODE(repunicode);
2042 repsize = PyUnicode_GET_SIZE(repunicode);
2043 requiredsize = *outpos + repsize + insize-newpos;
2044 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 if (requiredsize<2*outsize)
2046 requiredsize = 2*outsize;
2047 if (_PyUnicode_Resize(output, requiredsize) < 0)
2048 goto onError;
2049 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 }
2051 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002052 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 Py_UNICODE_COPY(*outptr, repptr, repsize);
2054 *outptr += repsize;
2055 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 /* we made it! */
2058 res = 0;
2059
Benjamin Peterson29060642009-01-31 22:14:21 +00002060 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 Py_XDECREF(restuple);
2062 return res;
2063}
2064
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002065/* --- UTF-7 Codec -------------------------------------------------------- */
2066
Antoine Pitrou244651a2009-05-04 18:56:13 +00002067/* See RFC2152 for details. We encode conservatively and decode liberally. */
2068
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  All of them may evaluate their argument more than once, so
   it must be free of side effects. */

/* IS_BASE64: non-zero iff c is one of A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of base-64 character c.  The
   result is only meaningful when IS_BASE64(c) holds; anything that is
   not a letter, digit or '+' maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: a byte with this value found in UTF-7 input outside a
 * shift sequence stands for itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which opens a base-64
 * section. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
2099
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the character code (0..127).
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; values outside 1..127 are never direct.
 * directO  -- non-zero to emit Set O characters unencoded.
 * directWS -- non-zero to emit whitespace unencoded.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (e.g. "a || b") keep their meaning.  c is evaluated more
 * than once and must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002145
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002146PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002147 Py_ssize_t size,
2148 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002149{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002150 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2151}
2152
Antoine Pitrou244651a2009-05-04 18:56:13 +00002153/* The decoder. The only state we preserve is our read position,
2154 * i.e. how many characters we have consumed. So if we end in the
2155 * middle of a shift sequence we have to back off the read position
2156 * and the output to the beginning of the sequence, otherwise we lose
2157 * all the shift state (seen bits, number of bits seen, high
2158 * surrogate). */
2159
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002160PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002161 Py_ssize_t size,
2162 const char *errors,
2163 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002164{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002165 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002166 Py_ssize_t startinpos;
2167 Py_ssize_t endinpos;
2168 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002169 const char *e;
2170 PyUnicodeObject *unicode;
2171 Py_UNICODE *p;
2172 const char *errmsg = "";
2173 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002174 Py_UNICODE *shiftOutStart;
2175 unsigned int base64bits = 0;
2176 unsigned long base64buffer = 0;
2177 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002178 PyObject *errorHandler = NULL;
2179 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002180
2181 unicode = _PyUnicode_New(size);
2182 if (!unicode)
2183 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002184 if (size == 0) {
2185 if (consumed)
2186 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002187 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002188 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002189
2190 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002191 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002192 e = s + size;
2193
2194 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002195 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002196 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002197 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002198
Antoine Pitrou244651a2009-05-04 18:56:13 +00002199 if (inShift) { /* in a base-64 section */
2200 if (IS_BASE64(ch)) { /* consume a base-64 character */
2201 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2202 base64bits += 6;
2203 s++;
2204 if (base64bits >= 16) {
2205 /* we have enough bits for a UTF-16 value */
2206 Py_UNICODE outCh = (Py_UNICODE)
2207 (base64buffer >> (base64bits-16));
2208 base64bits -= 16;
2209 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2210 if (surrogate) {
2211 /* expecting a second surrogate */
2212 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2213#ifdef Py_UNICODE_WIDE
2214 *p++ = (((surrogate & 0x3FF)<<10)
2215 | (outCh & 0x3FF)) + 0x10000;
2216#else
2217 *p++ = surrogate;
2218 *p++ = outCh;
2219#endif
2220 surrogate = 0;
2221 }
2222 else {
2223 surrogate = 0;
2224 errmsg = "second surrogate missing";
2225 goto utf7Error;
2226 }
2227 }
2228 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2229 /* first surrogate */
2230 surrogate = outCh;
2231 }
2232 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2233 errmsg = "unexpected second surrogate";
2234 goto utf7Error;
2235 }
2236 else {
2237 *p++ = outCh;
2238 }
2239 }
2240 }
2241 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002242 inShift = 0;
2243 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002244 if (surrogate) {
2245 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002246 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002247 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002248 if (base64bits > 0) { /* left-over bits */
2249 if (base64bits >= 6) {
2250 /* We've seen at least one base-64 character */
2251 errmsg = "partial character in shift sequence";
2252 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002253 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002254 else {
2255 /* Some bits remain; they should be zero */
2256 if (base64buffer != 0) {
2257 errmsg = "non-zero padding bits in shift sequence";
2258 goto utf7Error;
2259 }
2260 }
2261 }
2262 if (ch != '-') {
2263 /* '-' is absorbed; other terminating
2264 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002265 *p++ = ch;
2266 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002267 }
2268 }
2269 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002270 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002271 s++; /* consume '+' */
2272 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002273 s++;
2274 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002275 }
2276 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002277 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002278 shiftOutStart = p;
2279 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002280 }
2281 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002282 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002283 *p++ = ch;
2284 s++;
2285 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002286 else {
2287 startinpos = s-starts;
2288 s++;
2289 errmsg = "unexpected special character";
2290 goto utf7Error;
2291 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002292 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002294 outpos = p-PyUnicode_AS_UNICODE(unicode);
2295 endinpos = s-starts;
2296 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002297 errors, &errorHandler,
2298 "utf7", errmsg,
2299 &starts, &e, &startinpos, &endinpos, &exc, &s,
2300 &unicode, &outpos, &p))
2301 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002302 }
2303
Antoine Pitrou244651a2009-05-04 18:56:13 +00002304 /* end of string */
2305
2306 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2307 /* if we're in an inconsistent state, that's an error */
2308 if (surrogate ||
2309 (base64bits >= 6) ||
2310 (base64bits > 0 && base64buffer != 0)) {
2311 outpos = p-PyUnicode_AS_UNICODE(unicode);
2312 endinpos = size;
2313 if (unicode_decode_call_errorhandler(
2314 errors, &errorHandler,
2315 "utf7", "unterminated shift sequence",
2316 &starts, &e, &startinpos, &endinpos, &exc, &s,
2317 &unicode, &outpos, &p))
2318 goto onError;
2319 if (s < e)
2320 goto restart;
2321 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002322 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002323
2324 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002325 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002326 if (inShift) {
2327 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002328 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002329 }
2330 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002331 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002332 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002333 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002334
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002335 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002336 goto onError;
2337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002338 Py_XDECREF(errorHandler);
2339 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002340 return (PyObject *)unicode;
2341
Benjamin Peterson29060642009-01-31 22:14:21 +00002342 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002343 Py_XDECREF(errorHandler);
2344 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002345 Py_DECREF(unicode);
2346 return NULL;
2347}
2348
2349
2350PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002351 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002352 int base64SetO,
2353 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002354 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002355{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002356 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002357 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002358 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002359 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002360 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002361 unsigned int base64bits = 0;
2362 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002363 char * out;
2364 char * start;
2365
2366 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002367 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002368
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002369 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002370 return PyErr_NoMemory();
2371
Antoine Pitrou244651a2009-05-04 18:56:13 +00002372 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002373 if (v == NULL)
2374 return NULL;
2375
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002376 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002377 for (;i < size; ++i) {
2378 Py_UNICODE ch = s[i];
2379
Antoine Pitrou244651a2009-05-04 18:56:13 +00002380 if (inShift) {
2381 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2382 /* shifting out */
2383 if (base64bits) { /* output remaining bits */
2384 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2385 base64buffer = 0;
2386 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002387 }
2388 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002389 /* Characters not in the BASE64 set implicitly unshift the sequence
2390 so no '-' is required, except if the character is itself a '-' */
2391 if (IS_BASE64(ch) || ch == '-') {
2392 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002394 *out++ = (char) ch;
2395 }
2396 else {
2397 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002400 else { /* not in a shift sequence */
2401 if (ch == '+') {
2402 *out++ = '+';
2403 *out++ = '-';
2404 }
2405 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2406 *out++ = (char) ch;
2407 }
2408 else {
2409 *out++ = '+';
2410 inShift = 1;
2411 goto encode_char;
2412 }
2413 }
2414 continue;
2415encode_char:
2416#ifdef Py_UNICODE_WIDE
2417 if (ch >= 0x10000) {
2418 /* code first surrogate */
2419 base64bits += 16;
2420 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2421 while (base64bits >= 6) {
2422 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2423 base64bits -= 6;
2424 }
2425 /* prepare second surrogate */
2426 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2427 }
2428#endif
2429 base64bits += 16;
2430 base64buffer = (base64buffer << 16) | ch;
2431 while (base64bits >= 6) {
2432 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2433 base64bits -= 6;
2434 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002435 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002436 if (base64bits)
2437 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2438 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002439 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002440 if (_PyBytes_Resize(&v, out - start) < 0)
2441 return NULL;
2442 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002443}
2444
Antoine Pitrou244651a2009-05-04 18:56:13 +00002445#undef IS_BASE64
2446#undef FROM_BASE64
2447#undef TO_BASE64
2448#undef DECODE_DIRECT
2449#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002450
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451/* --- UTF-8 Codec -------------------------------------------------------- */
2452
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.  An entry
   of zero marks a byte that cannot start a sequence.  See RFC 3629 for
   details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 are illegal, C2-CF start a 2-byte sequence */
    /* C0-CF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 start a 4-byte sequence, F5-FF are illegal */
    /* F0-FF */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
2474
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002476 Py_ssize_t size,
2477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478{
Walter Dörwald69652032004-09-07 20:24:22 +00002479 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2480}
2481
Antoine Pitrouab868312009-01-10 15:40:25 +00002482/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2483#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2484
2485/* Mask to quickly check whether a C 'long' contains a
2486 non-ASCII, UTF8-encoded char. */
2487#if (SIZEOF_LONG == 8)
2488# define ASCII_CHAR_MASK 0x8080808080808080L
2489#elif (SIZEOF_LONG == 4)
2490# define ASCII_CHAR_MASK 0x80808080L
2491#else
2492# error C 'long' size should be either 4 or 8!
2493#endif
2494
Walter Dörwald69652032004-09-07 20:24:22 +00002495PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002496 Py_ssize_t size,
2497 const char *errors,
2498 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002499{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002501 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002502 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002503 Py_ssize_t startinpos;
2504 Py_ssize_t endinpos;
2505 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002506 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 PyUnicodeObject *unicode;
2508 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002509 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002510 PyObject *errorHandler = NULL;
2511 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512
2513 /* Note: size will always be longer than the resulting Unicode
2514 character count */
2515 unicode = _PyUnicode_New(size);
2516 if (!unicode)
2517 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002518 if (size == 0) {
2519 if (consumed)
2520 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523
2524 /* Unpack UTF-8 encoded data */
2525 p = unicode->str;
2526 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002527 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528
2529 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002530 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531
2532 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002533 /* Fast path for runs of ASCII characters. Given that common UTF-8
2534 input will consist of an overwhelming majority of ASCII
2535 characters, we try to optimize for this case by checking
2536 as many characters as a C 'long' can contain.
2537 First, check if we can do an aligned read, as most CPUs have
2538 a penalty for unaligned reads.
2539 */
2540 if (!((size_t) s & LONG_PTR_MASK)) {
2541 /* Help register allocation */
2542 register const char *_s = s;
2543 register Py_UNICODE *_p = p;
2544 while (_s < aligned_end) {
2545 /* Read a whole long at a time (either 4 or 8 bytes),
2546 and do a fast unrolled copy if it only contains ASCII
2547 characters. */
2548 unsigned long data = *(unsigned long *) _s;
2549 if (data & ASCII_CHAR_MASK)
2550 break;
2551 _p[0] = (unsigned char) _s[0];
2552 _p[1] = (unsigned char) _s[1];
2553 _p[2] = (unsigned char) _s[2];
2554 _p[3] = (unsigned char) _s[3];
2555#if (SIZEOF_LONG == 8)
2556 _p[4] = (unsigned char) _s[4];
2557 _p[5] = (unsigned char) _s[5];
2558 _p[6] = (unsigned char) _s[6];
2559 _p[7] = (unsigned char) _s[7];
2560#endif
2561 _s += SIZEOF_LONG;
2562 _p += SIZEOF_LONG;
2563 }
2564 s = _s;
2565 p = _p;
2566 if (s == e)
2567 break;
2568 ch = (unsigned char)*s;
2569 }
2570 }
2571
2572 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002573 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 s++;
2575 continue;
2576 }
2577
2578 n = utf8_code_length[ch];
2579
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002580 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 if (consumed)
2582 break;
2583 else {
2584 errmsg = "unexpected end of data";
2585 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002586 endinpos = startinpos+1;
2587 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2588 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002589 goto utf8Error;
2590 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592
2593 switch (n) {
2594
2595 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002596 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002597 startinpos = s-starts;
2598 endinpos = startinpos+1;
2599 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600
2601 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002602 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002603 startinpos = s-starts;
2604 endinpos = startinpos+1;
2605 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606
2607 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002608 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002609 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002610 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002611 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002612 goto utf8Error;
2613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002615 assert ((ch > 0x007F) && (ch <= 0x07FF));
2616 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 break;
2618
2619 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002620 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2621 will result in surrogates in range d800-dfff. Surrogates are
2622 not valid UTF-8 so they are rejected.
2623 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2624 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002625 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002626 (s[2] & 0xc0) != 0x80 ||
2627 ((unsigned char)s[0] == 0xE0 &&
2628 (unsigned char)s[1] < 0xA0) ||
2629 ((unsigned char)s[0] == 0xED &&
2630 (unsigned char)s[1] > 0x9F)) {
2631 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002632 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002633 endinpos = startinpos + 1;
2634
2635 /* if s[1] first two bits are 1 and 0, then the invalid
2636 continuation byte is s[2], so increment endinpos by 1,
2637 if not, s[1] is invalid and endinpos doesn't need to
2638 be incremented. */
2639 if ((s[1] & 0xC0) == 0x80)
2640 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002641 goto utf8Error;
2642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002644 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2645 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002646 break;
2647
2648 case 4:
2649 if ((s[1] & 0xc0) != 0x80 ||
2650 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002651 (s[3] & 0xc0) != 0x80 ||
2652 ((unsigned char)s[0] == 0xF0 &&
2653 (unsigned char)s[1] < 0x90) ||
2654 ((unsigned char)s[0] == 0xF4 &&
2655 (unsigned char)s[1] > 0x8F)) {
2656 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002657 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002658 endinpos = startinpos + 1;
2659 if ((s[1] & 0xC0) == 0x80) {
2660 endinpos++;
2661 if ((s[2] & 0xC0) == 0x80)
2662 endinpos++;
2663 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002664 goto utf8Error;
2665 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002666 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002667 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2668 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2669
Fredrik Lundh8f455852001-06-27 18:59:43 +00002670#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002672#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002673 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002674
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002675 /* translate from 10000..10FFFF to 0..FFFF */
2676 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002677
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002678 /* high surrogate = top 10 bits added to D800 */
2679 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002680
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002681 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002682 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002683#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 }
2686 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002688
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 utf8Error:
2690 outpos = p-PyUnicode_AS_UNICODE(unicode);
2691 if (unicode_decode_call_errorhandler(
2692 errors, &errorHandler,
2693 "utf8", errmsg,
2694 &starts, &e, &startinpos, &endinpos, &exc, &s,
2695 &unicode, &outpos, &p))
2696 goto onError;
2697 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 }
Walter Dörwald69652032004-09-07 20:24:22 +00002699 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701
2702 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002703 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 goto onError;
2705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 Py_XDECREF(errorHandler);
2707 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708 return (PyObject *)unicode;
2709
Benjamin Peterson29060642009-01-31 22:14:21 +00002710 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 Py_DECREF(unicode);
2714 return NULL;
2715}
2716
Antoine Pitrouab868312009-01-10 15:40:25 +00002717#undef ASCII_CHAR_MASK
2718
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002719#ifdef __APPLE__
2720
2721/* Simplified UTF-8 decoder using surrogateescape error handler,
2722 used to decode the command line arguments on Mac OS X. */
2723
2724wchar_t*
2725_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2726{
2727 int n;
2728 const char *e;
2729 wchar_t *unicode, *p;
2730
2731 /* Note: size will always be longer than the resulting Unicode
2732 character count */
2733 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2734 PyErr_NoMemory();
2735 return NULL;
2736 }
2737 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2738 if (!unicode)
2739 return NULL;
2740
2741 /* Unpack UTF-8 encoded data */
2742 p = unicode;
2743 e = s + size;
2744 while (s < e) {
2745 Py_UCS4 ch = (unsigned char)*s;
2746
2747 if (ch < 0x80) {
2748 *p++ = (wchar_t)ch;
2749 s++;
2750 continue;
2751 }
2752
2753 n = utf8_code_length[ch];
2754 if (s + n > e) {
2755 goto surrogateescape;
2756 }
2757
2758 switch (n) {
2759 case 0:
2760 case 1:
2761 goto surrogateescape;
2762
2763 case 2:
2764 if ((s[1] & 0xc0) != 0x80)
2765 goto surrogateescape;
2766 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2767 assert ((ch > 0x007F) && (ch <= 0x07FF));
2768 *p++ = (wchar_t)ch;
2769 break;
2770
2771 case 3:
2772 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2773 will result in surrogates in range d800-dfff. Surrogates are
2774 not valid UTF-8 so they are rejected.
2775 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2776 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2777 if ((s[1] & 0xc0) != 0x80 ||
2778 (s[2] & 0xc0) != 0x80 ||
2779 ((unsigned char)s[0] == 0xE0 &&
2780 (unsigned char)s[1] < 0xA0) ||
2781 ((unsigned char)s[0] == 0xED &&
2782 (unsigned char)s[1] > 0x9F)) {
2783
2784 goto surrogateescape;
2785 }
2786 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2787 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2788 *p++ = (Py_UNICODE)ch;
2789 break;
2790
2791 case 4:
2792 if ((s[1] & 0xc0) != 0x80 ||
2793 (s[2] & 0xc0) != 0x80 ||
2794 (s[3] & 0xc0) != 0x80 ||
2795 ((unsigned char)s[0] == 0xF0 &&
2796 (unsigned char)s[1] < 0x90) ||
2797 ((unsigned char)s[0] == 0xF4 &&
2798 (unsigned char)s[1] > 0x8F)) {
2799 goto surrogateescape;
2800 }
2801 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2802 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2803 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2804
2805#if SIZEOF_WCHAR_T == 4
2806 *p++ = (wchar_t)ch;
2807#else
2808 /* compute and append the two surrogates: */
2809
2810 /* translate from 10000..10FFFF to 0..FFFF */
2811 ch -= 0x10000;
2812
2813 /* high surrogate = top 10 bits added to D800 */
2814 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2815
2816 /* low surrogate = bottom 10 bits added to DC00 */
2817 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2818#endif
2819 break;
2820 }
2821 s += n;
2822 continue;
2823
2824 surrogateescape:
2825 *p++ = 0xDC00 + ch;
2826 s++;
2827 }
2828 *p = L'\0';
2829 return unicode;
2830}
2831
2832#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002833
Tim Peters602f7402002-04-27 18:03:26 +00002834/* Allocation strategy: if the string is short, convert into a stack buffer
2835 and allocate exactly as much space needed at the end. Else allocate the
2836 maximum possible needed (4 result bytes per Unicode character), and return
2837 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002838*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002839PyObject *
2840PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 Py_ssize_t size,
2842 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843{
Tim Peters602f7402002-04-27 18:03:26 +00002844#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002845
Guido van Rossum98297ee2007-11-06 21:34:58 +00002846 Py_ssize_t i; /* index into s of next input byte */
2847 PyObject *result; /* result string object */
2848 char *p; /* next free byte in output buffer */
2849 Py_ssize_t nallocated; /* number of result bytes allocated */
2850 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002851 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002852 PyObject *errorHandler = NULL;
2853 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002854
Tim Peters602f7402002-04-27 18:03:26 +00002855 assert(s != NULL);
2856 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857
Tim Peters602f7402002-04-27 18:03:26 +00002858 if (size <= MAX_SHORT_UNICHARS) {
2859 /* Write into the stack buffer; nallocated can't overflow.
2860 * At the end, we'll allocate exactly as much heap space as it
2861 * turns out we need.
2862 */
2863 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002864 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002865 p = stackbuf;
2866 }
2867 else {
2868 /* Overallocate on the heap, and give the excess back at the end. */
2869 nallocated = size * 4;
2870 if (nallocated / 4 != size) /* overflow! */
2871 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002872 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002873 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002874 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002875 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002876 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002877
Tim Peters602f7402002-04-27 18:03:26 +00002878 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002879 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002880
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002881 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002882 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002884
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002886 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002887 *p++ = (char)(0xc0 | (ch >> 6));
2888 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002889 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002890#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002891 /* Special case: check for high and low surrogate */
2892 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2893 Py_UCS4 ch2 = s[i];
2894 /* Combine the two surrogates to form a UCS4 value */
2895 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2896 i++;
2897
2898 /* Encode UCS4 Unicode ordinals */
2899 *p++ = (char)(0xf0 | (ch >> 18));
2900 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002901 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2902 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002903 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002904#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002905 Py_ssize_t newpos;
2906 PyObject *rep;
2907 Py_ssize_t repsize, k;
2908 rep = unicode_encode_call_errorhandler
2909 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2910 s, size, &exc, i-1, i, &newpos);
2911 if (!rep)
2912 goto error;
2913
2914 if (PyBytes_Check(rep))
2915 repsize = PyBytes_GET_SIZE(rep);
2916 else
2917 repsize = PyUnicode_GET_SIZE(rep);
2918
2919 if (repsize > 4) {
2920 Py_ssize_t offset;
2921
2922 if (result == NULL)
2923 offset = p - stackbuf;
2924 else
2925 offset = p - PyBytes_AS_STRING(result);
2926
2927 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2928 /* integer overflow */
2929 PyErr_NoMemory();
2930 goto error;
2931 }
2932 nallocated += repsize - 4;
2933 if (result != NULL) {
2934 if (_PyBytes_Resize(&result, nallocated) < 0)
2935 goto error;
2936 } else {
2937 result = PyBytes_FromStringAndSize(NULL, nallocated);
2938 if (result == NULL)
2939 goto error;
2940 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2941 }
2942 p = PyBytes_AS_STRING(result) + offset;
2943 }
2944
2945 if (PyBytes_Check(rep)) {
2946 char *prep = PyBytes_AS_STRING(rep);
2947 for(k = repsize; k > 0; k--)
2948 *p++ = *prep++;
2949 } else /* rep is unicode */ {
2950 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2951 Py_UNICODE c;
2952
2953 for(k=0; k<repsize; k++) {
2954 c = prep[k];
2955 if (0x80 <= c) {
2956 raise_encode_exception(&exc, "utf-8", s, size,
2957 i-1, i, "surrogates not allowed");
2958 goto error;
2959 }
2960 *p++ = (char)prep[k];
2961 }
2962 }
2963 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002964#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002965 }
Victor Stinner445a6232010-04-22 20:01:57 +00002966#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002967 } else if (ch < 0x10000) {
2968 *p++ = (char)(0xe0 | (ch >> 12));
2969 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2970 *p++ = (char)(0x80 | (ch & 0x3f));
2971 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002972 /* Encode UCS4 Unicode ordinals */
2973 *p++ = (char)(0xf0 | (ch >> 18));
2974 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2975 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2976 *p++ = (char)(0x80 | (ch & 0x3f));
2977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002979
Guido van Rossum98297ee2007-11-06 21:34:58 +00002980 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002981 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00002982 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002983 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002984 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002985 }
2986 else {
Christian Heimesf3863112007-11-22 07:46:41 +00002987 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00002988 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002989 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00002990 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002991 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002992 Py_XDECREF(errorHandler);
2993 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002994 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002995 error:
2996 Py_XDECREF(errorHandler);
2997 Py_XDECREF(exc);
2998 Py_XDECREF(result);
2999 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003000
Tim Peters602f7402002-04-27 18:03:26 +00003001#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002}
3003
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3005{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 if (!PyUnicode_Check(unicode)) {
3007 PyErr_BadArgument();
3008 return NULL;
3009 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003010 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 PyUnicode_GET_SIZE(unicode),
3012 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013}
3014
Walter Dörwald41980ca2007-08-16 21:55:45 +00003015/* --- UTF-32 Codec ------------------------------------------------------- */
3016
3017PyObject *
3018PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 Py_ssize_t size,
3020 const char *errors,
3021 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003022{
3023 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3024}
3025
3026PyObject *
3027PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 Py_ssize_t size,
3029 const char *errors,
3030 int *byteorder,
3031 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003032{
3033 const char *starts = s;
3034 Py_ssize_t startinpos;
3035 Py_ssize_t endinpos;
3036 Py_ssize_t outpos;
3037 PyUnicodeObject *unicode;
3038 Py_UNICODE *p;
3039#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003040 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003041 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003042#else
3043 const int pairs = 0;
3044#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003045 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003046 int bo = 0; /* assume native ordering by default */
3047 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003048 /* Offsets from q for retrieving bytes in the right order. */
3049#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3050 int iorder[] = {0, 1, 2, 3};
3051#else
3052 int iorder[] = {3, 2, 1, 0};
3053#endif
3054 PyObject *errorHandler = NULL;
3055 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003056
Walter Dörwald41980ca2007-08-16 21:55:45 +00003057 q = (unsigned char *)s;
3058 e = q + size;
3059
3060 if (byteorder)
3061 bo = *byteorder;
3062
3063 /* Check for BOM marks (U+FEFF) in the input and adjust current
3064 byte order setting accordingly. In native mode, the leading BOM
3065 mark is skipped, in all other modes, it is copied to the output
3066 stream as-is (giving a ZWNBSP character). */
3067 if (bo == 0) {
3068 if (size >= 4) {
3069 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003071#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 if (bom == 0x0000FEFF) {
3073 q += 4;
3074 bo = -1;
3075 }
3076 else if (bom == 0xFFFE0000) {
3077 q += 4;
3078 bo = 1;
3079 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003080#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003081 if (bom == 0x0000FEFF) {
3082 q += 4;
3083 bo = 1;
3084 }
3085 else if (bom == 0xFFFE0000) {
3086 q += 4;
3087 bo = -1;
3088 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003089#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003090 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003091 }
3092
3093 if (bo == -1) {
3094 /* force LE */
3095 iorder[0] = 0;
3096 iorder[1] = 1;
3097 iorder[2] = 2;
3098 iorder[3] = 3;
3099 }
3100 else if (bo == 1) {
3101 /* force BE */
3102 iorder[0] = 3;
3103 iorder[1] = 2;
3104 iorder[2] = 1;
3105 iorder[3] = 0;
3106 }
3107
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003108 /* On narrow builds we split characters outside the BMP into two
3109 codepoints => count how much extra space we need. */
3110#ifndef Py_UNICODE_WIDE
3111 for (qq = q; qq < e; qq += 4)
3112 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3113 pairs++;
3114#endif
3115
3116 /* This might be one to much, because of a BOM */
3117 unicode = _PyUnicode_New((size+3)/4+pairs);
3118 if (!unicode)
3119 return NULL;
3120 if (size == 0)
3121 return (PyObject *)unicode;
3122
3123 /* Unpack UTF-32 encoded data */
3124 p = unicode->str;
3125
Walter Dörwald41980ca2007-08-16 21:55:45 +00003126 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 Py_UCS4 ch;
3128 /* remaining bytes at the end? (size should be divisible by 4) */
3129 if (e-q<4) {
3130 if (consumed)
3131 break;
3132 errmsg = "truncated data";
3133 startinpos = ((const char *)q)-starts;
3134 endinpos = ((const char *)e)-starts;
3135 goto utf32Error;
3136 /* The remaining input chars are ignored if the callback
3137 chooses to skip the input */
3138 }
3139 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3140 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003141
Benjamin Peterson29060642009-01-31 22:14:21 +00003142 if (ch >= 0x110000)
3143 {
3144 errmsg = "codepoint not in range(0x110000)";
3145 startinpos = ((const char *)q)-starts;
3146 endinpos = startinpos+4;
3147 goto utf32Error;
3148 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003149#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 if (ch >= 0x10000)
3151 {
3152 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3153 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3154 }
3155 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003156#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003157 *p++ = ch;
3158 q += 4;
3159 continue;
3160 utf32Error:
3161 outpos = p-PyUnicode_AS_UNICODE(unicode);
3162 if (unicode_decode_call_errorhandler(
3163 errors, &errorHandler,
3164 "utf32", errmsg,
3165 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3166 &unicode, &outpos, &p))
3167 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003168 }
3169
3170 if (byteorder)
3171 *byteorder = bo;
3172
3173 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003174 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003175
3176 /* Adjust length */
3177 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3178 goto onError;
3179
3180 Py_XDECREF(errorHandler);
3181 Py_XDECREF(exc);
3182 return (PyObject *)unicode;
3183
Benjamin Peterson29060642009-01-31 22:14:21 +00003184 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003185 Py_DECREF(unicode);
3186 Py_XDECREF(errorHandler);
3187 Py_XDECREF(exc);
3188 return NULL;
3189}
3190
3191PyObject *
3192PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003193 Py_ssize_t size,
3194 const char *errors,
3195 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003196{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003197 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003198 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003199 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003200#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003201 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003202#else
3203 const int pairs = 0;
3204#endif
3205 /* Offsets from p for storing byte pairs in the right order. */
3206#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3207 int iorder[] = {0, 1, 2, 3};
3208#else
3209 int iorder[] = {3, 2, 1, 0};
3210#endif
3211
Benjamin Peterson29060642009-01-31 22:14:21 +00003212#define STORECHAR(CH) \
3213 do { \
3214 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3215 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3216 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3217 p[iorder[0]] = (CH) & 0xff; \
3218 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003219 } while(0)
3220
3221 /* In narrow builds we can output surrogate pairs as one codepoint,
3222 so we need less space. */
3223#ifndef Py_UNICODE_WIDE
3224 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3226 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3227 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003228#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003229 nsize = (size - pairs + (byteorder == 0));
3230 bytesize = nsize * 4;
3231 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003232 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003233 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003234 if (v == NULL)
3235 return NULL;
3236
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003237 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003238 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003240 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003241 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003242
3243 if (byteorder == -1) {
3244 /* force LE */
3245 iorder[0] = 0;
3246 iorder[1] = 1;
3247 iorder[2] = 2;
3248 iorder[3] = 3;
3249 }
3250 else if (byteorder == 1) {
3251 /* force BE */
3252 iorder[0] = 3;
3253 iorder[1] = 2;
3254 iorder[2] = 1;
3255 iorder[3] = 0;
3256 }
3257
3258 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003259 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003260#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3262 Py_UCS4 ch2 = *s;
3263 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3264 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3265 s++;
3266 size--;
3267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003268 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003269#endif
3270 STORECHAR(ch);
3271 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003272
3273 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003274 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003275#undef STORECHAR
3276}
3277
3278PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3279{
3280 if (!PyUnicode_Check(unicode)) {
3281 PyErr_BadArgument();
3282 return NULL;
3283 }
3284 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 PyUnicode_GET_SIZE(unicode),
3286 NULL,
3287 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003288}
3289
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290/* --- UTF-16 Codec ------------------------------------------------------- */
3291
Tim Peters772747b2001-08-09 22:21:55 +00003292PyObject *
3293PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 Py_ssize_t size,
3295 const char *errors,
3296 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003297{
Walter Dörwald69652032004-09-07 20:24:22 +00003298 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3299}
3300
Antoine Pitrouab868312009-01-10 15:40:25 +00003301/* Two masks for fast checking of whether a C 'long' may contain
3302 UTF16-encoded surrogate characters. This is an efficient heuristic,
3303 assuming that non-surrogate characters with a code point >= 0x8000 are
3304 rare in most input.
3305 FAST_CHAR_MASK is used when the input is in native byte ordering,
3306 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003307*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003308#if (SIZEOF_LONG == 8)
3309# define FAST_CHAR_MASK 0x8000800080008000L
3310# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3311#elif (SIZEOF_LONG == 4)
3312# define FAST_CHAR_MASK 0x80008000L
3313# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3314#else
3315# error C 'long' size should be either 4 or 8!
3316#endif
3317
Walter Dörwald69652032004-09-07 20:24:22 +00003318PyObject *
3319PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003320 Py_ssize_t size,
3321 const char *errors,
3322 int *byteorder,
3323 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003324{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003325 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003326 Py_ssize_t startinpos;
3327 Py_ssize_t endinpos;
3328 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329 PyUnicodeObject *unicode;
3330 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003331 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003332 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003333 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003334 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003335 /* Offsets from q for retrieving byte pairs in the right order. */
3336#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3337 int ihi = 1, ilo = 0;
3338#else
3339 int ihi = 0, ilo = 1;
3340#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 PyObject *errorHandler = NULL;
3342 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343
3344 /* Note: size will always be longer than the resulting Unicode
3345 character count */
3346 unicode = _PyUnicode_New(size);
3347 if (!unicode)
3348 return NULL;
3349 if (size == 0)
3350 return (PyObject *)unicode;
3351
3352 /* Unpack UTF-16 encoded data */
3353 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003354 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003355 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356
3357 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003358 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003360 /* Check for BOM marks (U+FEFF) in the input and adjust current
3361 byte order setting accordingly. In native mode, the leading BOM
3362 mark is skipped, in all other modes, it is copied to the output
3363 stream as-is (giving a ZWNBSP character). */
3364 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003365 if (size >= 2) {
3366 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003367#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003368 if (bom == 0xFEFF) {
3369 q += 2;
3370 bo = -1;
3371 }
3372 else if (bom == 0xFFFE) {
3373 q += 2;
3374 bo = 1;
3375 }
Tim Petersced69f82003-09-16 20:30:58 +00003376#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003377 if (bom == 0xFEFF) {
3378 q += 2;
3379 bo = 1;
3380 }
3381 else if (bom == 0xFFFE) {
3382 q += 2;
3383 bo = -1;
3384 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003385#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003386 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388
Tim Peters772747b2001-08-09 22:21:55 +00003389 if (bo == -1) {
3390 /* force LE */
3391 ihi = 1;
3392 ilo = 0;
3393 }
3394 else if (bo == 1) {
3395 /* force BE */
3396 ihi = 0;
3397 ilo = 1;
3398 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003399#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3400 native_ordering = ilo < ihi;
3401#else
3402 native_ordering = ilo > ihi;
3403#endif
Tim Peters772747b2001-08-09 22:21:55 +00003404
Antoine Pitrouab868312009-01-10 15:40:25 +00003405 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003406 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003407 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003408 /* First check for possible aligned read of a C 'long'. Unaligned
3409 reads are more expensive, better to defer to another iteration. */
3410 if (!((size_t) q & LONG_PTR_MASK)) {
3411 /* Fast path for runs of non-surrogate chars. */
3412 register const unsigned char *_q = q;
3413 Py_UNICODE *_p = p;
3414 if (native_ordering) {
3415 /* Native ordering is simple: as long as the input cannot
3416 possibly contain a surrogate char, do an unrolled copy
3417 of several 16-bit code points to the target object.
3418 The non-surrogate check is done on several input bytes
3419 at a time (as many as a C 'long' can contain). */
3420 while (_q < aligned_end) {
3421 unsigned long data = * (unsigned long *) _q;
3422 if (data & FAST_CHAR_MASK)
3423 break;
3424 _p[0] = ((unsigned short *) _q)[0];
3425 _p[1] = ((unsigned short *) _q)[1];
3426#if (SIZEOF_LONG == 8)
3427 _p[2] = ((unsigned short *) _q)[2];
3428 _p[3] = ((unsigned short *) _q)[3];
3429#endif
3430 _q += SIZEOF_LONG;
3431 _p += SIZEOF_LONG / 2;
3432 }
3433 }
3434 else {
3435 /* Byteswapped ordering is similar, but we must decompose
3436 the copy bytewise, and take care of zero'ing out the
3437 upper bytes if the target object is in 32-bit units
3438 (that is, in UCS-4 builds). */
3439 while (_q < aligned_end) {
3440 unsigned long data = * (unsigned long *) _q;
3441 if (data & SWAPPED_FAST_CHAR_MASK)
3442 break;
3443 /* Zero upper bytes in UCS-4 builds */
3444#if (Py_UNICODE_SIZE > 2)
3445 _p[0] = 0;
3446 _p[1] = 0;
3447#if (SIZEOF_LONG == 8)
3448 _p[2] = 0;
3449 _p[3] = 0;
3450#endif
3451#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003452 /* Issue #4916; UCS-4 builds on big endian machines must
3453 fill the two last bytes of each 4-byte unit. */
3454#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3455# define OFF 2
3456#else
3457# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003458#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003459 ((unsigned char *) _p)[OFF + 1] = _q[0];
3460 ((unsigned char *) _p)[OFF + 0] = _q[1];
3461 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3462 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3463#if (SIZEOF_LONG == 8)
3464 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3465 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3466 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3467 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3468#endif
3469#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003470 _q += SIZEOF_LONG;
3471 _p += SIZEOF_LONG / 2;
3472 }
3473 }
3474 p = _p;
3475 q = _q;
3476 if (q >= e)
3477 break;
3478 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480
Benjamin Peterson14339b62009-01-31 16:36:08 +00003481 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003482
3483 if (ch < 0xD800 || ch > 0xDFFF) {
3484 *p++ = ch;
3485 continue;
3486 }
3487
3488 /* UTF-16 code pair: */
3489 if (q > e) {
3490 errmsg = "unexpected end of data";
3491 startinpos = (((const char *)q) - 2) - starts;
3492 endinpos = ((const char *)e) + 1 - starts;
3493 goto utf16Error;
3494 }
3495 if (0xD800 <= ch && ch <= 0xDBFF) {
3496 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3497 q += 2;
3498 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003499#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003500 *p++ = ch;
3501 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003502#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003504#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003505 continue;
3506 }
3507 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003508 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003509 startinpos = (((const char *)q)-4)-starts;
3510 endinpos = startinpos+2;
3511 goto utf16Error;
3512 }
3513
Benjamin Peterson14339b62009-01-31 16:36:08 +00003514 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003515 errmsg = "illegal encoding";
3516 startinpos = (((const char *)q)-2)-starts;
3517 endinpos = startinpos+2;
3518 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003519
Benjamin Peterson29060642009-01-31 22:14:21 +00003520 utf16Error:
3521 outpos = p - PyUnicode_AS_UNICODE(unicode);
3522 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003523 errors,
3524 &errorHandler,
3525 "utf16", errmsg,
3526 &starts,
3527 (const char **)&e,
3528 &startinpos,
3529 &endinpos,
3530 &exc,
3531 (const char **)&q,
3532 &unicode,
3533 &outpos,
3534 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003537 /* remaining byte at the end? (size should be even) */
3538 if (e == q) {
3539 if (!consumed) {
3540 errmsg = "truncated data";
3541 startinpos = ((const char *)q) - starts;
3542 endinpos = ((const char *)e) + 1 - starts;
3543 outpos = p - PyUnicode_AS_UNICODE(unicode);
3544 if (unicode_decode_call_errorhandler(
3545 errors,
3546 &errorHandler,
3547 "utf16", errmsg,
3548 &starts,
3549 (const char **)&e,
3550 &startinpos,
3551 &endinpos,
3552 &exc,
3553 (const char **)&q,
3554 &unicode,
3555 &outpos,
3556 &p))
3557 goto onError;
3558 /* The remaining input chars are ignored if the callback
3559 chooses to skip the input */
3560 }
3561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562
3563 if (byteorder)
3564 *byteorder = bo;
3565
Walter Dörwald69652032004-09-07 20:24:22 +00003566 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003567 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003568
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003570 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 goto onError;
3572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003573 Py_XDECREF(errorHandler);
3574 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 return (PyObject *)unicode;
3576
Benjamin Peterson29060642009-01-31 22:14:21 +00003577 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003579 Py_XDECREF(errorHandler);
3580 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581 return NULL;
3582}
3583
Antoine Pitrouab868312009-01-10 15:40:25 +00003584#undef FAST_CHAR_MASK
3585#undef SWAPPED_FAST_CHAR_MASK
3586
Tim Peters772747b2001-08-09 22:21:55 +00003587PyObject *
3588PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 Py_ssize_t size,
3590 const char *errors,
3591 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003593 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003594 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003595 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003596#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003597 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003598#else
3599 const int pairs = 0;
3600#endif
Tim Peters772747b2001-08-09 22:21:55 +00003601 /* Offsets from p for storing byte pairs in the right order. */
3602#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3603 int ihi = 1, ilo = 0;
3604#else
3605 int ihi = 0, ilo = 1;
3606#endif
3607
Benjamin Peterson29060642009-01-31 22:14:21 +00003608#define STORECHAR(CH) \
3609 do { \
3610 p[ihi] = ((CH) >> 8) & 0xff; \
3611 p[ilo] = (CH) & 0xff; \
3612 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003613 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003615#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003616 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 if (s[i] >= 0x10000)
3618 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003619#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003620 /* 2 * (size + pairs + (byteorder == 0)) */
3621 if (size > PY_SSIZE_T_MAX ||
3622 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003623 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003624 nsize = size + pairs + (byteorder == 0);
3625 bytesize = nsize * 2;
3626 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003628 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 if (v == NULL)
3630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003632 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003633 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003634 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003635 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003636 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003637
3638 if (byteorder == -1) {
3639 /* force LE */
3640 ihi = 1;
3641 ilo = 0;
3642 }
3643 else if (byteorder == 1) {
3644 /* force BE */
3645 ihi = 0;
3646 ilo = 1;
3647 }
3648
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003649 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 Py_UNICODE ch = *s++;
3651 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003652#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003653 if (ch >= 0x10000) {
3654 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3655 ch = 0xD800 | ((ch-0x10000) >> 10);
3656 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003657#endif
Tim Peters772747b2001-08-09 22:21:55 +00003658 STORECHAR(ch);
3659 if (ch2)
3660 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003661 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003662
3663 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003664 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003665#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666}
3667
3668PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3669{
3670 if (!PyUnicode_Check(unicode)) {
3671 PyErr_BadArgument();
3672 return NULL;
3673 }
3674 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003675 PyUnicode_GET_SIZE(unicode),
3676 NULL,
3677 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678}
3679
3680/* --- Unicode Escape Codec ----------------------------------------------- */
3681
Fredrik Lundh06d12682001-01-24 07:59:11 +00003682static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003683
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 Py_ssize_t size,
3686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003689 Py_ssize_t startinpos;
3690 Py_ssize_t endinpos;
3691 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003696 char* message;
3697 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 PyObject *errorHandler = NULL;
3699 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003700
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 /* Escaped strings will always be longer than the resulting
3702 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003703 length after conversion to the true value.
3704 (but if the error callback returns a long replacement string
3705 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 v = _PyUnicode_New(size);
3707 if (v == NULL)
3708 goto onError;
3709 if (size == 0)
3710 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003714
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 while (s < end) {
3716 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003717 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719
3720 /* Non-escape characters are interpreted as Unicode ordinals */
3721 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003722 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 continue;
3724 }
3725
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727 /* \ - Escapes */
3728 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003729 c = *s++;
3730 if (s > end)
3731 c = '\0'; /* Invalid after \ */
3732 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733
Benjamin Peterson29060642009-01-31 22:14:21 +00003734 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 case '\n': break;
3736 case '\\': *p++ = '\\'; break;
3737 case '\'': *p++ = '\''; break;
3738 case '\"': *p++ = '\"'; break;
3739 case 'b': *p++ = '\b'; break;
3740 case 'f': *p++ = '\014'; break; /* FF */
3741 case 't': *p++ = '\t'; break;
3742 case 'n': *p++ = '\n'; break;
3743 case 'r': *p++ = '\r'; break;
3744 case 'v': *p++ = '\013'; break; /* VT */
3745 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3746
Benjamin Peterson29060642009-01-31 22:14:21 +00003747 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748 case '0': case '1': case '2': case '3':
3749 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003750 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003751 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003752 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003753 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003754 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003756 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 break;
3758
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 /* hex escapes */
3760 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003762 digits = 2;
3763 message = "truncated \\xXX escape";
3764 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765
Benjamin Peterson29060642009-01-31 22:14:21 +00003766 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003768 digits = 4;
3769 message = "truncated \\uXXXX escape";
3770 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771
Benjamin Peterson29060642009-01-31 22:14:21 +00003772 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003773 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003774 digits = 8;
3775 message = "truncated \\UXXXXXXXX escape";
3776 hexescape:
3777 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 outpos = p-PyUnicode_AS_UNICODE(v);
3779 if (s+digits>end) {
3780 endinpos = size;
3781 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 errors, &errorHandler,
3783 "unicodeescape", "end of string in escape sequence",
3784 &starts, &end, &startinpos, &endinpos, &exc, &s,
3785 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 goto onError;
3787 goto nextByte;
3788 }
3789 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003790 c = (unsigned char) s[i];
Guido van Rossumdaa251c2007-10-25 23:47:33 +00003791 if (!ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 endinpos = (s+i+1)-starts;
3793 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003794 errors, &errorHandler,
3795 "unicodeescape", message,
3796 &starts, &end, &startinpos, &endinpos, &exc, &s,
3797 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003798 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003799 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003800 }
3801 chr = (chr<<4) & ~0xF;
3802 if (c >= '0' && c <= '9')
3803 chr += c - '0';
3804 else if (c >= 'a' && c <= 'f')
3805 chr += 10 + c - 'a';
3806 else
3807 chr += 10 + c - 'A';
3808 }
3809 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003810 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 /* _decoding_error will have already written into the
3812 target buffer. */
3813 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003814 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003815 /* when we get here, chr is a 32-bit unicode character */
3816 if (chr <= 0xffff)
3817 /* UCS-2 character */
3818 *p++ = (Py_UNICODE) chr;
3819 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003820 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003821 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003822#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003823 *p++ = chr;
3824#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003825 chr -= 0x10000L;
3826 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003827 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003828#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003829 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 endinpos = s-starts;
3831 outpos = p-PyUnicode_AS_UNICODE(v);
3832 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 errors, &errorHandler,
3834 "unicodeescape", "illegal Unicode character",
3835 &starts, &end, &startinpos, &endinpos, &exc, &s,
3836 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003837 goto onError;
3838 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003839 break;
3840
Benjamin Peterson29060642009-01-31 22:14:21 +00003841 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003842 case 'N':
3843 message = "malformed \\N character escape";
3844 if (ucnhash_CAPI == NULL) {
3845 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003846 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003847 if (ucnhash_CAPI == NULL)
3848 goto ucnhashError;
3849 }
3850 if (*s == '{') {
3851 const char *start = s+1;
3852 /* look for the closing brace */
3853 while (*s != '}' && s < end)
3854 s++;
3855 if (s > start && s < end && *s == '}') {
3856 /* found a name. look it up in the unicode database */
3857 message = "unknown Unicode character name";
3858 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003859 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003860 goto store;
3861 }
3862 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003863 endinpos = s-starts;
3864 outpos = p-PyUnicode_AS_UNICODE(v);
3865 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 errors, &errorHandler,
3867 "unicodeescape", message,
3868 &starts, &end, &startinpos, &endinpos, &exc, &s,
3869 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003870 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003871 break;
3872
3873 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003874 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 message = "\\ at end of string";
3876 s--;
3877 endinpos = s-starts;
3878 outpos = p-PyUnicode_AS_UNICODE(v);
3879 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003880 errors, &errorHandler,
3881 "unicodeescape", message,
3882 &starts, &end, &startinpos, &endinpos, &exc, &s,
3883 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003884 goto onError;
3885 }
3886 else {
3887 *p++ = '\\';
3888 *p++ = (unsigned char)s[-1];
3889 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003890 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003892 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003895 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003897 Py_XDECREF(errorHandler);
3898 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003899 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003900
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003902 PyErr_SetString(
3903 PyExc_UnicodeError,
3904 "\\N escapes not supported (can't load unicodedata module)"
3905 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003906 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003907 Py_XDECREF(errorHandler);
3908 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003909 return NULL;
3910
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 Py_XDECREF(errorHandler);
3914 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915 return NULL;
3916}
3917
3918/* Return a Unicode-Escape string version of the Unicode object.
3919
3920 If quotes is true, the string is enclosed in u"" or u'' quotes as
3921 appropriate.
3922
3923*/
3924
Thomas Wouters477c8d52006-05-27 19:21:47 +00003925Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 Py_ssize_t size,
3927 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003928{
3929 /* like wcschr, but doesn't stop at NULL characters */
3930
3931 while (size-- > 0) {
3932 if (*s == ch)
3933 return s;
3934 s++;
3935 }
3936
3937 return NULL;
3938}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003939
/* Lowercase hexadecimal digits indexed by nibble value (0..15).  Both the
   pointer and the characters it points at are read-only. */
static const char * const hexdigits = "0123456789abcdef";
3941
3942PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003943 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003945 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003948#ifdef Py_UNICODE_WIDE
3949 const Py_ssize_t expandsize = 10;
3950#else
3951 const Py_ssize_t expandsize = 6;
3952#endif
3953
Thomas Wouters89f507f2006-12-13 04:49:30 +00003954 /* XXX(nnorwitz): rather than over-allocating, it would be
3955 better to choose a different scheme. Perhaps scan the
3956 first N-chars of the string and allocate based on that size.
3957 */
3958 /* Initial allocation is based on the longest-possible unichr
3959 escape.
3960
3961 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3962 unichr, so in this case it's the longest unichr escape. In
3963 narrow (UTF-16) builds this is five chars per source unichr
3964 since there are two unichrs in the surrogate pair, so in narrow
3965 (UTF-16) builds it's not the longest unichr escape.
3966
3967 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3968 so in the narrow (UTF-16) build case it's the longest unichr
3969 escape.
3970 */
3971
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003972 if (size == 0)
3973 return PyBytes_FromStringAndSize(NULL, 0);
3974
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003975 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003977
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003978 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 2
3980 + expandsize*size
3981 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 if (repr == NULL)
3983 return NULL;
3984
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003985 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 while (size-- > 0) {
3988 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003989
Walter Dörwald79e913e2007-05-12 11:08:06 +00003990 /* Escape backslashes */
3991 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 *p++ = '\\';
3993 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00003994 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003995 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003996
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003997#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003998 /* Map 21-bit characters to '\U00xxxxxx' */
3999 else if (ch >= 0x10000) {
4000 *p++ = '\\';
4001 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004002 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4003 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4004 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4005 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4006 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4007 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4008 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4009 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004010 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004011 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004012#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004013 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4014 else if (ch >= 0xD800 && ch < 0xDC00) {
4015 Py_UNICODE ch2;
4016 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004017
Benjamin Peterson29060642009-01-31 22:14:21 +00004018 ch2 = *s++;
4019 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004020 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004021 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4022 *p++ = '\\';
4023 *p++ = 'U';
4024 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4025 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4026 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4027 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4028 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4029 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4030 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4031 *p++ = hexdigits[ucs & 0x0000000F];
4032 continue;
4033 }
4034 /* Fall through: isolated surrogates are copied as-is */
4035 s--;
4036 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004037 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004038#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004039
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004041 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 *p++ = '\\';
4043 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004044 *p++ = hexdigits[(ch >> 12) & 0x000F];
4045 *p++ = hexdigits[(ch >> 8) & 0x000F];
4046 *p++ = hexdigits[(ch >> 4) & 0x000F];
4047 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004049
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004050 /* Map special whitespace to '\t', \n', '\r' */
4051 else if (ch == '\t') {
4052 *p++ = '\\';
4053 *p++ = 't';
4054 }
4055 else if (ch == '\n') {
4056 *p++ = '\\';
4057 *p++ = 'n';
4058 }
4059 else if (ch == '\r') {
4060 *p++ = '\\';
4061 *p++ = 'r';
4062 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004063
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004064 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004065 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004067 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004068 *p++ = hexdigits[(ch >> 4) & 0x000F];
4069 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004070 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004071
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 /* Copy everything else as-is */
4073 else
4074 *p++ = (char) ch;
4075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004077 assert(p - PyBytes_AS_STRING(repr) > 0);
4078 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4079 return NULL;
4080 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081}
4082
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004083PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004085 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086 if (!PyUnicode_Check(unicode)) {
4087 PyErr_BadArgument();
4088 return NULL;
4089 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004090 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4091 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004092 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093}
4094
4095/* --- Raw Unicode Escape Codec ------------------------------------------- */
4096
4097PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004098 Py_ssize_t size,
4099 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004102 Py_ssize_t startinpos;
4103 Py_ssize_t endinpos;
4104 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 const char *end;
4108 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109 PyObject *errorHandler = NULL;
4110 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004111
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 /* Escaped strings will always be longer than the resulting
4113 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 length after conversion to the true value. (But decoding error
4115 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 v = _PyUnicode_New(size);
4117 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 end = s + size;
4123 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 unsigned char c;
4125 Py_UCS4 x;
4126 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004127 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 /* Non-escape characters are interpreted as Unicode ordinals */
4130 if (*s != '\\') {
4131 *p++ = (unsigned char)*s++;
4132 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004133 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 startinpos = s-starts;
4135
4136 /* \u-escapes are only interpreted iff the number of leading
4137 backslashes if odd */
4138 bs = s;
4139 for (;s < end;) {
4140 if (*s != '\\')
4141 break;
4142 *p++ = (unsigned char)*s++;
4143 }
4144 if (((s - bs) & 1) == 0 ||
4145 s >= end ||
4146 (*s != 'u' && *s != 'U')) {
4147 continue;
4148 }
4149 p--;
4150 count = *s=='u' ? 4 : 8;
4151 s++;
4152
4153 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4154 outpos = p-PyUnicode_AS_UNICODE(v);
4155 for (x = 0, i = 0; i < count; ++i, ++s) {
4156 c = (unsigned char)*s;
4157 if (!ISXDIGIT(c)) {
4158 endinpos = s-starts;
4159 if (unicode_decode_call_errorhandler(
4160 errors, &errorHandler,
4161 "rawunicodeescape", "truncated \\uXXXX",
4162 &starts, &end, &startinpos, &endinpos, &exc, &s,
4163 &v, &outpos, &p))
4164 goto onError;
4165 goto nextByte;
4166 }
4167 x = (x<<4) & ~0xF;
4168 if (c >= '0' && c <= '9')
4169 x += c - '0';
4170 else if (c >= 'a' && c <= 'f')
4171 x += 10 + c - 'a';
4172 else
4173 x += 10 + c - 'A';
4174 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004175 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 /* UCS-2 character */
4177 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004178 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 /* UCS-4 character. Either store directly, or as
4180 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004181#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004183#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004184 x -= 0x10000L;
4185 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4186 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004187#endif
4188 } else {
4189 endinpos = s-starts;
4190 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004191 if (unicode_decode_call_errorhandler(
4192 errors, &errorHandler,
4193 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 &starts, &end, &startinpos, &endinpos, &exc, &s,
4195 &v, &outpos, &p))
4196 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004197 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 nextByte:
4199 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004201 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 Py_XDECREF(errorHandler);
4204 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004206
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 Py_XDECREF(errorHandler);
4210 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 return NULL;
4212}
4213
4214PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004215 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004217 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 char *p;
4219 char *q;
4220
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004221#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004222 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004223#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004224 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004225#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004226
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004227 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004229
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004230 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231 if (repr == NULL)
4232 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004233 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004234 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004236 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 while (size-- > 0) {
4238 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004239#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 /* Map 32-bit characters to '\Uxxxxxxxx' */
4241 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004242 *p++ = '\\';
4243 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004244 *p++ = hexdigits[(ch >> 28) & 0xf];
4245 *p++ = hexdigits[(ch >> 24) & 0xf];
4246 *p++ = hexdigits[(ch >> 20) & 0xf];
4247 *p++ = hexdigits[(ch >> 16) & 0xf];
4248 *p++ = hexdigits[(ch >> 12) & 0xf];
4249 *p++ = hexdigits[(ch >> 8) & 0xf];
4250 *p++ = hexdigits[(ch >> 4) & 0xf];
4251 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004252 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004253 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004254#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4256 if (ch >= 0xD800 && ch < 0xDC00) {
4257 Py_UNICODE ch2;
4258 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004259
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 ch2 = *s++;
4261 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004262 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4264 *p++ = '\\';
4265 *p++ = 'U';
4266 *p++ = hexdigits[(ucs >> 28) & 0xf];
4267 *p++ = hexdigits[(ucs >> 24) & 0xf];
4268 *p++ = hexdigits[(ucs >> 20) & 0xf];
4269 *p++ = hexdigits[(ucs >> 16) & 0xf];
4270 *p++ = hexdigits[(ucs >> 12) & 0xf];
4271 *p++ = hexdigits[(ucs >> 8) & 0xf];
4272 *p++ = hexdigits[(ucs >> 4) & 0xf];
4273 *p++ = hexdigits[ucs & 0xf];
4274 continue;
4275 }
4276 /* Fall through: isolated surrogates are copied as-is */
4277 s--;
4278 size++;
4279 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004280#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 /* Map 16-bit characters to '\uxxxx' */
4282 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 *p++ = '\\';
4284 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004285 *p++ = hexdigits[(ch >> 12) & 0xf];
4286 *p++ = hexdigits[(ch >> 8) & 0xf];
4287 *p++ = hexdigits[(ch >> 4) & 0xf];
4288 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 /* Copy everything else as-is */
4291 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 *p++ = (char) ch;
4293 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004294 size = p - q;
4295
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004296 assert(size > 0);
4297 if (_PyBytes_Resize(&repr, size) < 0)
4298 return NULL;
4299 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300}
4301
4302PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4303{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004304 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004306 PyErr_BadArgument();
4307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004309 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4310 PyUnicode_GET_SIZE(unicode));
4311
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004312 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313}
4314
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004315/* --- Unicode Internal Codec ------------------------------------------- */
4316
4317PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 Py_ssize_t size,
4319 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004320{
4321 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004322 Py_ssize_t startinpos;
4323 Py_ssize_t endinpos;
4324 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004325 PyUnicodeObject *v;
4326 Py_UNICODE *p;
4327 const char *end;
4328 const char *reason;
4329 PyObject *errorHandler = NULL;
4330 PyObject *exc = NULL;
4331
Neal Norwitzd43069c2006-01-08 01:12:10 +00004332#ifdef Py_UNICODE_WIDE
4333 Py_UNICODE unimax = PyUnicode_GetMax();
4334#endif
4335
Thomas Wouters89f507f2006-12-13 04:49:30 +00004336 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004337 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4338 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004340 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004341 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004342 p = PyUnicode_AS_UNICODE(v);
4343 end = s + size;
4344
4345 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004346 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004347 /* We have to sanity check the raw data, otherwise doom looms for
4348 some malformed UCS-4 data. */
4349 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004350#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004351 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004352#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004353 end-s < Py_UNICODE_SIZE
4354 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004355 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004356 startinpos = s - starts;
4357 if (end-s < Py_UNICODE_SIZE) {
4358 endinpos = end-starts;
4359 reason = "truncated input";
4360 }
4361 else {
4362 endinpos = s - starts + Py_UNICODE_SIZE;
4363 reason = "illegal code point (> 0x10FFFF)";
4364 }
4365 outpos = p - PyUnicode_AS_UNICODE(v);
4366 if (unicode_decode_call_errorhandler(
4367 errors, &errorHandler,
4368 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004369 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004370 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004371 goto onError;
4372 }
4373 }
4374 else {
4375 p++;
4376 s += Py_UNICODE_SIZE;
4377 }
4378 }
4379
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004380 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004381 goto onError;
4382 Py_XDECREF(errorHandler);
4383 Py_XDECREF(exc);
4384 return (PyObject *)v;
4385
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004387 Py_XDECREF(v);
4388 Py_XDECREF(errorHandler);
4389 Py_XDECREF(exc);
4390 return NULL;
4391}
4392
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393/* --- Latin-1 Codec ------------------------------------------------------ */
4394
4395PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 Py_ssize_t size,
4397 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398{
4399 PyUnicodeObject *v;
4400 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004401 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004402
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004404 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 Py_UNICODE r = *(unsigned char*)s;
4406 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004407 }
4408
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 v = _PyUnicode_New(size);
4410 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004415 e = s + size;
4416 /* Unrolling the copy makes it much faster by reducing the looping
4417 overhead. This is similar to what many memcpy() implementations do. */
4418 unrolled_end = e - 4;
4419 while (s < unrolled_end) {
4420 p[0] = (unsigned char) s[0];
4421 p[1] = (unsigned char) s[1];
4422 p[2] = (unsigned char) s[2];
4423 p[3] = (unsigned char) s[3];
4424 s += 4;
4425 p += 4;
4426 }
4427 while (s < e)
4428 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004430
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 Py_XDECREF(v);
4433 return NULL;
4434}
4435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436/* create or adjust a UnicodeEncodeError */
4437static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 const char *encoding,
4439 const Py_UNICODE *unicode, Py_ssize_t size,
4440 Py_ssize_t startpos, Py_ssize_t endpos,
4441 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 *exceptionObject = PyUnicodeEncodeError_Create(
4445 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 }
4447 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4449 goto onError;
4450 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4451 goto onError;
4452 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4453 goto onError;
4454 return;
4455 onError:
4456 Py_DECREF(*exceptionObject);
4457 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 }
4459}
4460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461/* raises a UnicodeEncodeError */
4462static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 const char *encoding,
4464 const Py_UNICODE *unicode, Py_ssize_t size,
4465 Py_ssize_t startpos, Py_ssize_t endpos,
4466 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467{
4468 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472}
4473
4474/* error handling callback helper:
4475 build arguments, call the callback and check the arguments,
4476 put the result into newpos and return the replacement string, which
4477 has to be freed by the caller */
4478static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 PyObject **errorHandler,
4480 const char *encoding, const char *reason,
4481 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4482 Py_ssize_t startpos, Py_ssize_t endpos,
4483 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004485 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486
4487 PyObject *restuple;
4488 PyObject *resunicode;
4489
4490 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 }
4495
4496 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500
4501 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004506 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 Py_DECREF(restuple);
4508 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004510 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 &resunicode, newpos)) {
4512 Py_DECREF(restuple);
4513 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004515 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4516 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4517 Py_DECREF(restuple);
4518 return NULL;
4519 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004522 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4524 Py_DECREF(restuple);
4525 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004526 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 Py_INCREF(resunicode);
4528 Py_DECREF(restuple);
4529 return resunicode;
4530}
4531
4532static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 Py_ssize_t size,
4534 const char *errors,
4535 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536{
4537 /* output object */
4538 PyObject *res;
4539 /* pointers to the beginning and end+1 of input */
4540 const Py_UNICODE *startp = p;
4541 const Py_UNICODE *endp = p + size;
4542 /* pointer to the beginning of the unencodable characters */
4543 /* const Py_UNICODE *badp = NULL; */
4544 /* pointer into the output */
4545 char *str;
4546 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004547 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004548 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4549 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 PyObject *errorHandler = NULL;
4551 PyObject *exc = NULL;
4552 /* the following variable is used for caching string comparisons
4553 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4554 int known_errorHandler = -1;
4555
4556 /* allocate enough for a simple encoding without
4557 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004558 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004559 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004560 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004562 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004563 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 ressize = size;
4565
4566 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 /* can we encode this? */
4570 if (c<limit) {
4571 /* no overflow check, because we know that the space is enough */
4572 *str++ = (char)c;
4573 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004574 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 else {
4576 Py_ssize_t unicodepos = p-startp;
4577 Py_ssize_t requiredsize;
4578 PyObject *repunicode;
4579 Py_ssize_t repsize;
4580 Py_ssize_t newpos;
4581 Py_ssize_t respos;
4582 Py_UNICODE *uni2;
4583 /* startpos for collecting unencodable chars */
4584 const Py_UNICODE *collstart = p;
4585 const Py_UNICODE *collend = p;
4586 /* find all unecodable characters */
4587 while ((collend < endp) && ((*collend)>=limit))
4588 ++collend;
4589 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4590 if (known_errorHandler==-1) {
4591 if ((errors==NULL) || (!strcmp(errors, "strict")))
4592 known_errorHandler = 1;
4593 else if (!strcmp(errors, "replace"))
4594 known_errorHandler = 2;
4595 else if (!strcmp(errors, "ignore"))
4596 known_errorHandler = 3;
4597 else if (!strcmp(errors, "xmlcharrefreplace"))
4598 known_errorHandler = 4;
4599 else
4600 known_errorHandler = 0;
4601 }
4602 switch (known_errorHandler) {
4603 case 1: /* strict */
4604 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4605 goto onError;
4606 case 2: /* replace */
4607 while (collstart++<collend)
4608 *str++ = '?'; /* fall through */
4609 case 3: /* ignore */
4610 p = collend;
4611 break;
4612 case 4: /* xmlcharrefreplace */
4613 respos = str - PyBytes_AS_STRING(res);
4614 /* determine replacement size (temporarily (mis)uses p) */
4615 for (p = collstart, repsize = 0; p < collend; ++p) {
4616 if (*p<10)
4617 repsize += 2+1+1;
4618 else if (*p<100)
4619 repsize += 2+2+1;
4620 else if (*p<1000)
4621 repsize += 2+3+1;
4622 else if (*p<10000)
4623 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004624#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 else
4626 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004627#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004628 else if (*p<100000)
4629 repsize += 2+5+1;
4630 else if (*p<1000000)
4631 repsize += 2+6+1;
4632 else
4633 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004634#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 }
4636 requiredsize = respos+repsize+(endp-collend);
4637 if (requiredsize > ressize) {
4638 if (requiredsize<2*ressize)
4639 requiredsize = 2*ressize;
4640 if (_PyBytes_Resize(&res, requiredsize))
4641 goto onError;
4642 str = PyBytes_AS_STRING(res) + respos;
4643 ressize = requiredsize;
4644 }
4645 /* generate replacement (temporarily (mis)uses p) */
4646 for (p = collstart; p < collend; ++p) {
4647 str += sprintf(str, "&#%d;", (int)*p);
4648 }
4649 p = collend;
4650 break;
4651 default:
4652 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4653 encoding, reason, startp, size, &exc,
4654 collstart-startp, collend-startp, &newpos);
4655 if (repunicode == NULL)
4656 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004657 if (PyBytes_Check(repunicode)) {
4658 /* Directly copy bytes result to output. */
4659 repsize = PyBytes_Size(repunicode);
4660 if (repsize > 1) {
4661 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004662 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004663 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4664 Py_DECREF(repunicode);
4665 goto onError;
4666 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004667 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004668 ressize += repsize-1;
4669 }
4670 memcpy(str, PyBytes_AsString(repunicode), repsize);
4671 str += repsize;
4672 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004673 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004674 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004675 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004676 /* need more space? (at least enough for what we
4677 have+the replacement+the rest of the string, so
4678 we won't have to check space for encodable characters) */
4679 respos = str - PyBytes_AS_STRING(res);
4680 repsize = PyUnicode_GET_SIZE(repunicode);
4681 requiredsize = respos+repsize+(endp-collend);
4682 if (requiredsize > ressize) {
4683 if (requiredsize<2*ressize)
4684 requiredsize = 2*ressize;
4685 if (_PyBytes_Resize(&res, requiredsize)) {
4686 Py_DECREF(repunicode);
4687 goto onError;
4688 }
4689 str = PyBytes_AS_STRING(res) + respos;
4690 ressize = requiredsize;
4691 }
4692 /* check if there is anything unencodable in the replacement
4693 and copy it to the output */
4694 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4695 c = *uni2;
4696 if (c >= limit) {
4697 raise_encode_exception(&exc, encoding, startp, size,
4698 unicodepos, unicodepos+1, reason);
4699 Py_DECREF(repunicode);
4700 goto onError;
4701 }
4702 *str = (char)c;
4703 }
4704 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004705 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004707 }
4708 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004709 /* Resize if we allocated to much */
4710 size = str - PyBytes_AS_STRING(res);
4711 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004712 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004713 if (_PyBytes_Resize(&res, size) < 0)
4714 goto onError;
4715 }
4716
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 Py_XDECREF(errorHandler);
4718 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004719 return res;
4720
4721 onError:
4722 Py_XDECREF(res);
4723 Py_XDECREF(errorHandler);
4724 Py_XDECREF(exc);
4725 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726}
4727
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 Py_ssize_t size,
4730 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004732 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733}
4734
4735PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4736{
4737 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 PyErr_BadArgument();
4739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740 }
4741 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 PyUnicode_GET_SIZE(unicode),
4743 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744}
4745
4746/* --- 7-bit ASCII Codec -------------------------------------------------- */
4747
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 Py_ssize_t size,
4750 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004752 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 PyUnicodeObject *v;
4754 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004755 Py_ssize_t startinpos;
4756 Py_ssize_t endinpos;
4757 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 const char *e;
4759 PyObject *errorHandler = NULL;
4760 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004761
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004763 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 Py_UNICODE r = *(unsigned char*)s;
4765 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004766 }
Tim Petersced69f82003-09-16 20:30:58 +00004767
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 v = _PyUnicode_New(size);
4769 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004772 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 e = s + size;
4775 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 register unsigned char c = (unsigned char)*s;
4777 if (c < 128) {
4778 *p++ = c;
4779 ++s;
4780 }
4781 else {
4782 startinpos = s-starts;
4783 endinpos = startinpos + 1;
4784 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4785 if (unicode_decode_call_errorhandler(
4786 errors, &errorHandler,
4787 "ascii", "ordinal not in range(128)",
4788 &starts, &e, &startinpos, &endinpos, &exc, &s,
4789 &v, &outpos, &p))
4790 goto onError;
4791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004793 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4795 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 Py_XDECREF(errorHandler);
4797 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004799
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 Py_XDECREF(errorHandler);
4803 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804 return NULL;
4805}
4806
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 Py_ssize_t size,
4809 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004811 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812}
4813
4814PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4815{
4816 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 PyErr_BadArgument();
4818 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 }
4820 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 PyUnicode_GET_SIZE(unicode),
4822 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823}
4824
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004825#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004826
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004827/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004828
/* The MBCS helpers below take their lengths as int.  When size_t is wider
   than int, a buffer may be longer than one call can handle, so the public
   entry points have to process the data in INT_MAX-sized pieces. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
4832
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Return non-zero if the byte at s[offset] starts a double-byte character,
   zero otherwise.  A byte that merely looks like a lead byte is checked
   against the character preceding it, as located by CharPrev(). */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* No previous character was found (presumably curr is at the start of
       the string -- see the CharPrev documentation). */
    if (prev == curr)
        return 1;
    /* The previous character does not begin with a lead byte. */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* The previous character is a complete two-byte character. */
    return (curr - prev == 2);
}
4848
4849/*
4850 * Decode MBCS string into unicode object. If 'final' is set, converts
4851 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4852 */
4853static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 const char *s, /* MBCS string */
4855 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004856 int final,
4857 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004858{
4859 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004860 Py_ssize_t n;
4861 DWORD usize;
4862 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004863
4864 assert(size >= 0);
4865
Victor Stinner554f3f02010-06-16 23:33:54 +00004866 /* check and handle 'errors' arg */
4867 if (errors==NULL || strcmp(errors, "strict")==0)
4868 flags = MB_ERR_INVALID_CHARS;
4869 else if (strcmp(errors, "ignore")==0)
4870 flags = 0;
4871 else {
4872 PyErr_Format(PyExc_ValueError,
4873 "mbcs encoding does not support errors='%s'",
4874 errors);
4875 return -1;
4876 }
4877
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004878 /* Skip trailing lead-byte unless 'final' is set */
4879 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004881
4882 /* First get the size of the result */
4883 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004884 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4885 if (usize==0)
4886 goto mbcs_decode_error;
4887 } else
4888 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004889
4890 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 /* Create unicode object */
4892 *v = _PyUnicode_New(usize);
4893 if (*v == NULL)
4894 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004895 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004896 }
4897 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 /* Extend unicode object */
4899 n = PyUnicode_GET_SIZE(*v);
4900 if (_PyUnicode_Resize(v, n + usize) < 0)
4901 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004902 }
4903
4904 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004905 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004907 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4908 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004909 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004910 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004911 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004912
4913mbcs_decode_error:
4914 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4915 we raise a UnicodeDecodeError - else it is a 'generic'
4916 windows error
4917 */
4918 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4919 /* Ideally, we should get reason from FormatMessage - this
4920 is the Windows 2000 English version of the message
4921 */
4922 PyObject *exc = NULL;
4923 const char *reason = "No mapping for the Unicode character exists "
4924 "in the target multi-byte code page.";
4925 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4926 if (exc != NULL) {
4927 PyCodec_StrictErrors(exc);
4928 Py_DECREF(exc);
4929 }
4930 } else {
4931 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4932 }
4933 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004934}
4935
4936PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 Py_ssize_t size,
4938 const char *errors,
4939 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004940{
4941 PyUnicodeObject *v = NULL;
4942 int done;
4943
4944 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004946
4947#ifdef NEED_RETRY
4948 retry:
4949 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004950 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004951 else
4952#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004953 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004954
4955 if (done < 0) {
4956 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004957 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004958 }
4959
4960 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004962
4963#ifdef NEED_RETRY
4964 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 s += done;
4966 size -= done;
4967 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004968 }
4969#endif
4970
4971 return (PyObject *)v;
4972}
4973
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004974PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 Py_ssize_t size,
4976 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004977{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004978 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4979}
4980
4981/*
4982 * Convert unicode into string object (MBCS).
4983 * Returns 0 if succeed, -1 otherwise.
4984 */
4985static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00004987 int size, /* size of unicode */
4988 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004989{
Victor Stinner554f3f02010-06-16 23:33:54 +00004990 BOOL usedDefaultChar = FALSE;
4991 BOOL *pusedDefaultChar;
4992 int mbcssize;
4993 Py_ssize_t n;
4994 PyObject *exc = NULL;
4995 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004996
4997 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004998
Victor Stinner554f3f02010-06-16 23:33:54 +00004999 /* check and handle 'errors' arg */
5000 if (errors==NULL || strcmp(errors, "strict")==0) {
5001 flags = WC_NO_BEST_FIT_CHARS;
5002 pusedDefaultChar = &usedDefaultChar;
5003 } else if (strcmp(errors, "replace")==0) {
5004 flags = 0;
5005 pusedDefaultChar = NULL;
5006 } else {
5007 PyErr_Format(PyExc_ValueError,
5008 "mbcs encoding does not support errors='%s'",
5009 errors);
5010 return -1;
5011 }
5012
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005013 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005014 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005015 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5016 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 if (mbcssize == 0) {
5018 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5019 return -1;
5020 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005021 /* If we used a default char, then we failed! */
5022 if (pusedDefaultChar && *pusedDefaultChar)
5023 goto mbcs_encode_error;
5024 } else {
5025 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005026 }
5027
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005028 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 /* Create string object */
5030 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5031 if (*repr == NULL)
5032 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005033 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005034 }
5035 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 /* Extend string object */
5037 n = PyBytes_Size(*repr);
5038 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5039 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005040 }
5041
5042 /* Do the conversion */
5043 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005045 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5046 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5048 return -1;
5049 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005050 if (pusedDefaultChar && *pusedDefaultChar)
5051 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005052 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005053 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005054
5055mbcs_encode_error:
5056 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5057 Py_XDECREF(exc);
5058 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005059}
5060
5061PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 Py_ssize_t size,
5063 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005064{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005065 PyObject *repr = NULL;
5066 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005067
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005068#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005070 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005071 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005072 else
5073#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005074 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005075
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005076 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 Py_XDECREF(repr);
5078 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005079 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005080
5081#ifdef NEED_RETRY
5082 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 p += INT_MAX;
5084 size -= INT_MAX;
5085 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005086 }
5087#endif
5088
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005089 return repr;
5090}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005091
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005092PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5093{
5094 if (!PyUnicode_Check(unicode)) {
5095 PyErr_BadArgument();
5096 return NULL;
5097 }
5098 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 PyUnicode_GET_SIZE(unicode),
5100 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005101}
5102
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005103#undef NEED_RETRY
5104
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005105#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005106
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107/* --- Character Mapping Codec -------------------------------------------- */
5108
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 Py_ssize_t size,
5111 PyObject *mapping,
5112 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005114 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005115 Py_ssize_t startinpos;
5116 Py_ssize_t endinpos;
5117 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005118 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 PyUnicodeObject *v;
5120 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005121 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 PyObject *errorHandler = NULL;
5123 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005124 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005125 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005126
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 /* Default to Latin-1 */
5128 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130
5131 v = _PyUnicode_New(size);
5132 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005138 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 mapstring = PyUnicode_AS_UNICODE(mapping);
5140 maplen = PyUnicode_GET_SIZE(mapping);
5141 while (s < e) {
5142 unsigned char ch = *s;
5143 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 if (ch < maplen)
5146 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 if (x == 0xfffe) {
5149 /* undefined mapping */
5150 outpos = p-PyUnicode_AS_UNICODE(v);
5151 startinpos = s-starts;
5152 endinpos = startinpos+1;
5153 if (unicode_decode_call_errorhandler(
5154 errors, &errorHandler,
5155 "charmap", "character maps to <undefined>",
5156 &starts, &e, &startinpos, &endinpos, &exc, &s,
5157 &v, &outpos, &p)) {
5158 goto onError;
5159 }
5160 continue;
5161 }
5162 *p++ = x;
5163 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005164 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005165 }
5166 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 while (s < e) {
5168 unsigned char ch = *s;
5169 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005170
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5172 w = PyLong_FromLong((long)ch);
5173 if (w == NULL)
5174 goto onError;
5175 x = PyObject_GetItem(mapping, w);
5176 Py_DECREF(w);
5177 if (x == NULL) {
5178 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5179 /* No mapping found means: mapping is undefined. */
5180 PyErr_Clear();
5181 x = Py_None;
5182 Py_INCREF(x);
5183 } else
5184 goto onError;
5185 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005186
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 /* Apply mapping */
5188 if (PyLong_Check(x)) {
5189 long value = PyLong_AS_LONG(x);
5190 if (value < 0 || value > 65535) {
5191 PyErr_SetString(PyExc_TypeError,
5192 "character mapping must be in range(65536)");
5193 Py_DECREF(x);
5194 goto onError;
5195 }
5196 *p++ = (Py_UNICODE)value;
5197 }
5198 else if (x == Py_None) {
5199 /* undefined mapping */
5200 outpos = p-PyUnicode_AS_UNICODE(v);
5201 startinpos = s-starts;
5202 endinpos = startinpos+1;
5203 if (unicode_decode_call_errorhandler(
5204 errors, &errorHandler,
5205 "charmap", "character maps to <undefined>",
5206 &starts, &e, &startinpos, &endinpos, &exc, &s,
5207 &v, &outpos, &p)) {
5208 Py_DECREF(x);
5209 goto onError;
5210 }
5211 Py_DECREF(x);
5212 continue;
5213 }
5214 else if (PyUnicode_Check(x)) {
5215 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005216
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 if (targetsize == 1)
5218 /* 1-1 mapping */
5219 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005220
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 else if (targetsize > 1) {
5222 /* 1-n mapping */
5223 if (targetsize > extrachars) {
5224 /* resize first */
5225 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5226 Py_ssize_t needed = (targetsize - extrachars) + \
5227 (targetsize << 2);
5228 extrachars += needed;
5229 /* XXX overflow detection missing */
5230 if (_PyUnicode_Resize(&v,
5231 PyUnicode_GET_SIZE(v) + needed) < 0) {
5232 Py_DECREF(x);
5233 goto onError;
5234 }
5235 p = PyUnicode_AS_UNICODE(v) + oldpos;
5236 }
5237 Py_UNICODE_COPY(p,
5238 PyUnicode_AS_UNICODE(x),
5239 targetsize);
5240 p += targetsize;
5241 extrachars -= targetsize;
5242 }
5243 /* 1-0 mapping: skip the character */
5244 }
5245 else {
5246 /* wrong return value */
5247 PyErr_SetString(PyExc_TypeError,
5248 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005249 Py_DECREF(x);
5250 goto onError;
5251 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 Py_DECREF(x);
5253 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 }
5256 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5258 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005259 Py_XDECREF(errorHandler);
5260 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005262
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264 Py_XDECREF(errorHandler);
5265 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 Py_XDECREF(v);
5267 return NULL;
5268}
5269
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005270/* Charmap encoding: the lookup table */
5271
5272struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 PyObject_HEAD
5274 unsigned char level1[32];
5275 int count2, count3;
5276 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005277};
5278
5279static PyObject*
5280encoding_map_size(PyObject *obj, PyObject* args)
5281{
5282 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005283 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005285}
5286
5287static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005288 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 PyDoc_STR("Return the size (in bytes) of this object") },
5290 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005291};
5292
5293static void
5294encoding_map_dealloc(PyObject* o)
5295{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005296 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005297}
5298
5299static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005300 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 "EncodingMap", /*tp_name*/
5302 sizeof(struct encoding_map), /*tp_basicsize*/
5303 0, /*tp_itemsize*/
5304 /* methods */
5305 encoding_map_dealloc, /*tp_dealloc*/
5306 0, /*tp_print*/
5307 0, /*tp_getattr*/
5308 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005309 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 0, /*tp_repr*/
5311 0, /*tp_as_number*/
5312 0, /*tp_as_sequence*/
5313 0, /*tp_as_mapping*/
5314 0, /*tp_hash*/
5315 0, /*tp_call*/
5316 0, /*tp_str*/
5317 0, /*tp_getattro*/
5318 0, /*tp_setattro*/
5319 0, /*tp_as_buffer*/
5320 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5321 0, /*tp_doc*/
5322 0, /*tp_traverse*/
5323 0, /*tp_clear*/
5324 0, /*tp_richcompare*/
5325 0, /*tp_weaklistoffset*/
5326 0, /*tp_iter*/
5327 0, /*tp_iternext*/
5328 encoding_map_methods, /*tp_methods*/
5329 0, /*tp_members*/
5330 0, /*tp_getset*/
5331 0, /*tp_base*/
5332 0, /*tp_dict*/
5333 0, /*tp_descr_get*/
5334 0, /*tp_descr_set*/
5335 0, /*tp_dictoffset*/
5336 0, /*tp_init*/
5337 0, /*tp_alloc*/
5338 0, /*tp_new*/
5339 0, /*tp_free*/
5340 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005341};
5342
5343PyObject*
5344PyUnicode_BuildEncodingMap(PyObject* string)
5345{
5346 Py_UNICODE *decode;
5347 PyObject *result;
5348 struct encoding_map *mresult;
5349 int i;
5350 int need_dict = 0;
5351 unsigned char level1[32];
5352 unsigned char level2[512];
5353 unsigned char *mlevel1, *mlevel2, *mlevel3;
5354 int count2 = 0, count3 = 0;
5355
5356 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5357 PyErr_BadArgument();
5358 return NULL;
5359 }
5360 decode = PyUnicode_AS_UNICODE(string);
5361 memset(level1, 0xFF, sizeof level1);
5362 memset(level2, 0xFF, sizeof level2);
5363
5364 /* If there isn't a one-to-one mapping of NULL to \0,
5365 or if there are non-BMP characters, we need to use
5366 a mapping dictionary. */
5367 if (decode[0] != 0)
5368 need_dict = 1;
5369 for (i = 1; i < 256; i++) {
5370 int l1, l2;
5371 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005372#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005373 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005374#endif
5375 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005376 need_dict = 1;
5377 break;
5378 }
5379 if (decode[i] == 0xFFFE)
5380 /* unmapped character */
5381 continue;
5382 l1 = decode[i] >> 11;
5383 l2 = decode[i] >> 7;
5384 if (level1[l1] == 0xFF)
5385 level1[l1] = count2++;
5386 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005387 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005388 }
5389
5390 if (count2 >= 0xFF || count3 >= 0xFF)
5391 need_dict = 1;
5392
5393 if (need_dict) {
5394 PyObject *result = PyDict_New();
5395 PyObject *key, *value;
5396 if (!result)
5397 return NULL;
5398 for (i = 0; i < 256; i++) {
5399 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005400 key = PyLong_FromLong(decode[i]);
5401 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005402 if (!key || !value)
5403 goto failed1;
5404 if (PyDict_SetItem(result, key, value) == -1)
5405 goto failed1;
5406 Py_DECREF(key);
5407 Py_DECREF(value);
5408 }
5409 return result;
5410 failed1:
5411 Py_XDECREF(key);
5412 Py_XDECREF(value);
5413 Py_DECREF(result);
5414 return NULL;
5415 }
5416
5417 /* Create a three-level trie */
5418 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5419 16*count2 + 128*count3 - 1);
5420 if (!result)
5421 return PyErr_NoMemory();
5422 PyObject_Init(result, &EncodingMapType);
5423 mresult = (struct encoding_map*)result;
5424 mresult->count2 = count2;
5425 mresult->count3 = count3;
5426 mlevel1 = mresult->level1;
5427 mlevel2 = mresult->level23;
5428 mlevel3 = mresult->level23 + 16*count2;
5429 memcpy(mlevel1, level1, 32);
5430 memset(mlevel2, 0xFF, 16*count2);
5431 memset(mlevel3, 0, 128*count3);
5432 count3 = 0;
5433 for (i = 1; i < 256; i++) {
5434 int o1, o2, o3, i2, i3;
5435 if (decode[i] == 0xFFFE)
5436 /* unmapped character */
5437 continue;
5438 o1 = decode[i]>>11;
5439 o2 = (decode[i]>>7) & 0xF;
5440 i2 = 16*mlevel1[o1] + o2;
5441 if (mlevel2[i2] == 0xFF)
5442 mlevel2[i2] = count3++;
5443 o3 = decode[i] & 0x7F;
5444 i3 = 128*mlevel2[i2] + o3;
5445 mlevel3[i3] = i;
5446 }
5447 return result;
5448}
5449
5450static int
5451encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5452{
5453 struct encoding_map *map = (struct encoding_map*)mapping;
5454 int l1 = c>>11;
5455 int l2 = (c>>7) & 0xF;
5456 int l3 = c & 0x7F;
5457 int i;
5458
5459#ifdef Py_UNICODE_WIDE
5460 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005462 }
5463#endif
5464 if (c == 0)
5465 return 0;
5466 /* level 1*/
5467 i = map->level1[l1];
5468 if (i == 0xFF) {
5469 return -1;
5470 }
5471 /* level 2*/
5472 i = map->level23[16*i+l2];
5473 if (i == 0xFF) {
5474 return -1;
5475 }
5476 /* level 3 */
5477 i = map->level23[16*map->count2 + 128*i + l3];
5478 if (i == 0) {
5479 return -1;
5480 }
5481 return i;
5482}
5483
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005484/* Lookup the character ch in the mapping. If the character
5485 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005486 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488{
Christian Heimes217cfd12007-12-02 14:31:20 +00005489 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005490 PyObject *x;
5491
5492 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005494 x = PyObject_GetItem(mapping, w);
5495 Py_DECREF(w);
5496 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5498 /* No mapping found means: mapping is undefined. */
5499 PyErr_Clear();
5500 x = Py_None;
5501 Py_INCREF(x);
5502 return x;
5503 } else
5504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005506 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005508 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 long value = PyLong_AS_LONG(x);
5510 if (value < 0 || value > 255) {
5511 PyErr_SetString(PyExc_TypeError,
5512 "character mapping must be in range(256)");
5513 Py_DECREF(x);
5514 return NULL;
5515 }
5516 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005518 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 /* wrong return value */
5522 PyErr_Format(PyExc_TypeError,
5523 "character mapping must return integer, bytes or None, not %.400s",
5524 x->ob_type->tp_name);
5525 Py_DECREF(x);
5526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 }
5528}
5529
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005530static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005531charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005532{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005533 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5534 /* exponentially overallocate to minimize reallocations */
5535 if (requiredsize < 2*outsize)
5536 requiredsize = 2*outsize;
5537 if (_PyBytes_Resize(outobj, requiredsize))
5538 return -1;
5539 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005540}
5541
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005545/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005546 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547 space is available. Return a new reference to the object that
5548 was put in the output buffer, or Py_None, if the mapping was undefined
5549 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005550 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005551static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005552charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005555 PyObject *rep;
5556 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005557 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558
Christian Heimes90aa7642007-12-19 02:45:37 +00005559 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005560 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005562 if (res == -1)
5563 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 if (outsize<requiredsize)
5565 if (charmapencode_resize(outobj, outpos, requiredsize))
5566 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005567 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 outstart[(*outpos)++] = (char)res;
5569 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005570 }
5571
5572 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005573 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005575 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 Py_DECREF(rep);
5577 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005578 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 if (PyLong_Check(rep)) {
5580 Py_ssize_t requiredsize = *outpos+1;
5581 if (outsize<requiredsize)
5582 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5583 Py_DECREF(rep);
5584 return enc_EXCEPTION;
5585 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005586 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005588 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 else {
5590 const char *repchars = PyBytes_AS_STRING(rep);
5591 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5592 Py_ssize_t requiredsize = *outpos+repsize;
5593 if (outsize<requiredsize)
5594 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5595 Py_DECREF(rep);
5596 return enc_EXCEPTION;
5597 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005598 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 memcpy(outstart + *outpos, repchars, repsize);
5600 *outpos += repsize;
5601 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005603 Py_DECREF(rep);
5604 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605}
5606
5607/* handle an error in PyUnicode_EncodeCharmap
5608 Return 0 on success, -1 on error */
5609static
5610int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005611 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005612 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005613 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005614 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005615{
5616 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005617 Py_ssize_t repsize;
5618 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 Py_UNICODE *uni2;
5620 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005621 Py_ssize_t collstartpos = *inpos;
5622 Py_ssize_t collendpos = *inpos+1;
5623 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 char *encoding = "charmap";
5625 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005626 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 /* find all unencodable characters */
5629 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005630 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005631 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 int res = encoding_map_lookup(p[collendpos], mapping);
5633 if (res != -1)
5634 break;
5635 ++collendpos;
5636 continue;
5637 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005638
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 rep = charmapencode_lookup(p[collendpos], mapping);
5640 if (rep==NULL)
5641 return -1;
5642 else if (rep!=Py_None) {
5643 Py_DECREF(rep);
5644 break;
5645 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005646 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 }
5649 /* cache callback name lookup
5650 * (if not done yet, i.e. it's the first error) */
5651 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 if ((errors==NULL) || (!strcmp(errors, "strict")))
5653 *known_errorHandler = 1;
5654 else if (!strcmp(errors, "replace"))
5655 *known_errorHandler = 2;
5656 else if (!strcmp(errors, "ignore"))
5657 *known_errorHandler = 3;
5658 else if (!strcmp(errors, "xmlcharrefreplace"))
5659 *known_errorHandler = 4;
5660 else
5661 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005662 }
5663 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005664 case 1: /* strict */
5665 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5666 return -1;
5667 case 2: /* replace */
5668 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 x = charmapencode_output('?', mapping, res, respos);
5670 if (x==enc_EXCEPTION) {
5671 return -1;
5672 }
5673 else if (x==enc_FAILED) {
5674 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5675 return -1;
5676 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005677 }
5678 /* fall through */
5679 case 3: /* ignore */
5680 *inpos = collendpos;
5681 break;
5682 case 4: /* xmlcharrefreplace */
5683 /* generate replacement (temporarily (mis)uses p) */
5684 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 char buffer[2+29+1+1];
5686 char *cp;
5687 sprintf(buffer, "&#%d;", (int)p[collpos]);
5688 for (cp = buffer; *cp; ++cp) {
5689 x = charmapencode_output(*cp, mapping, res, respos);
5690 if (x==enc_EXCEPTION)
5691 return -1;
5692 else if (x==enc_FAILED) {
5693 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5694 return -1;
5695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005696 }
5697 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005698 *inpos = collendpos;
5699 break;
5700 default:
5701 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 encoding, reason, p, size, exceptionObject,
5703 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005704 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005706 if (PyBytes_Check(repunicode)) {
5707 /* Directly copy bytes result to output. */
5708 Py_ssize_t outsize = PyBytes_Size(*res);
5709 Py_ssize_t requiredsize;
5710 repsize = PyBytes_Size(repunicode);
5711 requiredsize = *respos + repsize;
5712 if (requiredsize > outsize)
5713 /* Make room for all additional bytes. */
5714 if (charmapencode_resize(res, respos, requiredsize)) {
5715 Py_DECREF(repunicode);
5716 return -1;
5717 }
5718 memcpy(PyBytes_AsString(*res) + *respos,
5719 PyBytes_AsString(repunicode), repsize);
5720 *respos += repsize;
5721 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005722 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005723 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005725 /* generate replacement */
5726 repsize = PyUnicode_GET_SIZE(repunicode);
5727 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 x = charmapencode_output(*uni2, mapping, res, respos);
5729 if (x==enc_EXCEPTION) {
5730 return -1;
5731 }
5732 else if (x==enc_FAILED) {
5733 Py_DECREF(repunicode);
5734 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5735 return -1;
5736 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005737 }
5738 *inpos = newpos;
5739 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 }
5741 return 0;
5742}
5743
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 Py_ssize_t size,
5746 PyObject *mapping,
5747 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 /* output object */
5750 PyObject *res = NULL;
5751 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005754 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 PyObject *errorHandler = NULL;
5756 PyObject *exc = NULL;
5757 /* the following variable is used for caching string comparisons
5758 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5759 * 3=ignore, 4=xmlcharrefreplace */
5760 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
5762 /* Default to Latin-1 */
5763 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 /* allocate enough for a simple encoding without
5767 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005768 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 if (res == NULL)
5770 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005771 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 /* try to encode it */
5776 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5777 if (x==enc_EXCEPTION) /* error */
5778 goto onError;
5779 if (x==enc_FAILED) { /* unencodable character */
5780 if (charmap_encoding_error(p, size, &inpos, mapping,
5781 &exc,
5782 &known_errorHandler, &errorHandler, errors,
5783 &res, &respos)) {
5784 goto onError;
5785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005786 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 else
5788 /* done with this character => adjust input position */
5789 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005793 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005794 if (_PyBytes_Resize(&res, respos) < 0)
5795 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005796
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 Py_XDECREF(exc);
5798 Py_XDECREF(errorHandler);
5799 return res;
5800
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 Py_XDECREF(res);
5803 Py_XDECREF(exc);
5804 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 return NULL;
5806}
5807
5808PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810{
5811 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 PyErr_BadArgument();
5813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 }
5815 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005816 PyUnicode_GET_SIZE(unicode),
5817 mapping,
5818 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819}
5820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821/* create or adjust a UnicodeTranslateError */
5822static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 const Py_UNICODE *unicode, Py_ssize_t size,
5824 Py_ssize_t startpos, Py_ssize_t endpos,
5825 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005827 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005828 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 }
5831 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5833 goto onError;
5834 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5835 goto onError;
5836 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5837 goto onError;
5838 return;
5839 onError:
5840 Py_DECREF(*exceptionObject);
5841 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 }
5843}
5844
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845/* raises a UnicodeTranslateError */
5846static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 const Py_UNICODE *unicode, Py_ssize_t size,
5848 Py_ssize_t startpos, Py_ssize_t endpos,
5849 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850{
5851 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855}
5856
5857/* error handling callback helper:
5858 build arguments, call the callback and check the arguments,
5859 put the result into newpos and return the replacement string, which
5860 has to be freed by the caller */
5861static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005862 PyObject **errorHandler,
5863 const char *reason,
5864 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5865 Py_ssize_t startpos, Py_ssize_t endpos,
5866 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005868 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005870 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 PyObject *restuple;
5872 PyObject *resunicode;
5873
5874 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 }
5879
5880 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884
5885 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005890 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 Py_DECREF(restuple);
5892 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 }
5894 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 &resunicode, &i_newpos)) {
5896 Py_DECREF(restuple);
5897 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005899 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005901 else
5902 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005903 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5905 Py_DECREF(restuple);
5906 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 Py_INCREF(resunicode);
5909 Py_DECREF(restuple);
5910 return resunicode;
5911}
5912
5913/* Lookup the character ch in the mapping and put the result in result,
5914 which must be decrefed by the caller.
5915 Return 0 on success, -1 on error */
5916static
5917int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5918{
Christian Heimes217cfd12007-12-02 14:31:20 +00005919 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 PyObject *x;
5921
5922 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924 x = PyObject_GetItem(mapping, w);
5925 Py_DECREF(w);
5926 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5928 /* No mapping found means: use 1:1 mapping. */
5929 PyErr_Clear();
5930 *result = NULL;
5931 return 0;
5932 } else
5933 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005934 }
5935 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 *result = x;
5937 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005939 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 long value = PyLong_AS_LONG(x);
5941 long max = PyUnicode_GetMax();
5942 if (value < 0 || value > max) {
5943 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005944 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 Py_DECREF(x);
5946 return -1;
5947 }
5948 *result = x;
5949 return 0;
5950 }
5951 else if (PyUnicode_Check(x)) {
5952 *result = x;
5953 return 0;
5954 }
5955 else {
5956 /* wrong return value */
5957 PyErr_SetString(PyExc_TypeError,
5958 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005959 Py_DECREF(x);
5960 return -1;
5961 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005962}
5963/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 if not reallocate and adjust various state variables.
5965 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966static
Walter Dörwald4894c302003-10-24 14:25:28 +00005967int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005969{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005970 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005971 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 /* remember old output position */
5973 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5974 /* exponentially overallocate to minimize reallocations */
5975 if (requiredsize < 2 * oldsize)
5976 requiredsize = 2 * oldsize;
5977 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5978 return -1;
5979 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980 }
5981 return 0;
5982}
5983/* lookup the character, put the result in the output string and adjust
5984 various state variables. Return a new reference to the object that
5985 was put in the output buffer in *result, or Py_None, if the mapping was
5986 undefined (in which case no character was written).
5987 The called must decref result.
5988 Return 0 on success, -1 on error. */
5989static
Walter Dörwald4894c302003-10-24 14:25:28 +00005990int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5992 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005993{
Walter Dörwald4894c302003-10-24 14:25:28 +00005994 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 /* not found => default to 1:1 mapping */
5998 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005999 }
6000 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006002 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* no overflow check, because we know that the space is enough */
6004 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005 }
6006 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6008 if (repsize==1) {
6009 /* no overflow check, because we know that the space is enough */
6010 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6011 }
6012 else if (repsize!=0) {
6013 /* more than one character */
6014 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6015 (insize - (curinp-startinp)) +
6016 repsize - 1;
6017 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6018 return -1;
6019 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6020 *outp += repsize;
6021 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 }
6023 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006025 return 0;
6026}
6027
6028PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 Py_ssize_t size,
6030 PyObject *mapping,
6031 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 /* output object */
6034 PyObject *res = NULL;
6035 /* pointers to the beginning and end+1 of input */
6036 const Py_UNICODE *startp = p;
6037 const Py_UNICODE *endp = p + size;
6038 /* pointer into the output */
6039 Py_UNICODE *str;
6040 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006041 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 char *reason = "character maps to <undefined>";
6043 PyObject *errorHandler = NULL;
6044 PyObject *exc = NULL;
6045 /* the following variable is used for caching string comparisons
6046 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6047 * 3=ignore, 4=xmlcharrefreplace */
6048 int known_errorHandler = -1;
6049
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 PyErr_BadArgument();
6052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054
6055 /* allocate enough for a simple 1:1 translation without
6056 replacements, if we need more, we'll resize */
6057 res = PyUnicode_FromUnicode(NULL, size);
6058 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006062 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 /* try to encode it */
6066 PyObject *x = NULL;
6067 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6068 Py_XDECREF(x);
6069 goto onError;
6070 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006071 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 if (x!=Py_None) /* it worked => adjust input pointer */
6073 ++p;
6074 else { /* untranslatable character */
6075 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6076 Py_ssize_t repsize;
6077 Py_ssize_t newpos;
6078 Py_UNICODE *uni2;
6079 /* startpos for collecting untranslatable chars */
6080 const Py_UNICODE *collstart = p;
6081 const Py_UNICODE *collend = p+1;
6082 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 /* find all untranslatable characters */
6085 while (collend < endp) {
6086 if (charmaptranslate_lookup(*collend, mapping, &x))
6087 goto onError;
6088 Py_XDECREF(x);
6089 if (x!=Py_None)
6090 break;
6091 ++collend;
6092 }
6093 /* cache callback name lookup
6094 * (if not done yet, i.e. it's the first error) */
6095 if (known_errorHandler==-1) {
6096 if ((errors==NULL) || (!strcmp(errors, "strict")))
6097 known_errorHandler = 1;
6098 else if (!strcmp(errors, "replace"))
6099 known_errorHandler = 2;
6100 else if (!strcmp(errors, "ignore"))
6101 known_errorHandler = 3;
6102 else if (!strcmp(errors, "xmlcharrefreplace"))
6103 known_errorHandler = 4;
6104 else
6105 known_errorHandler = 0;
6106 }
6107 switch (known_errorHandler) {
6108 case 1: /* strict */
6109 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006110 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 case 2: /* replace */
6112 /* No need to check for space, this is a 1:1 replacement */
6113 for (coll = collstart; coll<collend; ++coll)
6114 *str++ = '?';
6115 /* fall through */
6116 case 3: /* ignore */
6117 p = collend;
6118 break;
6119 case 4: /* xmlcharrefreplace */
6120 /* generate replacement (temporarily (mis)uses p) */
6121 for (p = collstart; p < collend; ++p) {
6122 char buffer[2+29+1+1];
6123 char *cp;
6124 sprintf(buffer, "&#%d;", (int)*p);
6125 if (charmaptranslate_makespace(&res, &str,
6126 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6127 goto onError;
6128 for (cp = buffer; *cp; ++cp)
6129 *str++ = *cp;
6130 }
6131 p = collend;
6132 break;
6133 default:
6134 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6135 reason, startp, size, &exc,
6136 collstart-startp, collend-startp, &newpos);
6137 if (repunicode == NULL)
6138 goto onError;
6139 /* generate replacement */
6140 repsize = PyUnicode_GET_SIZE(repunicode);
6141 if (charmaptranslate_makespace(&res, &str,
6142 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6143 Py_DECREF(repunicode);
6144 goto onError;
6145 }
6146 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6147 *str++ = *uni2;
6148 p = startp + newpos;
6149 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006150 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006151 }
6152 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153 /* Resize if we allocated to much */
6154 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006155 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 if (PyUnicode_Resize(&res, respos) < 0)
6157 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 }
6159 Py_XDECREF(exc);
6160 Py_XDECREF(errorHandler);
6161 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 Py_XDECREF(res);
6165 Py_XDECREF(exc);
6166 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 return NULL;
6168}
6169
6170PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 PyObject *mapping,
6172 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
6174 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006175
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 str = PyUnicode_FromObject(str);
6177 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 PyUnicode_GET_SIZE(str),
6181 mapping,
6182 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 Py_DECREF(str);
6184 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006185
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 Py_XDECREF(str);
6188 return NULL;
6189}
Tim Petersced69f82003-09-16 20:30:58 +00006190
Guido van Rossum9e896b32000-04-05 20:11:21 +00006191/* --- Decimal Encoder ---------------------------------------------------- */
6192
6193int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 Py_ssize_t length,
6195 char *output,
6196 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006197{
6198 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 PyObject *errorHandler = NULL;
6200 PyObject *exc = NULL;
6201 const char *encoding = "decimal";
6202 const char *reason = "invalid decimal Unicode string";
6203 /* the following variable is used for caching string comparisons
6204 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6205 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006206
6207 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 PyErr_BadArgument();
6209 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006210 }
6211
6212 p = s;
6213 end = s + length;
6214 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 register Py_UNICODE ch = *p;
6216 int decimal;
6217 PyObject *repunicode;
6218 Py_ssize_t repsize;
6219 Py_ssize_t newpos;
6220 Py_UNICODE *uni2;
6221 Py_UNICODE *collstart;
6222 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006223
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006225 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 ++p;
6227 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006228 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 decimal = Py_UNICODE_TODECIMAL(ch);
6230 if (decimal >= 0) {
6231 *output++ = '0' + decimal;
6232 ++p;
6233 continue;
6234 }
6235 if (0 < ch && ch < 256) {
6236 *output++ = (char)ch;
6237 ++p;
6238 continue;
6239 }
6240 /* All other characters are considered unencodable */
6241 collstart = p;
6242 collend = p+1;
6243 while (collend < end) {
6244 if ((0 < *collend && *collend < 256) ||
6245 !Py_UNICODE_ISSPACE(*collend) ||
6246 Py_UNICODE_TODECIMAL(*collend))
6247 break;
6248 }
6249 /* cache callback name lookup
6250 * (if not done yet, i.e. it's the first error) */
6251 if (known_errorHandler==-1) {
6252 if ((errors==NULL) || (!strcmp(errors, "strict")))
6253 known_errorHandler = 1;
6254 else if (!strcmp(errors, "replace"))
6255 known_errorHandler = 2;
6256 else if (!strcmp(errors, "ignore"))
6257 known_errorHandler = 3;
6258 else if (!strcmp(errors, "xmlcharrefreplace"))
6259 known_errorHandler = 4;
6260 else
6261 known_errorHandler = 0;
6262 }
6263 switch (known_errorHandler) {
6264 case 1: /* strict */
6265 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6266 goto onError;
6267 case 2: /* replace */
6268 for (p = collstart; p < collend; ++p)
6269 *output++ = '?';
6270 /* fall through */
6271 case 3: /* ignore */
6272 p = collend;
6273 break;
6274 case 4: /* xmlcharrefreplace */
6275 /* generate replacement (temporarily (mis)uses p) */
6276 for (p = collstart; p < collend; ++p)
6277 output += sprintf(output, "&#%d;", (int)*p);
6278 p = collend;
6279 break;
6280 default:
6281 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6282 encoding, reason, s, length, &exc,
6283 collstart-s, collend-s, &newpos);
6284 if (repunicode == NULL)
6285 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006286 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006287 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006288 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6289 Py_DECREF(repunicode);
6290 goto onError;
6291 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 /* generate replacement */
6293 repsize = PyUnicode_GET_SIZE(repunicode);
6294 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6295 Py_UNICODE ch = *uni2;
6296 if (Py_UNICODE_ISSPACE(ch))
6297 *output++ = ' ';
6298 else {
6299 decimal = Py_UNICODE_TODECIMAL(ch);
6300 if (decimal >= 0)
6301 *output++ = '0' + decimal;
6302 else if (0 < ch && ch < 256)
6303 *output++ = (char)ch;
6304 else {
6305 Py_DECREF(repunicode);
6306 raise_encode_exception(&exc, encoding,
6307 s, length, collstart-s, collend-s, reason);
6308 goto onError;
6309 }
6310 }
6311 }
6312 p = s + newpos;
6313 Py_DECREF(repunicode);
6314 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006315 }
6316 /* 0-terminate the output string */
6317 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006318 Py_XDECREF(exc);
6319 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006320 return 0;
6321
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006323 Py_XDECREF(exc);
6324 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006325 return -1;
6326}
6327
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328/* --- Helpers ------------------------------------------------------------ */
6329
Eric Smith8c663262007-08-25 02:26:07 +00006330#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006331#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006332
Thomas Wouters477c8d52006-05-27 19:21:47 +00006333#include "stringlib/count.h"
6334#include "stringlib/find.h"
6335#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006336#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006337
Eric Smith5807c412008-05-11 21:00:57 +00006338#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006339#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006340#include "stringlib/localeutil.h"
6341
/* Helper macro to fix up start/end slice values in place.

   end is clipped to len when it is too large; a negative end or start
   is taken relative to len and then clipped to 0.  start and end must
   be modifiable lvalues.  All three arguments are evaluated more than
   once, so they must not have side effects.

   The body is wrapped in do { } while (0) so that the macro behaves as
   a single statement (e.g. as the body of an unbraced if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006356
Martin v. Löwis18e16552006-02-15 17:27:45 +00006357Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006358 PyObject *substr,
6359 Py_ssize_t start,
6360 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006362 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006363 PyUnicodeObject* str_obj;
6364 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006365
Thomas Wouters477c8d52006-05-27 19:21:47 +00006366 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6367 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006369 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6370 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 Py_DECREF(str_obj);
6372 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 }
Tim Petersced69f82003-09-16 20:30:58 +00006374
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006375 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006376 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006377 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6378 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006379 );
6380
6381 Py_DECREF(sub_obj);
6382 Py_DECREF(str_obj);
6383
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 return result;
6385}
6386
Martin v. Löwis18e16552006-02-15 17:27:45 +00006387Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006388 PyObject *sub,
6389 Py_ssize_t start,
6390 Py_ssize_t end,
6391 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006392{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006393 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006394
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006396 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006398 sub = PyUnicode_FromObject(sub);
6399 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 Py_DECREF(str);
6401 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 }
Tim Petersced69f82003-09-16 20:30:58 +00006403
Thomas Wouters477c8d52006-05-27 19:21:47 +00006404 if (direction > 0)
6405 result = stringlib_find_slice(
6406 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6407 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6408 start, end
6409 );
6410 else
6411 result = stringlib_rfind_slice(
6412 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6413 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6414 start, end
6415 );
6416
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006418 Py_DECREF(sub);
6419
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 return result;
6421}
6422
Tim Petersced69f82003-09-16 20:30:58 +00006423static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 PyUnicodeObject *substring,
6426 Py_ssize_t start,
6427 Py_ssize_t end,
6428 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 if (substring->length == 0)
6431 return 1;
6432
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006433 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 end -= substring->length;
6435 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437
6438 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 if (Py_UNICODE_MATCH(self, end, substring))
6440 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 } else {
6442 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
6445
6446 return 0;
6447}
6448
Martin v. Löwis18e16552006-02-15 17:27:45 +00006449Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 PyObject *substr,
6451 Py_ssize_t start,
6452 Py_ssize_t end,
6453 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006455 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006456
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 str = PyUnicode_FromObject(str);
6458 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 substr = PyUnicode_FromObject(substr);
6461 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 Py_DECREF(str);
6463 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 }
Tim Petersced69f82003-09-16 20:30:58 +00006465
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 (PyUnicodeObject *)substr,
6468 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 Py_DECREF(str);
6470 Py_DECREF(substr);
6471 return result;
6472}
6473
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474/* Apply fixfct filter to the Unicode object self and return a
6475 reference to the modified object */
6476
Tim Petersced69f82003-09-16 20:30:58 +00006477static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480{
6481
6482 PyUnicodeObject *u;
6483
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006484 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006487
6488 Py_UNICODE_COPY(u->str, self->str, self->length);
6489
Tim Peters7a29bd52001-09-12 03:03:31 +00006490 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 /* fixfct should return TRUE if it modified the buffer. If
6492 FALSE, return a reference to the original buffer instead
6493 (to save space, not time) */
6494 Py_INCREF(self);
6495 Py_DECREF(u);
6496 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 }
6498 return (PyObject*) u;
6499}
6500
Tim Petersced69f82003-09-16 20:30:58 +00006501static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502int fixupper(PyUnicodeObject *self)
6503{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006504 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 Py_UNICODE *s = self->str;
6506 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006507
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006510
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 ch = Py_UNICODE_TOUPPER(*s);
6512 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 *s = ch;
6515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 s++;
6517 }
6518
6519 return status;
6520}
6521
Tim Petersced69f82003-09-16 20:30:58 +00006522static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523int fixlower(PyUnicodeObject *self)
6524{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006525 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 Py_UNICODE *s = self->str;
6527 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006528
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006531
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 ch = Py_UNICODE_TOLOWER(*s);
6533 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 *s = ch;
6536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 s++;
6538 }
6539
6540 return status;
6541}
6542
Tim Petersced69f82003-09-16 20:30:58 +00006543static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544int fixswapcase(PyUnicodeObject *self)
6545{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006546 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 Py_UNICODE *s = self->str;
6548 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 while (len-- > 0) {
6551 if (Py_UNICODE_ISUPPER(*s)) {
6552 *s = Py_UNICODE_TOLOWER(*s);
6553 status = 1;
6554 } else if (Py_UNICODE_ISLOWER(*s)) {
6555 *s = Py_UNICODE_TOUPPER(*s);
6556 status = 1;
6557 }
6558 s++;
6559 }
6560
6561 return status;
6562}
6563
Tim Petersced69f82003-09-16 20:30:58 +00006564static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565int fixcapitalize(PyUnicodeObject *self)
6566{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006567 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006568 Py_UNICODE *s = self->str;
6569 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006570
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006571 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006573 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 *s = Py_UNICODE_TOUPPER(*s);
6575 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006577 s++;
6578 while (--len > 0) {
6579 if (Py_UNICODE_ISUPPER(*s)) {
6580 *s = Py_UNICODE_TOLOWER(*s);
6581 status = 1;
6582 }
6583 s++;
6584 }
6585 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586}
6587
6588static
6589int fixtitle(PyUnicodeObject *self)
6590{
6591 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6592 register Py_UNICODE *e;
6593 int previous_is_cased;
6594
6595 /* Shortcut for single character strings */
6596 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6598 if (*p != ch) {
6599 *p = ch;
6600 return 1;
6601 }
6602 else
6603 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 }
Tim Petersced69f82003-09-16 20:30:58 +00006605
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 e = p + PyUnicode_GET_SIZE(self);
6607 previous_is_cased = 0;
6608 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006610
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 if (previous_is_cased)
6612 *p = Py_UNICODE_TOLOWER(ch);
6613 else
6614 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006615
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 if (Py_UNICODE_ISLOWER(ch) ||
6617 Py_UNICODE_ISUPPER(ch) ||
6618 Py_UNICODE_ISTITLE(ch))
6619 previous_is_cased = 1;
6620 else
6621 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 }
6623 return 1;
6624}
6625
Tim Peters8ce9f162004-08-27 01:49:32 +00006626PyObject *
6627PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
Skip Montanaro6543b452004-09-16 03:28:13 +00006629 const Py_UNICODE blank = ' ';
6630 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006631 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006632 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006633 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6634 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006635 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6636 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006637 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006638 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639
Tim Peters05eba1f2004-08-27 21:32:02 +00006640 fseq = PySequence_Fast(seq, "");
6641 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006642 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006643 }
6644
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006645 /* NOTE: the following code can't call back into Python code,
6646 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006647 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006648
Tim Peters05eba1f2004-08-27 21:32:02 +00006649 seqlen = PySequence_Fast_GET_SIZE(fseq);
6650 /* If empty sequence, return u"". */
6651 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006652 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6653 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006654 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006655 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006656 /* If singleton sequence with an exact Unicode, return that. */
6657 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 item = items[0];
6659 if (PyUnicode_CheckExact(item)) {
6660 Py_INCREF(item);
6661 res = (PyUnicodeObject *)item;
6662 goto Done;
6663 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006664 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006665 else {
6666 /* Set up sep and seplen */
6667 if (separator == NULL) {
6668 sep = &blank;
6669 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006670 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006671 else {
6672 if (!PyUnicode_Check(separator)) {
6673 PyErr_Format(PyExc_TypeError,
6674 "separator: expected str instance,"
6675 " %.80s found",
6676 Py_TYPE(separator)->tp_name);
6677 goto onError;
6678 }
6679 sep = PyUnicode_AS_UNICODE(separator);
6680 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006681 }
6682 }
6683
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006684 /* There are at least two things to join, or else we have a subclass
6685 * of str in the sequence.
6686 * Do a pre-pass to figure out the total amount of space we'll
6687 * need (sz), and see whether all argument are strings.
6688 */
6689 sz = 0;
6690 for (i = 0; i < seqlen; i++) {
6691 const Py_ssize_t old_sz = sz;
6692 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 if (!PyUnicode_Check(item)) {
6694 PyErr_Format(PyExc_TypeError,
6695 "sequence item %zd: expected str instance,"
6696 " %.80s found",
6697 i, Py_TYPE(item)->tp_name);
6698 goto onError;
6699 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006700 sz += PyUnicode_GET_SIZE(item);
6701 if (i != 0)
6702 sz += seplen;
6703 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6704 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006706 goto onError;
6707 }
6708 }
Tim Petersced69f82003-09-16 20:30:58 +00006709
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006710 res = _PyUnicode_New(sz);
6711 if (res == NULL)
6712 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006713
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006714 /* Catenate everything. */
6715 res_p = PyUnicode_AS_UNICODE(res);
6716 for (i = 0; i < seqlen; ++i) {
6717 Py_ssize_t itemlen;
6718 item = items[i];
6719 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 /* Copy item, and maybe the separator. */
6721 if (i) {
6722 Py_UNICODE_COPY(res_p, sep, seplen);
6723 res_p += seplen;
6724 }
6725 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6726 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006727 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006728
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006730 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 return (PyObject *)res;
6732
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006734 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006735 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 return NULL;
6737}
6738
Tim Petersced69f82003-09-16 20:30:58 +00006739static
6740PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 Py_ssize_t left,
6742 Py_ssize_t right,
6743 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744{
6745 PyUnicodeObject *u;
6746
6747 if (left < 0)
6748 left = 0;
6749 if (right < 0)
6750 right = 0;
6751
Tim Peters7a29bd52001-09-12 03:03:31 +00006752 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 Py_INCREF(self);
6754 return self;
6755 }
6756
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006757 if (left > PY_SSIZE_T_MAX - self->length ||
6758 right > PY_SSIZE_T_MAX - (left + self->length)) {
6759 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6760 return NULL;
6761 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 u = _PyUnicode_New(left + self->length + right);
6763 if (u) {
6764 if (left)
6765 Py_UNICODE_FILL(u->str, fill, left);
6766 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6767 if (right)
6768 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6769 }
6770
6771 return u;
6772}
6773
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006774PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777
6778 string = PyUnicode_FromObject(string);
6779 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006782 list = stringlib_splitlines(
6783 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6784 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785
6786 Py_DECREF(string);
6787 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788}
6789
Tim Petersced69f82003-09-16 20:30:58 +00006790static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 PyUnicodeObject *substring,
6793 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006796 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006799 return stringlib_split_whitespace(
6800 (PyObject*) self, self->str, self->length, maxcount
6801 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006803 return stringlib_split(
6804 (PyObject*) self, self->str, self->length,
6805 substring->str, substring->length,
6806 maxcount
6807 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808}
6809
Tim Petersced69f82003-09-16 20:30:58 +00006810static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006811PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 PyUnicodeObject *substring,
6813 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006814{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006815 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006816 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006817
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006818 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006819 return stringlib_rsplit_whitespace(
6820 (PyObject*) self, self->str, self->length, maxcount
6821 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006822
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006823 return stringlib_rsplit(
6824 (PyObject*) self, self->str, self->length,
6825 substring->str, substring->length,
6826 maxcount
6827 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006828}
6829
6830static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 PyUnicodeObject *str1,
6833 PyUnicodeObject *str2,
6834 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835{
6836 PyUnicodeObject *u;
6837
6838 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006840 else if (maxcount == 0 || self->length == 0)
6841 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842
Thomas Wouters477c8d52006-05-27 19:21:47 +00006843 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006844 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006845 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006846 if (str1->length == 0)
6847 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006848 if (str1->length == 1) {
6849 /* replace characters */
6850 Py_UNICODE u1, u2;
6851 if (!findchar(self->str, self->length, str1->str[0]))
6852 goto nothing;
6853 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6854 if (!u)
6855 return NULL;
6856 Py_UNICODE_COPY(u->str, self->str, self->length);
6857 u1 = str1->str[0];
6858 u2 = str2->str[0];
6859 for (i = 0; i < u->length; i++)
6860 if (u->str[i] == u1) {
6861 if (--maxcount < 0)
6862 break;
6863 u->str[i] = u2;
6864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006866 i = stringlib_find(
6867 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006869 if (i < 0)
6870 goto nothing;
6871 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6872 if (!u)
6873 return NULL;
6874 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006875
6876 /* change everything in-place, starting with this one */
6877 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6878 i += str1->length;
6879
6880 while ( --maxcount > 0) {
6881 i = stringlib_find(self->str+i, self->length-i,
6882 str1->str, str1->length,
6883 i);
6884 if (i == -1)
6885 break;
6886 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6887 i += str1->length;
6888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006891
6892 Py_ssize_t n, i, j, e;
6893 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 Py_UNICODE *p;
6895
6896 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006897 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6898 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006899 if (n == 0)
6900 goto nothing;
6901 /* new_size = self->length + n * (str2->length - str1->length)); */
6902 delta = (str2->length - str1->length);
6903 if (delta == 0) {
6904 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006906 product = n * (str2->length - str1->length);
6907 if ((product / (str2->length - str1->length)) != n) {
6908 PyErr_SetString(PyExc_OverflowError,
6909 "replace string is too long");
6910 return NULL;
6911 }
6912 new_size = self->length + product;
6913 if (new_size < 0) {
6914 PyErr_SetString(PyExc_OverflowError,
6915 "replace string is too long");
6916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 }
6918 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006919 u = _PyUnicode_New(new_size);
6920 if (!u)
6921 return NULL;
6922 i = 0;
6923 p = u->str;
6924 e = self->length - str1->length;
6925 if (str1->length > 0) {
6926 while (n-- > 0) {
6927 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006928 j = stringlib_find(self->str+i, self->length-i,
6929 str1->str, str1->length,
6930 i);
6931 if (j == -1)
6932 break;
6933 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006934 /* copy unchanged part [i:j] */
6935 Py_UNICODE_COPY(p, self->str+i, j-i);
6936 p += j - i;
6937 }
6938 /* copy substitution string */
6939 if (str2->length > 0) {
6940 Py_UNICODE_COPY(p, str2->str, str2->length);
6941 p += str2->length;
6942 }
6943 i = j + str1->length;
6944 }
6945 if (i < self->length)
6946 /* copy tail [i:] */
6947 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6948 } else {
6949 /* interleave */
6950 while (n > 0) {
6951 Py_UNICODE_COPY(p, str2->str, str2->length);
6952 p += str2->length;
6953 if (--n <= 0)
6954 break;
6955 *p++ = self->str[i++];
6956 }
6957 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6958 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006961
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006963 /* nothing to replace; return original string (when possible) */
6964 if (PyUnicode_CheckExact(self)) {
6965 Py_INCREF(self);
6966 return (PyObject *) self;
6967 }
6968 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969}
6970
6971/* --- Unicode Object Methods --------------------------------------------- */
6972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006973PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975\n\
6976Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006977characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978
6979static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006980unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 return fixup(self, fixtitle);
6983}
6984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006985PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987\n\
6988Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00006989have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990
6991static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006992unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 return fixup(self, fixcapitalize);
6995}
6996
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits self on whitespace, capitalizes each
   word in place in the list and joins the words again with the default
   separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot with the new object */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                  fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
7034
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007035/* Argument converter. Coerces to a single unicode character */
7036
7037static int
7038convert_uc(PyObject *obj, void *addr)
7039{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007040 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7041 PyObject *uniobj;
7042 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007043
Benjamin Peterson14339b62009-01-31 16:36:08 +00007044 uniobj = PyUnicode_FromObject(obj);
7045 if (uniobj == NULL) {
7046 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007048 return 0;
7049 }
7050 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7051 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007053 Py_DECREF(uniobj);
7054 return 0;
7055 }
7056 unistr = PyUnicode_AS_UNICODE(uniobj);
7057 *fillcharloc = unistr[0];
7058 Py_DECREF(uniobj);
7059 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007060}
7061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007062PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007065Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007066done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067
7068static PyObject *
7069unicode_center(PyUnicodeObject *self, PyObject *args)
7070{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007071 Py_ssize_t marg, left;
7072 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007073 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074
Thomas Woutersde017742006-02-16 19:34:37 +00007075 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 return NULL;
7077
Tim Peters7a29bd52001-09-12 03:03:31 +00007078 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 Py_INCREF(self);
7080 return (PyObject*) self;
7081 }
7082
7083 marg = width - self->length;
7084 left = marg / 2 + (marg & width & 1);
7085
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007086 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087}
7088
Marc-André Lemburge5034372000-08-08 08:04:29 +00007089#if 0
7090
7091/* This code should go into some future Unicode collation support
7092 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007093 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007094
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007095/* speedy UTF-16 code point order comparison */
7096/* gleaned from: */
7097/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7098
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007099static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007100{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007101 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007102 0, 0, 0, 0, 0, 0, 0, 0,
7103 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007104 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007105};
7106
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107static int
7108unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7109{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007110 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007111
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 Py_UNICODE *s1 = str1->str;
7113 Py_UNICODE *s2 = str2->str;
7114
7115 len1 = str1->length;
7116 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007117
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007119 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007120
7121 c1 = *s1++;
7122 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007123
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 if (c1 > (1<<11) * 26)
7125 c1 += utf16Fixup[c1>>11];
7126 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007127 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007128 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007129
7130 if (c1 != c2)
7131 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007132
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007133 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134 }
7135
7136 return (len1 < len2) ? -1 : (len1 != len2);
7137}
7138
Marc-André Lemburge5034372000-08-08 08:04:29 +00007139#else
7140
7141static int
7142unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7143{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007144 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007145
7146 Py_UNICODE *s1 = str1->str;
7147 Py_UNICODE *s2 = str2->str;
7148
7149 len1 = str1->length;
7150 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007151
Marc-André Lemburge5034372000-08-08 08:04:29 +00007152 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007153 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007154
Fredrik Lundh45714e92001-06-26 16:39:36 +00007155 c1 = *s1++;
7156 c2 = *s2++;
7157
7158 if (c1 != c2)
7159 return (c1 < c2) ? -1 : 1;
7160
Marc-André Lemburge5034372000-08-08 08:04:29 +00007161 len1--; len2--;
7162 }
7163
7164 return (len1 < len2) ? -1 : (len1 != len2);
7165}
7166
7167#endif
7168
Guido van Rossumd57fd912000-03-10 22:53:23 +00007169int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007172 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7173 return unicode_compare((PyUnicodeObject *)left,
7174 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007175 PyErr_Format(PyExc_TypeError,
7176 "Can't compare %.100s and %.100s",
7177 left->ob_type->tp_name,
7178 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 return -1;
7180}
7181
Martin v. Löwis5b222132007-06-10 09:51:05 +00007182int
7183PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7184{
7185 int i;
7186 Py_UNICODE *id;
7187 assert(PyUnicode_Check(uni));
7188 id = PyUnicode_AS_UNICODE(uni);
7189 /* Compare Unicode string and source character set string */
7190 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 if (id[i] != str[i])
7192 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007193 /* This check keeps Python strings that end in '\0' from comparing equal
7194 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007195 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007197 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007199 return 0;
7200}
7201
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007202
Benjamin Peterson29060642009-01-31 22:14:21 +00007203#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007204 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007205
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007206PyObject *PyUnicode_RichCompare(PyObject *left,
7207 PyObject *right,
7208 int op)
7209{
7210 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007211
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007212 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7213 PyObject *v;
7214 if (((PyUnicodeObject *) left)->length !=
7215 ((PyUnicodeObject *) right)->length) {
7216 if (op == Py_EQ) {
7217 Py_INCREF(Py_False);
7218 return Py_False;
7219 }
7220 if (op == Py_NE) {
7221 Py_INCREF(Py_True);
7222 return Py_True;
7223 }
7224 }
7225 if (left == right)
7226 result = 0;
7227 else
7228 result = unicode_compare((PyUnicodeObject *)left,
7229 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007230
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007231 /* Convert the return value to a Boolean */
7232 switch (op) {
7233 case Py_EQ:
7234 v = TEST_COND(result == 0);
7235 break;
7236 case Py_NE:
7237 v = TEST_COND(result != 0);
7238 break;
7239 case Py_LE:
7240 v = TEST_COND(result <= 0);
7241 break;
7242 case Py_GE:
7243 v = TEST_COND(result >= 0);
7244 break;
7245 case Py_LT:
7246 v = TEST_COND(result == -1);
7247 break;
7248 case Py_GT:
7249 v = TEST_COND(result == 1);
7250 break;
7251 default:
7252 PyErr_BadArgument();
7253 return NULL;
7254 }
7255 Py_INCREF(v);
7256 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007257 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007258
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007259 Py_INCREF(Py_NotImplemented);
7260 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007261}
7262
Guido van Rossum403d68b2000-03-13 15:55:09 +00007263int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007265{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007266 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007267 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007268
7269 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007270 sub = PyUnicode_FromObject(element);
7271 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 PyErr_Format(PyExc_TypeError,
7273 "'in <string>' requires string as left operand, not %s",
7274 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007275 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007276 }
7277
Thomas Wouters477c8d52006-05-27 19:21:47 +00007278 str = PyUnicode_FromObject(container);
7279 if (!str) {
7280 Py_DECREF(sub);
7281 return -1;
7282 }
7283
7284 result = stringlib_contains_obj(str, sub);
7285
7286 Py_DECREF(str);
7287 Py_DECREF(sub);
7288
Guido van Rossum403d68b2000-03-13 15:55:09 +00007289 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007290}
7291
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292/* Concat to string or Unicode object giving a new Unicode object. */
7293
7294PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296{
7297 PyUnicodeObject *u = NULL, *v = NULL, *w;
7298
7299 /* Coerce the two arguments */
7300 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7301 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007302 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7304 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306
7307 /* Shortcuts */
7308 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 Py_DECREF(v);
7310 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 }
7312 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 Py_DECREF(u);
7314 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315 }
7316
7317 /* Concat the two Unicode strings */
7318 w = _PyUnicode_New(u->length + v->length);
7319 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 Py_UNICODE_COPY(w->str, u->str, u->length);
7322 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7323
7324 Py_DECREF(u);
7325 Py_DECREF(v);
7326 return (PyObject *)w;
7327
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 Py_XDECREF(u);
7330 Py_XDECREF(v);
7331 return NULL;
7332}
7333
Walter Dörwald1ab83302007-05-18 17:15:44 +00007334void
7335PyUnicode_Append(PyObject **pleft, PyObject *right)
7336{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007337 PyObject *new;
7338 if (*pleft == NULL)
7339 return;
7340 if (right == NULL || !PyUnicode_Check(*pleft)) {
7341 Py_DECREF(*pleft);
7342 *pleft = NULL;
7343 return;
7344 }
7345 new = PyUnicode_Concat(*pleft, right);
7346 Py_DECREF(*pleft);
7347 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007348}
7349
7350void
7351PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7352{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007353 PyUnicode_Append(pleft, right);
7354 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007355}
7356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007357PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007359\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007360Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007361string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007362interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363
7364static PyObject *
7365unicode_count(PyUnicodeObject *self, PyObject *args)
7366{
7367 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007368 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007369 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370 PyObject *result;
7371
Guido van Rossumb8872e62000-05-09 14:14:27 +00007372 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374 return NULL;
7375
7376 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007377 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007378 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007380
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007381 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007382 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007383 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007384 substring->str, substring->length,
7385 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007386 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387
7388 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007389
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 return result;
7391}
7392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007393PyDoc_STRVAR(encode__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007394 "S.encode([encoding[, errors]]) -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00007396Encode S using the codec registered for encoding. encoding defaults\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007397to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007398handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007399a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7400'xmlcharrefreplace' as well as any other name registered with\n\
7401codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
7403static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007404unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007406 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407 char *encoding = NULL;
7408 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007409 PyObject *v;
Guido van Rossum35d94282007-08-27 18:20:11 +00007410
Benjamin Peterson308d6372009-09-18 21:42:35 +00007411 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7412 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00007414 v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007415 if (v == NULL)
7416 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00007417 if (!PyBytes_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007418 PyErr_Format(PyExc_TypeError,
Guido van Rossumf15a29f2007-05-04 00:41:39 +00007419 "encoder did not return a bytes object "
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007420 "(type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00007421 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007422 Py_DECREF(v);
7423 return NULL;
7424 }
7425 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007426
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00007428 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007429}
7430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007431PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433\n\
7434Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007435If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436
7437static PyObject*
7438unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7439{
7440 Py_UNICODE *e;
7441 Py_UNICODE *p;
7442 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007443 Py_UNICODE *qe;
7444 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 PyUnicodeObject *u;
7446 int tabsize = 8;
7447
7448 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450
Thomas Wouters7e474022000-07-16 12:04:32 +00007451 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007452 i = 0; /* chars up to and including most recent \n or \r */
7453 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7454 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 for (p = self->str; p < e; p++)
7456 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 if (tabsize > 0) {
7458 incr = tabsize - (j % tabsize); /* cannot overflow */
7459 if (j > PY_SSIZE_T_MAX - incr)
7460 goto overflow1;
7461 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007462 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 if (j > PY_SSIZE_T_MAX - 1)
7466 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 j++;
7468 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 if (i > PY_SSIZE_T_MAX - j)
7470 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007472 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473 }
7474 }
7475
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007476 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007478
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479 /* Second pass: create output string and fill it */
7480 u = _PyUnicode_New(i + j);
7481 if (!u)
7482 return NULL;
7483
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007484 j = 0; /* same as in first pass */
7485 q = u->str; /* next output char */
7486 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007487
7488 for (p = self->str; p < e; p++)
7489 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 if (tabsize > 0) {
7491 i = tabsize - (j % tabsize);
7492 j += i;
7493 while (i--) {
7494 if (q >= qe)
7495 goto overflow2;
7496 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007497 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007499 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 else {
7501 if (q >= qe)
7502 goto overflow2;
7503 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007504 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 if (*p == '\n' || *p == '\r')
7506 j = 0;
7507 }
7508
7509 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007510
7511 overflow2:
7512 Py_DECREF(u);
7513 overflow1:
7514 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516}
7517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007518PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520\n\
7521Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007522such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523arguments start and end are interpreted as in slice notation.\n\
7524\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007525Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526
7527static PyObject *
7528unicode_find(PyUnicodeObject *self, PyObject *args)
7529{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007530 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007531 Py_ssize_t start;
7532 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007533 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534
Christian Heimes9cd17752007-11-18 19:35:23 +00007535 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537
Thomas Wouters477c8d52006-05-27 19:21:47 +00007538 result = stringlib_find_slice(
7539 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7540 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7541 start, end
7542 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
7544 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007545
Christian Heimes217cfd12007-12-02 14:31:20 +00007546 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547}
7548
7549static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007550unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551{
7552 if (index < 0 || index >= self->length) {
7553 PyErr_SetString(PyExc_IndexError, "string index out of range");
7554 return NULL;
7555 }
7556
7557 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7558}
7559
Guido van Rossumc2504932007-09-18 19:42:40 +00007560/* Believe it or not, this produces the same value for ASCII strings
7561 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007562static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007563unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564{
Guido van Rossumc2504932007-09-18 19:42:40 +00007565 Py_ssize_t len;
7566 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007567 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007568
7569 if (self->hash != -1)
7570 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007571 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007572 p = self->str;
7573 x = *p << 7;
7574 while (--len >= 0)
7575 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007576 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007577 if (x == -1)
7578 x = -2;
7579 self->hash = x;
7580 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581}
7582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007586Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
7588static PyObject *
7589unicode_index(PyUnicodeObject *self, PyObject *args)
7590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007591 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007592 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007593 Py_ssize_t start;
7594 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595
Christian Heimes9cd17752007-11-18 19:35:23 +00007596 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598
Thomas Wouters477c8d52006-05-27 19:21:47 +00007599 result = stringlib_find_slice(
7600 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7601 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7602 start, end
7603 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
7605 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007606
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 if (result < 0) {
7608 PyErr_SetString(PyExc_ValueError, "substring not found");
7609 return NULL;
7610 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007611
Christian Heimes217cfd12007-12-02 14:31:20 +00007612 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613}
7614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007615PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007618Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007619at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620
7621static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007622unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623{
7624 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7625 register const Py_UNICODE *e;
7626 int cased;
7627
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 /* Shortcut for single character strings */
7629 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007632 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007633 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007635
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 e = p + PyUnicode_GET_SIZE(self);
7637 cased = 0;
7638 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007640
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7642 return PyBool_FromLong(0);
7643 else if (!cased && Py_UNICODE_ISLOWER(ch))
7644 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007646 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647}
7648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007652Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007653at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654
7655static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007656unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657{
7658 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7659 register const Py_UNICODE *e;
7660 int cased;
7661
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 /* Shortcut for single character strings */
7663 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007666 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007667 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007669
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670 e = p + PyUnicode_GET_SIZE(self);
7671 cased = 0;
7672 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007674
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7676 return PyBool_FromLong(0);
7677 else if (!cased && Py_UNICODE_ISUPPER(ch))
7678 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007680 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681}
7682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007683PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007686Return True if S is a titlecased string and there is at least one\n\
7687character in S, i.e. upper- and titlecase characters may only\n\
7688follow uncased characters and lowercase characters only cased ones.\n\
7689Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690
7691static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007692unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693{
7694 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7695 register const Py_UNICODE *e;
7696 int cased, previous_is_cased;
7697
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 /* Shortcut for single character strings */
7699 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7701 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007703 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007704 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007706
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707 e = p + PyUnicode_GET_SIZE(self);
7708 cased = 0;
7709 previous_is_cased = 0;
7710 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007712
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7714 if (previous_is_cased)
7715 return PyBool_FromLong(0);
7716 previous_is_cased = 1;
7717 cased = 1;
7718 }
7719 else if (Py_UNICODE_ISLOWER(ch)) {
7720 if (!previous_is_cased)
7721 return PyBool_FromLong(0);
7722 previous_is_cased = 1;
7723 cased = 1;
7724 }
7725 else
7726 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007728 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729}
7730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007731PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007734Return True if all characters in S are whitespace\n\
7735and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
7737static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007738unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739{
7740 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7741 register const Py_UNICODE *e;
7742
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 /* Shortcut for single character strings */
7744 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 Py_UNICODE_ISSPACE(*p))
7746 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007748 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007749 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007751
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 e = p + PyUnicode_GET_SIZE(self);
7753 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 if (!Py_UNICODE_ISSPACE(*p))
7755 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007757 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758}
7759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007760PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007762\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007763Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007764and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007765
7766static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007767unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007768{
7769 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7770 register const Py_UNICODE *e;
7771
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007772 /* Shortcut for single character strings */
7773 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 Py_UNICODE_ISALPHA(*p))
7775 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007776
7777 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007778 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007780
7781 e = p + PyUnicode_GET_SIZE(self);
7782 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 if (!Py_UNICODE_ISALPHA(*p))
7784 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007785 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007786 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007787}
7788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007789PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007791\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007792Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007793and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007794
7795static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007796unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007797{
7798 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7799 register const Py_UNICODE *e;
7800
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007801 /* Shortcut for single character strings */
7802 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 Py_UNICODE_ISALNUM(*p))
7804 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007805
7806 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007807 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007809
7810 e = p + PyUnicode_GET_SIZE(self);
7811 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 if (!Py_UNICODE_ISALNUM(*p))
7813 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007814 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007815 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007816}
7817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007818PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007821Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007822False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823
7824static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007825unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826{
7827 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7828 register const Py_UNICODE *e;
7829
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 /* Shortcut for single character strings */
7831 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 Py_UNICODE_ISDECIMAL(*p))
7833 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007835 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007836 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007838
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 e = p + PyUnicode_GET_SIZE(self);
7840 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 if (!Py_UNICODE_ISDECIMAL(*p))
7842 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007844 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845}
7846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007847PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007850Return True if all characters in S are digits\n\
7851and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852
7853static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007854unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855{
7856 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7857 register const Py_UNICODE *e;
7858
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 /* Shortcut for single character strings */
7860 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 Py_UNICODE_ISDIGIT(*p))
7862 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007864 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007865 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007867
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868 e = p + PyUnicode_GET_SIZE(self);
7869 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 if (!Py_UNICODE_ISDIGIT(*p))
7871 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007873 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874}
7875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007876PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007879Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007880False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881
7882static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007883unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884{
7885 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7886 register const Py_UNICODE *e;
7887
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 /* Shortcut for single character strings */
7889 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 Py_UNICODE_ISNUMERIC(*p))
7891 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007893 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007894 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007896
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 e = p + PyUnicode_GET_SIZE(self);
7898 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 if (!Py_UNICODE_ISNUMERIC(*p))
7900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903}
7904
Martin v. Löwis47383402007-08-15 07:32:56 +00007905int
7906PyUnicode_IsIdentifier(PyObject *self)
7907{
7908 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7909 register const Py_UNICODE *e;
7910
7911 /* Special case for empty strings */
7912 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007914
7915 /* PEP 3131 says that the first character must be in
7916 XID_Start and subsequent characters in XID_Continue,
7917 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007919 letters, digits, underscore). However, given the current
7920 definition of XID_Start and XID_Continue, it is sufficient
7921 to check just for these, except that _ must be allowed
7922 as starting an identifier. */
7923 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7924 return 0;
7925
7926 e = p + PyUnicode_GET_SIZE(self);
7927 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 if (!_PyUnicode_IsXidContinue(*p))
7929 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007930 }
7931 return 1;
7932}
7933
7934PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007936\n\
7937Return True if S is a valid identifier according\n\
7938to the language definition.");
7939
7940static PyObject*
7941unicode_isidentifier(PyObject *self)
7942{
7943 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7944}
7945
Georg Brandl559e5d72008-06-11 18:37:52 +00007946PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007948\n\
7949Return True if all characters in S are considered\n\
7950printable in repr() or S is empty, False otherwise.");
7951
7952static PyObject*
7953unicode_isprintable(PyObject *self)
7954{
7955 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7956 register const Py_UNICODE *e;
7957
7958 /* Shortcut for single character strings */
7959 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7960 Py_RETURN_TRUE;
7961 }
7962
7963 e = p + PyUnicode_GET_SIZE(self);
7964 for (; p < e; p++) {
7965 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7966 Py_RETURN_FALSE;
7967 }
7968 }
7969 Py_RETURN_TRUE;
7970}
7971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007972PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00007973 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974\n\
7975Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00007976iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977
7978static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007979unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007981 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982}
7983
Martin v. Löwis18e16552006-02-15 17:27:45 +00007984static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985unicode_length(PyUnicodeObject *self)
7986{
7987 return self->length;
7988}
7989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007990PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00007993Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007994done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995
7996static PyObject *
7997unicode_ljust(PyUnicodeObject *self, PyObject *args)
7998{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007999 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008000 Py_UNICODE fillchar = ' ';
8001
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008002 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 return NULL;
8004
Tim Peters7a29bd52001-09-12 03:03:31 +00008005 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 Py_INCREF(self);
8007 return (PyObject*) self;
8008 }
8009
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008010 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011}
8012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008013PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008016Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017
8018static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008019unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 return fixup(self, fixlower);
8022}
8023
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Format strings, one per selector above and indexed by it.
   NOTE(review): presumably consumed by argument parsing in the strip
   methods -- the use site is not visible here. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
8032
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008033/* externally visible for str.strip(unicode) */
8034PyObject *
8035_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8036{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008037 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8038 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8039 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8040 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8041 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008042
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008044
Benjamin Peterson14339b62009-01-31 16:36:08 +00008045 i = 0;
8046 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8048 i++;
8049 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008050 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008051
Benjamin Peterson14339b62009-01-31 16:36:08 +00008052 j = len;
8053 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 do {
8055 j--;
8056 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8057 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008058 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008059
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 Py_INCREF(self);
8062 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008063 }
8064 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008066}
8067
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068
8069static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008070do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008072 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8073 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008074
Benjamin Peterson14339b62009-01-31 16:36:08 +00008075 i = 0;
8076 if (striptype != RIGHTSTRIP) {
8077 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8078 i++;
8079 }
8080 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008081
Benjamin Peterson14339b62009-01-31 16:36:08 +00008082 j = len;
8083 if (striptype != LEFTSTRIP) {
8084 do {
8085 j--;
8086 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8087 j++;
8088 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008089
Benjamin Peterson14339b62009-01-31 16:36:08 +00008090 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8091 Py_INCREF(self);
8092 return (PyObject*)self;
8093 }
8094 else
8095 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096}
8097
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008098
8099static PyObject *
8100do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8101{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008102 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008103
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8105 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008106
Benjamin Peterson14339b62009-01-31 16:36:08 +00008107 if (sep != NULL && sep != Py_None) {
8108 if (PyUnicode_Check(sep))
8109 return _PyUnicode_XStrip(self, striptype, sep);
8110 else {
8111 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 "%s arg must be None or str",
8113 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008114 return NULL;
8115 }
8116 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008117
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008119}
8120
8121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008122PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008124\n\
8125Return a copy of the string S with leading and trailing\n\
8126whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008127If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008128
8129static PyObject *
8130unicode_strip(PyUnicodeObject *self, PyObject *args)
8131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008132 if (PyTuple_GET_SIZE(args) == 0)
8133 return do_strip(self, BOTHSTRIP); /* Common case */
8134 else
8135 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008136}
8137
8138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008139PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008141\n\
8142Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008143If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008144
8145static PyObject *
8146unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8147{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 if (PyTuple_GET_SIZE(args) == 0)
8149 return do_strip(self, LEFTSTRIP); /* Common case */
8150 else
8151 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008152}
8153
8154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008155PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008157\n\
8158Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008159If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008160
8161static PyObject *
8162unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8163{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 if (PyTuple_GET_SIZE(args) == 0)
8165 return do_strip(self, RIGHTSTRIP); /* Common case */
8166 else
8167 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008168}
8169
8170
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008172unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173{
8174 PyUnicodeObject *u;
8175 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008176 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008177 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178
Georg Brandl222de0f2009-04-12 12:01:50 +00008179 if (len < 1) {
8180 Py_INCREF(unicode_empty);
8181 return (PyObject *)unicode_empty;
8182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183
Tim Peters7a29bd52001-09-12 03:03:31 +00008184 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 /* no repeat, return original string */
8186 Py_INCREF(str);
8187 return (PyObject*) str;
8188 }
Tim Peters8f422462000-09-09 06:13:41 +00008189
8190 /* ensure # of chars needed doesn't overflow int and # of bytes
8191 * needed doesn't overflow size_t
8192 */
8193 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008194 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008195 PyErr_SetString(PyExc_OverflowError,
8196 "repeated string is too long");
8197 return NULL;
8198 }
8199 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8200 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8201 PyErr_SetString(PyExc_OverflowError,
8202 "repeated string is too long");
8203 return NULL;
8204 }
8205 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206 if (!u)
8207 return NULL;
8208
8209 p = u->str;
8210
Georg Brandl222de0f2009-04-12 12:01:50 +00008211 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008212 Py_UNICODE_FILL(p, str->str[0], len);
8213 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008214 Py_ssize_t done = str->length; /* number of characters copied this far */
8215 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008217 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008218 Py_UNICODE_COPY(p+done, p, n);
8219 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 }
8222
8223 return (PyObject*) u;
8224}
8225
8226PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 PyObject *subobj,
8228 PyObject *replobj,
8229 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230{
8231 PyObject *self;
8232 PyObject *str1;
8233 PyObject *str2;
8234 PyObject *result;
8235
8236 self = PyUnicode_FromObject(obj);
8237 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 str1 = PyUnicode_FromObject(subobj);
8240 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 Py_DECREF(self);
8242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 }
8244 str2 = PyUnicode_FromObject(replobj);
8245 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 Py_DECREF(self);
8247 Py_DECREF(str1);
8248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 }
Tim Petersced69f82003-09-16 20:30:58 +00008250 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 (PyUnicodeObject *)str1,
8252 (PyUnicodeObject *)str2,
8253 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 Py_DECREF(self);
8255 Py_DECREF(str1);
8256 Py_DECREF(str2);
8257 return result;
8258}
8259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008260PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008261 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262\n\
8263Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008264old replaced by new. If the optional argument count is\n\
8265given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266
8267static PyObject*
8268unicode_replace(PyUnicodeObject *self, PyObject *args)
8269{
8270 PyUnicodeObject *str1;
8271 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 PyObject *result;
8274
Martin v. Löwis18e16552006-02-15 17:27:45 +00008275 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 return NULL;
8277 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8278 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008281 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 Py_DECREF(str1);
8283 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285
8286 result = replace(self, str1, str2, maxcount);
8287
8288 Py_DECREF(str1);
8289 Py_DECREF(str2);
8290 return result;
8291}
8292
8293static
8294PyObject *unicode_repr(PyObject *unicode)
8295{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008296 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008297 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008298 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8299 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8300
8301 /* XXX(nnorwitz): rather than over-allocating, it would be
8302 better to choose a different scheme. Perhaps scan the
8303 first N-chars of the string and allocate based on that size.
8304 */
8305 /* Initial allocation is based on the longest-possible unichr
8306 escape.
8307
8308 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8309 unichr, so in this case it's the longest unichr escape. In
8310 narrow (UTF-16) builds this is five chars per source unichr
8311 since there are two unichrs in the surrogate pair, so in narrow
8312 (UTF-16) builds it's not the longest unichr escape.
8313
8314 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8315 so in the narrow (UTF-16) build case it's the longest unichr
8316 escape.
8317 */
8318
Walter Dörwald1ab83302007-05-18 17:15:44 +00008319 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008321#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008323#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008325#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008327 if (repr == NULL)
8328 return NULL;
8329
Walter Dörwald1ab83302007-05-18 17:15:44 +00008330 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008331
8332 /* Add quote */
8333 *p++ = (findchar(s, size, '\'') &&
8334 !findchar(s, size, '"')) ? '"' : '\'';
8335 while (size-- > 0) {
8336 Py_UNICODE ch = *s++;
8337
8338 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008339 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008340 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008341 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008342 continue;
8343 }
8344
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008346 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008347 *p++ = '\\';
8348 *p++ = 't';
8349 }
8350 else if (ch == '\n') {
8351 *p++ = '\\';
8352 *p++ = 'n';
8353 }
8354 else if (ch == '\r') {
8355 *p++ = '\\';
8356 *p++ = 'r';
8357 }
8358
8359 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008360 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008361 *p++ = '\\';
8362 *p++ = 'x';
8363 *p++ = hexdigits[(ch >> 4) & 0x000F];
8364 *p++ = hexdigits[ch & 0x000F];
8365 }
8366
Georg Brandl559e5d72008-06-11 18:37:52 +00008367 /* Copy ASCII characters as-is */
8368 else if (ch < 0x7F) {
8369 *p++ = ch;
8370 }
8371
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008373 else {
8374 Py_UCS4 ucs = ch;
8375
8376#ifndef Py_UNICODE_WIDE
8377 Py_UNICODE ch2 = 0;
8378 /* Get code point from surrogate pair */
8379 if (size > 0) {
8380 ch2 = *s;
8381 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008383 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008385 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008386 size--;
8387 }
8388 }
8389#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008391 (categories Z* and C* except ASCII space)
8392 */
8393 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8394 /* Map 8-bit characters to '\xhh' */
8395 if (ucs <= 0xff) {
8396 *p++ = '\\';
8397 *p++ = 'x';
8398 *p++ = hexdigits[(ch >> 4) & 0x000F];
8399 *p++ = hexdigits[ch & 0x000F];
8400 }
8401 /* Map 21-bit characters to '\U00xxxxxx' */
8402 else if (ucs >= 0x10000) {
8403 *p++ = '\\';
8404 *p++ = 'U';
8405 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8406 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8407 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8408 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8409 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8410 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8411 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8412 *p++ = hexdigits[ucs & 0x0000000F];
8413 }
8414 /* Map 16-bit characters to '\uxxxx' */
8415 else {
8416 *p++ = '\\';
8417 *p++ = 'u';
8418 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8419 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8420 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8421 *p++ = hexdigits[ucs & 0x000F];
8422 }
8423 }
8424 /* Copy characters as-is */
8425 else {
8426 *p++ = ch;
8427#ifndef Py_UNICODE_WIDE
8428 if (ucs >= 0x10000)
8429 *p++ = ch2;
8430#endif
8431 }
8432 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008433 }
8434 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008435 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008436
8437 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008438 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008439 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440}
8441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008442PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444\n\
8445Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008446such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447arguments start and end are interpreted as in slice notation.\n\
8448\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008449Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450
8451static PyObject *
8452unicode_rfind(PyUnicodeObject *self, PyObject *args)
8453{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008454 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008455 Py_ssize_t start;
8456 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008457 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458
Christian Heimes9cd17752007-11-18 19:35:23 +00008459 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461
Thomas Wouters477c8d52006-05-27 19:21:47 +00008462 result = stringlib_rfind_slice(
8463 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8464 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8465 start, end
8466 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467
8468 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008469
Christian Heimes217cfd12007-12-02 14:31:20 +00008470 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471}
8472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008473PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008476Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477
8478static PyObject *
8479unicode_rindex(PyUnicodeObject *self, PyObject *args)
8480{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008481 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008482 Py_ssize_t start;
8483 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008484 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485
Christian Heimes9cd17752007-11-18 19:35:23 +00008486 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488
Thomas Wouters477c8d52006-05-27 19:21:47 +00008489 result = stringlib_rfind_slice(
8490 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8491 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8492 start, end
8493 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008494
8495 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008496
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497 if (result < 0) {
8498 PyErr_SetString(PyExc_ValueError, "substring not found");
8499 return NULL;
8500 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008501 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502}
8503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008504PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008507Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008508done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509
8510static PyObject *
8511unicode_rjust(PyUnicodeObject *self, PyObject *args)
8512{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008513 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008514 Py_UNICODE fillchar = ' ';
8515
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008516 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 return NULL;
8518
Tim Peters7a29bd52001-09-12 03:03:31 +00008519 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520 Py_INCREF(self);
8521 return (PyObject*) self;
8522 }
8523
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008524 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525}
8526
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 PyObject *sep,
8529 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530{
8531 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008532
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533 s = PyUnicode_FromObject(s);
8534 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008535 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 if (sep != NULL) {
8537 sep = PyUnicode_FromObject(sep);
8538 if (sep == NULL) {
8539 Py_DECREF(s);
8540 return NULL;
8541 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 }
8543
8544 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8545
8546 Py_DECREF(s);
8547 Py_XDECREF(sep);
8548 return result;
8549}
8550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008551PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553\n\
8554Return a list of the words in S, using sep as the\n\
8555delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008556splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008557whitespace string is a separator and empty strings are\n\
8558removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559
8560static PyObject*
8561unicode_split(PyUnicodeObject *self, PyObject *args)
8562{
8563 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008564 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565
Martin v. Löwis18e16552006-02-15 17:27:45 +00008566 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567 return NULL;
8568
8569 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575}
8576
Thomas Wouters477c8d52006-05-27 19:21:47 +00008577PyObject *
8578PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8579{
8580 PyObject* str_obj;
8581 PyObject* sep_obj;
8582 PyObject* out;
8583
8584 str_obj = PyUnicode_FromObject(str_in);
8585 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008587 sep_obj = PyUnicode_FromObject(sep_in);
8588 if (!sep_obj) {
8589 Py_DECREF(str_obj);
8590 return NULL;
8591 }
8592
8593 out = stringlib_partition(
8594 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8595 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8596 );
8597
8598 Py_DECREF(sep_obj);
8599 Py_DECREF(str_obj);
8600
8601 return out;
8602}
8603
8604
8605PyObject *
8606PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8607{
8608 PyObject* str_obj;
8609 PyObject* sep_obj;
8610 PyObject* out;
8611
8612 str_obj = PyUnicode_FromObject(str_in);
8613 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008615 sep_obj = PyUnicode_FromObject(sep_in);
8616 if (!sep_obj) {
8617 Py_DECREF(str_obj);
8618 return NULL;
8619 }
8620
8621 out = stringlib_rpartition(
8622 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8623 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8624 );
8625
8626 Py_DECREF(sep_obj);
8627 Py_DECREF(str_obj);
8628
8629 return out;
8630}
8631
8632PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008634\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008635Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008636the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008637found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008638
8639static PyObject*
8640unicode_partition(PyUnicodeObject *self, PyObject *separator)
8641{
8642 return PyUnicode_Partition((PyObject *)self, separator);
8643}
8644
8645PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008646 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008647\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008648Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008649the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008650separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008651
8652static PyObject*
8653unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8654{
8655 return PyUnicode_RPartition((PyObject *)self, separator);
8656}
8657
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008658PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 PyObject *sep,
8660 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008661{
8662 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008663
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008664 s = PyUnicode_FromObject(s);
8665 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008666 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 if (sep != NULL) {
8668 sep = PyUnicode_FromObject(sep);
8669 if (sep == NULL) {
8670 Py_DECREF(s);
8671 return NULL;
8672 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008673 }
8674
8675 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8676
8677 Py_DECREF(s);
8678 Py_XDECREF(sep);
8679 return result;
8680}
8681
8682PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008684\n\
8685Return a list of the words in S, using sep as the\n\
8686delimiter string, starting at the end of the string and\n\
8687working to the front. If maxsplit is given, at most maxsplit\n\
8688splits are done. If sep is not specified, any whitespace string\n\
8689is a separator.");
8690
8691static PyObject*
8692unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8693{
8694 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008696
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008698 return NULL;
8699
8700 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008702 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008704 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008706}
8707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008708PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710\n\
8711Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008712Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008713is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714
8715static PyObject*
8716unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8717{
Guido van Rossum86662912000-04-11 15:38:46 +00008718 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719
Guido van Rossum86662912000-04-11 15:38:46 +00008720 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 return NULL;
8722
Guido van Rossum86662912000-04-11 15:38:46 +00008723 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724}
8725
8726static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008727PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728{
Walter Dörwald346737f2007-05-31 10:44:43 +00008729 if (PyUnicode_CheckExact(self)) {
8730 Py_INCREF(self);
8731 return self;
8732 } else
8733 /* Subtype -- return genuine unicode string with the same value. */
8734 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8735 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736}
8737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008738PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740\n\
8741Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008742and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743
8744static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008745unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 return fixup(self, fixswapcase);
8748}
8749
Georg Brandlceee0772007-11-27 23:48:05 +00008750PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008752\n\
8753Return a translation table usable for str.translate().\n\
8754If there is only one argument, it must be a dictionary mapping Unicode\n\
8755ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008756Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008757If there are two arguments, they must be strings of equal length, and\n\
8758in the resulting dictionary, each character in x will be mapped to the\n\
8759character at the same position in y. If there is a third argument, it\n\
8760must be a string, whose characters will be mapped to None in the result.");
8761
8762static PyObject*
8763unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8764{
8765 PyObject *x, *y = NULL, *z = NULL;
8766 PyObject *new = NULL, *key, *value;
8767 Py_ssize_t i = 0;
8768 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008769
Georg Brandlceee0772007-11-27 23:48:05 +00008770 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8771 return NULL;
8772 new = PyDict_New();
8773 if (!new)
8774 return NULL;
8775 if (y != NULL) {
8776 /* x must be a string too, of equal length */
8777 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8778 if (!PyUnicode_Check(x)) {
8779 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8780 "be a string if there is a second argument");
8781 goto err;
8782 }
8783 if (PyUnicode_GET_SIZE(x) != ylen) {
8784 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8785 "arguments must have equal length");
8786 goto err;
8787 }
8788 /* create entries for translating chars in x to those in y */
8789 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008790 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8791 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008792 if (!key || !value)
8793 goto err;
8794 res = PyDict_SetItem(new, key, value);
8795 Py_DECREF(key);
8796 Py_DECREF(value);
8797 if (res < 0)
8798 goto err;
8799 }
8800 /* create entries for deleting chars in z */
8801 if (z != NULL) {
8802 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008803 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008804 if (!key)
8805 goto err;
8806 res = PyDict_SetItem(new, key, Py_None);
8807 Py_DECREF(key);
8808 if (res < 0)
8809 goto err;
8810 }
8811 }
8812 } else {
8813 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008814 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008815 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8816 "to maketrans it must be a dict");
8817 goto err;
8818 }
8819 /* copy entries into the new dict, converting string keys to int keys */
8820 while (PyDict_Next(x, &i, &key, &value)) {
8821 if (PyUnicode_Check(key)) {
8822 /* convert string keys to integer keys */
8823 PyObject *newkey;
8824 if (PyUnicode_GET_SIZE(key) != 1) {
8825 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8826 "table must be of length 1");
8827 goto err;
8828 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008829 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008830 if (!newkey)
8831 goto err;
8832 res = PyDict_SetItem(new, newkey, value);
8833 Py_DECREF(newkey);
8834 if (res < 0)
8835 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008836 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008837 /* just keep integer keys */
8838 if (PyDict_SetItem(new, key, value) < 0)
8839 goto err;
8840 } else {
8841 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8842 "be strings or integers");
8843 goto err;
8844 }
8845 }
8846 }
8847 return new;
8848 err:
8849 Py_DECREF(new);
8850 return NULL;
8851}
8852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008853PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855\n\
8856Return a copy of the string S, where all characters have been mapped\n\
8857through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008858Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008859Unmapped characters are left untouched. Characters mapped to None\n\
8860are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861
8862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008863unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864{
Georg Brandlceee0772007-11-27 23:48:05 +00008865 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866}
8867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008868PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008871Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872
8873static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008874unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876 return fixup(self, fixupper);
8877}
8878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008879PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008882Pad a numeric string S with zeros on the left, to fill a field\n\
8883of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884
8885static PyObject *
8886unicode_zfill(PyUnicodeObject *self, PyObject *args)
8887{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008888 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 PyUnicodeObject *u;
8890
Martin v. Löwis18e16552006-02-15 17:27:45 +00008891 Py_ssize_t width;
8892 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 return NULL;
8894
8895 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008896 if (PyUnicode_CheckExact(self)) {
8897 Py_INCREF(self);
8898 return (PyObject*) self;
8899 }
8900 else
8901 return PyUnicode_FromUnicode(
8902 PyUnicode_AS_UNICODE(self),
8903 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 }
8906
8907 fill = width - self->length;
8908
8909 u = pad(self, fill, 0, '0');
8910
Walter Dörwald068325e2002-04-15 13:36:47 +00008911 if (u == NULL)
8912 return NULL;
8913
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 if (u->str[fill] == '+' || u->str[fill] == '-') {
8915 /* move sign to beginning of string */
8916 u->str[0] = u->str[fill];
8917 u->str[fill] = '0';
8918 }
8919
8920 return (PyObject*) u;
8921}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922
#if 0
/* Compiled out: reports the current value of `numfree` as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long free_count = numfree;

    return PyLong_FromLong(free_count);
}
#endif
8930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008931PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008934Return True if S starts with the specified prefix, False otherwise.\n\
8935With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008936With optional end, stop comparing S at that position.\n\
8937prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938
8939static PyObject *
8940unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008943 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008945 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008946 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008947 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008949 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8951 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008952 if (PyTuple_Check(subobj)) {
8953 Py_ssize_t i;
8954 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8955 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008957 if (substring == NULL)
8958 return NULL;
8959 result = tailmatch(self, substring, start, end, -1);
8960 Py_DECREF(substring);
8961 if (result) {
8962 Py_RETURN_TRUE;
8963 }
8964 }
8965 /* nothing matched */
8966 Py_RETURN_FALSE;
8967 }
8968 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008971 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008973 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974}
8975
8976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008977PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008980Return True if S ends with the specified suffix, False otherwise.\n\
8981With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008982With optional end, stop comparing S at that position.\n\
8983suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984
8985static PyObject *
8986unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008989 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008991 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008992 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008993 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008995 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8997 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008998 if (PyTuple_Check(subobj)) {
8999 Py_ssize_t i;
9000 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9001 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009003 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009005 result = tailmatch(self, substring, start, end, +1);
9006 Py_DECREF(substring);
9007 if (result) {
9008 Py_RETURN_TRUE;
9009 }
9010 }
9011 Py_RETURN_FALSE;
9012 }
9013 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009017 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009019 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020}
9021
Eric Smith8c663262007-08-25 02:26:07 +00009022#include "stringlib/string_format.h"
9023
9024PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009026\n\
9027");
9028
Eric Smith4a7d76d2008-05-30 18:10:19 +00009029static PyObject *
9030unicode__format__(PyObject* self, PyObject* args)
9031{
9032 PyObject *format_spec;
9033
9034 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9035 return NULL;
9036
9037 return _PyUnicode_FormatAdvanced(self,
9038 PyUnicode_AS_UNICODE(format_spec),
9039 PyUnicode_GET_SIZE(format_spec));
9040}
9041
Eric Smith8c663262007-08-25 02:26:07 +00009042PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009044\n\
9045");
9046
9047static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009048unicode__sizeof__(PyUnicodeObject *v)
9049{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009050 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9051 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009052}
9053
9054PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009055 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009056
9057static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009058unicode_getnewargs(PyUnicodeObject *v)
9059{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009060 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009061}
9062
9063
/* Method table for unicode objects.  Each entry gives the Python-visible
   method name, the C implementation (cast to PyCFunction), the calling
   convention flags, and the docstring.  "maketrans" is the only entry
   flagged METH_STATIC.  Two groups of entries are compiled out with
   "#if 0".  The table ends with a {NULL, NULL} sentinel.
   NOTE(review): presumably installed as the type's tp_methods; the type
   object is not visible from here -- confirm. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064static PyMethodDef unicode_methods[] = {
9065
9066 /* Order is according to common usage: often used methods should
9067 appear first, since lookup is done sequentially. */
9068
Benjamin Peterson308d6372009-09-18 21:42:35 +00009069 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009070 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9071 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009072 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009073 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9074 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9075 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9076 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9077 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9078 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9079 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009080 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009081 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9082 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9083 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009084 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009085 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9086 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9087 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009088 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009089 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009090 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009091 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009092 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9093 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9094 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9095 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9096 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9097 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9098 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9099 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9100 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9101 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9102 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9103 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9104 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9105 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009106 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009107 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009108 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009109 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009110 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009111 {"maketrans", (PyCFunction) unicode_maketrans,
9112 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009113 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
/* The "capwords" entry below is compiled out. */
Walter Dörwald068325e2002-04-15 13:36:47 +00009114#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009115 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116#endif
9117
9118#if 0
9119 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009120 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121#endif
9122
/* No docstring is supplied for __getnewargs__; the trailing field is left
   to implicit zero-initialization. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009123 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
/* Sentinel entry marking the end of the table. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124 {NULL, NULL}
9125};
9126
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009127static PyObject *
9128unicode_mod(PyObject *v, PyObject *w)
9129{
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 if (!PyUnicode_Check(v)) {
9131 Py_INCREF(Py_NotImplemented);
9132 return Py_NotImplemented;
9133 }
9134 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009135}
9136
/* Number-protocol table for unicode objects.  Only the remainder slot is
   filled (with unicode_mod); add, subtract and multiply are explicitly 0,
   and the initializer stops after nb_remainder, so every later slot is
   left to implicit zero-initialization. */
9137static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009138 0, /*nb_add*/
9139 0, /*nb_subtract*/
9140 0, /*nb_multiply*/
9141 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009142};
9143
/* Sequence-protocol table for unicode objects: length, concatenation,
   repetition, item access and containment are provided; the slice and
   item/slice assignment slots are 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009145 (lenfunc) unicode_length, /* sq_length */
9146 PyUnicode_Concat, /* sq_concat */
9147 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9148 (ssizeargfunc) unicode_getitem, /* sq_item */
9149 0, /* sq_slice */
9150 0, /* sq_ass_item */
9151 0, /* sq_ass_slice */
9152 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153};
9154
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009155static PyObject*
9156unicode_subscript(PyUnicodeObject* self, PyObject* item)
9157{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009158 if (PyIndex_Check(item)) {
9159 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009160 if (i == -1 && PyErr_Occurred())
9161 return NULL;
9162 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009163 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009164 return unicode_getitem(self, i);
9165 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009166 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009167 Py_UNICODE* source_buf;
9168 Py_UNICODE* result_buf;
9169 PyObject* result;
9170
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009171 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009173 return NULL;
9174 }
9175
9176 if (slicelength <= 0) {
9177 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009178 } else if (start == 0 && step == 1 && slicelength == self->length &&
9179 PyUnicode_CheckExact(self)) {
9180 Py_INCREF(self);
9181 return (PyObject *)self;
9182 } else if (step == 1) {
9183 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009184 } else {
9185 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009186 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9187 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009188
Benjamin Peterson29060642009-01-31 22:14:21 +00009189 if (result_buf == NULL)
9190 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009191
9192 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9193 result_buf[i] = source_buf[cur];
9194 }
Tim Petersced69f82003-09-16 20:30:58 +00009195
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009196 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009197 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009198 return result;
9199 }
9200 } else {
9201 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9202 return NULL;
9203 }
9204}
9205
/* Mapping-protocol table for unicode objects: length and subscripting
   (unicode_subscript) are provided; the subscript-assignment slot is 0. */
9206static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009207 (lenfunc)unicode_length, /* mp_length */
9208 (binaryfunc)unicode_subscript, /* mp_subscript */
9209 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009210};
9211
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213/* Helpers for PyUnicode_Format() */
9214
9215static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009216getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009218 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 (*p_argidx)++;
9221 if (arglen < 0)
9222 return args;
9223 else
9224 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 }
9226 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009227 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 return NULL;
9229}
9230
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009231/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009233static PyObject *
9234formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009236 char *p;
9237 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009239
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240 x = PyFloat_AsDouble(v);
9241 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009242 return NULL;
9243
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009246
Eric Smith0923d1d2009-04-16 20:16:10 +00009247 p = PyOS_double_to_string(x, type, prec,
9248 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009249 if (p == NULL)
9250 return NULL;
9251 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009252 PyMem_Free(p);
9253 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254}
9255
Tim Peters38fd5b62000-09-21 05:43:11 +00009256static PyObject*
9257formatlong(PyObject *val, int flags, int prec, int type)
9258{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009259 char *buf;
9260 int len;
9261 PyObject *str; /* temporary string object. */
9262 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009263
Benjamin Peterson14339b62009-01-31 16:36:08 +00009264 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9265 if (!str)
9266 return NULL;
9267 result = PyUnicode_FromStringAndSize(buf, len);
9268 Py_DECREF(str);
9269 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009270}
9271
Guido van Rossumd57fd912000-03-10 22:53:23 +00009272static int
9273formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009274 size_t buflen,
9275 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009277 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009278 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009279 if (PyUnicode_GET_SIZE(v) == 1) {
9280 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9281 buf[1] = '\0';
9282 return 1;
9283 }
9284#ifndef Py_UNICODE_WIDE
9285 if (PyUnicode_GET_SIZE(v) == 2) {
9286 /* Decode a valid surrogate pair */
9287 int c0 = PyUnicode_AS_UNICODE(v)[0];
9288 int c1 = PyUnicode_AS_UNICODE(v)[1];
9289 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9290 0xDC00 <= c1 && c1 <= 0xDFFF) {
9291 buf[0] = c0;
9292 buf[1] = c1;
9293 buf[2] = '\0';
9294 return 2;
9295 }
9296 }
9297#endif
9298 goto onError;
9299 }
9300 else {
9301 /* Integer input truncated to a character */
9302 long x;
9303 x = PyLong_AsLong(v);
9304 if (x == -1 && PyErr_Occurred())
9305 goto onError;
9306
9307 if (x < 0 || x > 0x10ffff) {
9308 PyErr_SetString(PyExc_OverflowError,
9309 "%c arg not in range(0x110000)");
9310 return -1;
9311 }
9312
9313#ifndef Py_UNICODE_WIDE
9314 if (x > 0xffff) {
9315 x -= 0x10000;
9316 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9317 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9318 return 2;
9319 }
9320#endif
9321 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009322 buf[1] = '\0';
9323 return 1;
9324 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009325
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009327 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009328 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009329 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330}
9331
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009332/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009333 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009334*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009335#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009336
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339{
9340 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009341 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342 int args_owned = 0;
9343 PyUnicodeObject *result = NULL;
9344 PyObject *dict = NULL;
9345 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009346
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 PyErr_BadInternalCall();
9349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350 }
9351 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009352 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 fmt = PyUnicode_AS_UNICODE(uformat);
9355 fmtcnt = PyUnicode_GET_SIZE(uformat);
9356
9357 reslen = rescnt = fmtcnt + 100;
9358 result = _PyUnicode_New(reslen);
9359 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009360 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 res = PyUnicode_AS_UNICODE(result);
9362
9363 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 arglen = PyTuple_Size(args);
9365 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366 }
9367 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 arglen = -1;
9369 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009371 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009372 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374
9375 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 if (*fmt != '%') {
9377 if (--rescnt < 0) {
9378 rescnt = fmtcnt + 100;
9379 reslen += rescnt;
9380 if (_PyUnicode_Resize(&result, reslen) < 0)
9381 goto onError;
9382 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9383 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009384 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009386 }
9387 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 /* Got a format specifier */
9389 int flags = 0;
9390 Py_ssize_t width = -1;
9391 int prec = -1;
9392 Py_UNICODE c = '\0';
9393 Py_UNICODE fill;
9394 int isnumok;
9395 PyObject *v = NULL;
9396 PyObject *temp = NULL;
9397 Py_UNICODE *pbuf;
9398 Py_UNICODE sign;
9399 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009400 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401
Benjamin Peterson29060642009-01-31 22:14:21 +00009402 fmt++;
9403 if (*fmt == '(') {
9404 Py_UNICODE *keystart;
9405 Py_ssize_t keylen;
9406 PyObject *key;
9407 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009408
Benjamin Peterson29060642009-01-31 22:14:21 +00009409 if (dict == NULL) {
9410 PyErr_SetString(PyExc_TypeError,
9411 "format requires a mapping");
9412 goto onError;
9413 }
9414 ++fmt;
9415 --fmtcnt;
9416 keystart = fmt;
9417 /* Skip over balanced parentheses */
9418 while (pcount > 0 && --fmtcnt >= 0) {
9419 if (*fmt == ')')
9420 --pcount;
9421 else if (*fmt == '(')
9422 ++pcount;
9423 fmt++;
9424 }
9425 keylen = fmt - keystart - 1;
9426 if (fmtcnt < 0 || pcount > 0) {
9427 PyErr_SetString(PyExc_ValueError,
9428 "incomplete format key");
9429 goto onError;
9430 }
9431#if 0
9432 /* keys are converted to strings using UTF-8 and
9433 then looked up since Python uses strings to hold
9434 variables names etc. in its namespaces and we
9435 wouldn't want to break common idioms. */
9436 key = PyUnicode_EncodeUTF8(keystart,
9437 keylen,
9438 NULL);
9439#else
9440 key = PyUnicode_FromUnicode(keystart, keylen);
9441#endif
9442 if (key == NULL)
9443 goto onError;
9444 if (args_owned) {
9445 Py_DECREF(args);
9446 args_owned = 0;
9447 }
9448 args = PyObject_GetItem(dict, key);
9449 Py_DECREF(key);
9450 if (args == NULL) {
9451 goto onError;
9452 }
9453 args_owned = 1;
9454 arglen = -1;
9455 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009456 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 while (--fmtcnt >= 0) {
9458 switch (c = *fmt++) {
9459 case '-': flags |= F_LJUST; continue;
9460 case '+': flags |= F_SIGN; continue;
9461 case ' ': flags |= F_BLANK; continue;
9462 case '#': flags |= F_ALT; continue;
9463 case '0': flags |= F_ZERO; continue;
9464 }
9465 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009466 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 if (c == '*') {
9468 v = getnextarg(args, arglen, &argidx);
9469 if (v == NULL)
9470 goto onError;
9471 if (!PyLong_Check(v)) {
9472 PyErr_SetString(PyExc_TypeError,
9473 "* wants int");
9474 goto onError;
9475 }
9476 width = PyLong_AsLong(v);
9477 if (width == -1 && PyErr_Occurred())
9478 goto onError;
9479 if (width < 0) {
9480 flags |= F_LJUST;
9481 width = -width;
9482 }
9483 if (--fmtcnt >= 0)
9484 c = *fmt++;
9485 }
9486 else if (c >= '0' && c <= '9') {
9487 width = c - '0';
9488 while (--fmtcnt >= 0) {
9489 c = *fmt++;
9490 if (c < '0' || c > '9')
9491 break;
9492 if ((width*10) / 10 != width) {
9493 PyErr_SetString(PyExc_ValueError,
9494 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009495 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 }
9497 width = width*10 + (c - '0');
9498 }
9499 }
9500 if (c == '.') {
9501 prec = 0;
9502 if (--fmtcnt >= 0)
9503 c = *fmt++;
9504 if (c == '*') {
9505 v = getnextarg(args, arglen, &argidx);
9506 if (v == NULL)
9507 goto onError;
9508 if (!PyLong_Check(v)) {
9509 PyErr_SetString(PyExc_TypeError,
9510 "* wants int");
9511 goto onError;
9512 }
9513 prec = PyLong_AsLong(v);
9514 if (prec == -1 && PyErr_Occurred())
9515 goto onError;
9516 if (prec < 0)
9517 prec = 0;
9518 if (--fmtcnt >= 0)
9519 c = *fmt++;
9520 }
9521 else if (c >= '0' && c <= '9') {
9522 prec = c - '0';
9523 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009524 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 if (c < '0' || c > '9')
9526 break;
9527 if ((prec*10) / 10 != prec) {
9528 PyErr_SetString(PyExc_ValueError,
9529 "prec too big");
9530 goto onError;
9531 }
9532 prec = prec*10 + (c - '0');
9533 }
9534 }
9535 } /* prec */
9536 if (fmtcnt >= 0) {
9537 if (c == 'h' || c == 'l' || c == 'L') {
9538 if (--fmtcnt >= 0)
9539 c = *fmt++;
9540 }
9541 }
9542 if (fmtcnt < 0) {
9543 PyErr_SetString(PyExc_ValueError,
9544 "incomplete format");
9545 goto onError;
9546 }
9547 if (c != '%') {
9548 v = getnextarg(args, arglen, &argidx);
9549 if (v == NULL)
9550 goto onError;
9551 }
9552 sign = 0;
9553 fill = ' ';
9554 switch (c) {
9555
9556 case '%':
9557 pbuf = formatbuf;
9558 /* presume that buffer length is at least 1 */
9559 pbuf[0] = '%';
9560 len = 1;
9561 break;
9562
9563 case 's':
9564 case 'r':
9565 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009566 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 temp = v;
9568 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009569 }
9570 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 if (c == 's')
9572 temp = PyObject_Str(v);
9573 else if (c == 'r')
9574 temp = PyObject_Repr(v);
9575 else
9576 temp = PyObject_ASCII(v);
9577 if (temp == NULL)
9578 goto onError;
9579 if (PyUnicode_Check(temp))
9580 /* nothing to do */;
9581 else {
9582 Py_DECREF(temp);
9583 PyErr_SetString(PyExc_TypeError,
9584 "%s argument has non-string str()");
9585 goto onError;
9586 }
9587 }
9588 pbuf = PyUnicode_AS_UNICODE(temp);
9589 len = PyUnicode_GET_SIZE(temp);
9590 if (prec >= 0 && len > prec)
9591 len = prec;
9592 break;
9593
9594 case 'i':
9595 case 'd':
9596 case 'u':
9597 case 'o':
9598 case 'x':
9599 case 'X':
9600 if (c == 'i')
9601 c = 'd';
9602 isnumok = 0;
9603 if (PyNumber_Check(v)) {
9604 PyObject *iobj=NULL;
9605
9606 if (PyLong_Check(v)) {
9607 iobj = v;
9608 Py_INCREF(iobj);
9609 }
9610 else {
9611 iobj = PyNumber_Long(v);
9612 }
9613 if (iobj!=NULL) {
9614 if (PyLong_Check(iobj)) {
9615 isnumok = 1;
9616 temp = formatlong(iobj, flags, prec, c);
9617 Py_DECREF(iobj);
9618 if (!temp)
9619 goto onError;
9620 pbuf = PyUnicode_AS_UNICODE(temp);
9621 len = PyUnicode_GET_SIZE(temp);
9622 sign = 1;
9623 }
9624 else {
9625 Py_DECREF(iobj);
9626 }
9627 }
9628 }
9629 if (!isnumok) {
9630 PyErr_Format(PyExc_TypeError,
9631 "%%%c format: a number is required, "
9632 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9633 goto onError;
9634 }
9635 if (flags & F_ZERO)
9636 fill = '0';
9637 break;
9638
9639 case 'e':
9640 case 'E':
9641 case 'f':
9642 case 'F':
9643 case 'g':
9644 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009645 temp = formatfloat(v, flags, prec, c);
9646 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009648 pbuf = PyUnicode_AS_UNICODE(temp);
9649 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 sign = 1;
9651 if (flags & F_ZERO)
9652 fill = '0';
9653 break;
9654
9655 case 'c':
9656 pbuf = formatbuf;
9657 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9658 if (len < 0)
9659 goto onError;
9660 break;
9661
9662 default:
9663 PyErr_Format(PyExc_ValueError,
9664 "unsupported format character '%c' (0x%x) "
9665 "at index %zd",
9666 (31<=c && c<=126) ? (char)c : '?',
9667 (int)c,
9668 (Py_ssize_t)(fmt - 1 -
9669 PyUnicode_AS_UNICODE(uformat)));
9670 goto onError;
9671 }
9672 if (sign) {
9673 if (*pbuf == '-' || *pbuf == '+') {
9674 sign = *pbuf++;
9675 len--;
9676 }
9677 else if (flags & F_SIGN)
9678 sign = '+';
9679 else if (flags & F_BLANK)
9680 sign = ' ';
9681 else
9682 sign = 0;
9683 }
9684 if (width < len)
9685 width = len;
9686 if (rescnt - (sign != 0) < width) {
9687 reslen -= rescnt;
9688 rescnt = width + fmtcnt + 100;
9689 reslen += rescnt;
9690 if (reslen < 0) {
9691 Py_XDECREF(temp);
9692 PyErr_NoMemory();
9693 goto onError;
9694 }
9695 if (_PyUnicode_Resize(&result, reslen) < 0) {
9696 Py_XDECREF(temp);
9697 goto onError;
9698 }
9699 res = PyUnicode_AS_UNICODE(result)
9700 + reslen - rescnt;
9701 }
9702 if (sign) {
9703 if (fill != ' ')
9704 *res++ = sign;
9705 rescnt--;
9706 if (width > len)
9707 width--;
9708 }
9709 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9710 assert(pbuf[0] == '0');
9711 assert(pbuf[1] == c);
9712 if (fill != ' ') {
9713 *res++ = *pbuf++;
9714 *res++ = *pbuf++;
9715 }
9716 rescnt -= 2;
9717 width -= 2;
9718 if (width < 0)
9719 width = 0;
9720 len -= 2;
9721 }
9722 if (width > len && !(flags & F_LJUST)) {
9723 do {
9724 --rescnt;
9725 *res++ = fill;
9726 } while (--width > len);
9727 }
9728 if (fill == ' ') {
9729 if (sign)
9730 *res++ = sign;
9731 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9732 assert(pbuf[0] == '0');
9733 assert(pbuf[1] == c);
9734 *res++ = *pbuf++;
9735 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009736 }
9737 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 Py_UNICODE_COPY(res, pbuf, len);
9739 res += len;
9740 rescnt -= len;
9741 while (--width >= len) {
9742 --rescnt;
9743 *res++ = ' ';
9744 }
9745 if (dict && (argidx < arglen) && c != '%') {
9746 PyErr_SetString(PyExc_TypeError,
9747 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009748 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009749 goto onError;
9750 }
9751 Py_XDECREF(temp);
9752 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753 } /* until end */
9754 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009755 PyErr_SetString(PyExc_TypeError,
9756 "not all arguments converted during string formatting");
9757 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758 }
9759
Thomas Woutersa96affe2006-03-12 00:29:36 +00009760 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764 }
9765 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766 return (PyObject *)result;
9767
Benjamin Peterson29060642009-01-31 22:14:21 +00009768 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769 Py_XDECREF(result);
9770 Py_DECREF(uformat);
9771 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009772 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773 }
9774 return NULL;
9775}
9776
Jeremy Hylton938ace62002-07-17 16:30:39 +00009777static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009778unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9779
Tim Peters6d6c1a32001-08-02 04:15:00 +00009780static PyObject *
9781unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9782{
Benjamin Peterson29060642009-01-31 22:14:21 +00009783 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009784 static char *kwlist[] = {"object", "encoding", "errors", 0};
9785 char *encoding = NULL;
9786 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009787
Benjamin Peterson14339b62009-01-31 16:36:08 +00009788 if (type != &PyUnicode_Type)
9789 return unicode_subtype_new(type, args, kwds);
9790 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009791 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009792 return NULL;
9793 if (x == NULL)
9794 return (PyObject *)_PyUnicode_New(0);
9795 if (encoding == NULL && errors == NULL)
9796 return PyObject_Str(x);
9797 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009798 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009799}
9800
Guido van Rossume023fe02001-08-30 03:12:59 +00009801static PyObject *
9802unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9803{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009804 PyUnicodeObject *tmp, *pnew;
9805 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009806
Benjamin Peterson14339b62009-01-31 16:36:08 +00009807 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9808 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9809 if (tmp == NULL)
9810 return NULL;
9811 assert(PyUnicode_Check(tmp));
9812 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9813 if (pnew == NULL) {
9814 Py_DECREF(tmp);
9815 return NULL;
9816 }
9817 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9818 if (pnew->str == NULL) {
9819 _Py_ForgetReference((PyObject *)pnew);
9820 PyObject_Del(pnew);
9821 Py_DECREF(tmp);
9822 return PyErr_NoMemory();
9823 }
9824 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9825 pnew->length = n;
9826 pnew->hash = tmp->hash;
9827 Py_DECREF(tmp);
9828 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009829}
9830
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009837
/* Defined with the iterator type further down; needed for tp_iter. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in str type.  Slots set to 0 are left for
   PyType_Ready() (called from _PyUnicode_Init()) to fill in or inherit
   from tp_base. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                      /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_reserved */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash */
    0,                          /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    unicode_iter,               /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
9883
9884/* Initialize the Unicode implementation */
9885
Thomas Wouters78890102000-07-22 19:25:51 +00009886void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009888 int i;
9889
Thomas Wouters477c8d52006-05-27 19:21:47 +00009890 /* XXX - move this array to unicodectype.c ? */
9891 Py_UNICODE linebreak[] = {
9892 0x000A, /* LINE FEED */
9893 0x000D, /* CARRIAGE RETURN */
9894 0x001C, /* FILE SEPARATOR */
9895 0x001D, /* GROUP SEPARATOR */
9896 0x001E, /* RECORD SEPARATOR */
9897 0x0085, /* NEXT LINE */
9898 0x2028, /* LINE SEPARATOR */
9899 0x2029, /* PARAGRAPH SEPARATOR */
9900 };
9901
Fred Drakee4315f52000-05-09 19:53:39 +00009902 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009903 free_list = NULL;
9904 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009906 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009907 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009908
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009909 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009910 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009911 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009912 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009913
9914 /* initialize the linebreak bloom filter */
9915 bloom_linebreak = make_bloom_mask(
9916 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9917 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009918
9919 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920}
9921
9922/* Finalize the Unicode implementation */
9923
Christian Heimesa156e092008-02-16 07:38:31 +00009924int
9925PyUnicode_ClearFreeList(void)
9926{
9927 int freelist_size = numfree;
9928 PyUnicodeObject *u;
9929
9930 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 PyUnicodeObject *v = u;
9932 u = *(PyUnicodeObject **)u;
9933 if (v->str)
9934 PyObject_DEL(v->str);
9935 Py_XDECREF(v->defenc);
9936 PyObject_Del(v);
9937 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009938 }
9939 free_list = NULL;
9940 assert(numfree == 0);
9941 return freelist_size;
9942}
9943
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944void
Thomas Wouters78890102000-07-22 19:25:51 +00009945_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009947 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009949 Py_XDECREF(unicode_empty);
9950 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009951
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009952 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009953 if (unicode_latin1[i]) {
9954 Py_DECREF(unicode_latin1[i]);
9955 unicode_latin1[i] = NULL;
9956 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009957 }
Christian Heimesa156e092008-02-16 07:38:31 +00009958 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009960
Walter Dörwald16807132007-05-25 13:52:07 +00009961void
9962PyUnicode_InternInPlace(PyObject **p)
9963{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009964 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9965 PyObject *t;
9966 if (s == NULL || !PyUnicode_Check(s))
9967 Py_FatalError(
9968 "PyUnicode_InternInPlace: unicode strings only please!");
9969 /* If it's a subclass, we don't really know what putting
9970 it in the interned dict might do. */
9971 if (!PyUnicode_CheckExact(s))
9972 return;
9973 if (PyUnicode_CHECK_INTERNED(s))
9974 return;
9975 if (interned == NULL) {
9976 interned = PyDict_New();
9977 if (interned == NULL) {
9978 PyErr_Clear(); /* Don't leave an exception */
9979 return;
9980 }
9981 }
9982 /* It might be that the GetItem call fails even
9983 though the key is present in the dictionary,
9984 namely when this happens during a stack overflow. */
9985 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009987 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +00009988
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 if (t) {
9990 Py_INCREF(t);
9991 Py_DECREF(*p);
9992 *p = t;
9993 return;
9994 }
Walter Dörwald16807132007-05-25 13:52:07 +00009995
Benjamin Peterson14339b62009-01-31 16:36:08 +00009996 PyThreadState_GET()->recursion_critical = 1;
9997 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9998 PyErr_Clear();
9999 PyThreadState_GET()->recursion_critical = 0;
10000 return;
10001 }
10002 PyThreadState_GET()->recursion_critical = 0;
10003 /* The two references in interned are not counted by refcnt.
10004 The deallocator will take care of this */
10005 Py_REFCNT(s) -= 2;
10006 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010007}
10008
10009void
10010PyUnicode_InternImmortal(PyObject **p)
10011{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010012 PyUnicode_InternInPlace(p);
10013 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10014 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10015 Py_INCREF(*p);
10016 }
Walter Dörwald16807132007-05-25 13:52:07 +000010017}
10018
10019PyObject *
10020PyUnicode_InternFromString(const char *cp)
10021{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010022 PyObject *s = PyUnicode_FromString(cp);
10023 if (s == NULL)
10024 return NULL;
10025 PyUnicode_InternInPlace(&s);
10026 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010027}
10028
10029void _Py_ReleaseInternedUnicodeStrings(void)
10030{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010031 PyObject *keys;
10032 PyUnicodeObject *s;
10033 Py_ssize_t i, n;
10034 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010035
Benjamin Peterson14339b62009-01-31 16:36:08 +000010036 if (interned == NULL || !PyDict_Check(interned))
10037 return;
10038 keys = PyDict_Keys(interned);
10039 if (keys == NULL || !PyList_Check(keys)) {
10040 PyErr_Clear();
10041 return;
10042 }
Walter Dörwald16807132007-05-25 13:52:07 +000010043
Benjamin Peterson14339b62009-01-31 16:36:08 +000010044 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10045 detector, interned unicode strings are not forcibly deallocated;
10046 rather, we give them their stolen references back, and then clear
10047 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010048
Benjamin Peterson14339b62009-01-31 16:36:08 +000010049 n = PyList_GET_SIZE(keys);
10050 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010051 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010052 for (i = 0; i < n; i++) {
10053 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10054 switch (s->state) {
10055 case SSTATE_NOT_INTERNED:
10056 /* XXX Shouldn't happen */
10057 break;
10058 case SSTATE_INTERNED_IMMORTAL:
10059 Py_REFCNT(s) += 1;
10060 immortal_size += s->length;
10061 break;
10062 case SSTATE_INTERNED_MORTAL:
10063 Py_REFCNT(s) += 2;
10064 mortal_size += s->length;
10065 break;
10066 default:
10067 Py_FatalError("Inconsistent interned string state.");
10068 }
10069 s->state = SSTATE_NOT_INTERNED;
10070 }
10071 fprintf(stderr, "total size of all interned strings: "
10072 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10073 "mortal/immortal\n", mortal_size, immortal_size);
10074 Py_DECREF(keys);
10075 PyDict_Clear(interned);
10076 Py_DECREF(interned);
10077 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010078}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010079
10080
10081/********************* Unicode Iterator **************************/
10082
/* Iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next character to yield */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
10088
10089static void
10090unicodeiter_dealloc(unicodeiterobject *it)
10091{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010092 _PyObject_GC_UNTRACK(it);
10093 Py_XDECREF(it->it_seq);
10094 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010095}
10096
10097static int
10098unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10099{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010100 Py_VISIT(it->it_seq);
10101 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010102}
10103
10104static PyObject *
10105unicodeiter_next(unicodeiterobject *it)
10106{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010107 PyUnicodeObject *seq;
10108 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010109
Benjamin Peterson14339b62009-01-31 16:36:08 +000010110 assert(it != NULL);
10111 seq = it->it_seq;
10112 if (seq == NULL)
10113 return NULL;
10114 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010115
Benjamin Peterson14339b62009-01-31 16:36:08 +000010116 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10117 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010118 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010119 if (item != NULL)
10120 ++it->it_index;
10121 return item;
10122 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010123
Benjamin Peterson14339b62009-01-31 16:36:08 +000010124 Py_DECREF(seq);
10125 it->it_seq = NULL;
10126 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010127}
10128
10129static PyObject *
10130unicodeiter_len(unicodeiterobject *it)
10131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010132 Py_ssize_t len = 0;
10133 if (it->it_seq)
10134 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10135 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010136}
10137
/* Docstring shared by the __length_hint__ method below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of str_iterator. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
10145
/* Type object of str_iterator, the iterator created by unicode_iter().
   Instances are allocated and tracked through the GC API
   (Py_TPFLAGS_HAVE_GC). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
10178
10179static PyObject *
10180unicode_iter(PyObject *seq)
10181{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010182 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010183
Benjamin Peterson14339b62009-01-31 16:36:08 +000010184 if (!PyUnicode_Check(seq)) {
10185 PyErr_BadInternalCall();
10186 return NULL;
10187 }
10188 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10189 if (it == NULL)
10190 return NULL;
10191 it->it_index = 0;
10192 Py_INCREF(seq);
10193 it->it_seq = (PyUnicodeObject *)seq;
10194 _PyObject_GC_TRACK(it);
10195 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010196}
10197
Martin v. Löwis5b222132007-06-10 09:51:05 +000010198size_t
10199Py_UNICODE_strlen(const Py_UNICODE *u)
10200{
10201 int res = 0;
10202 while(*u++)
10203 res++;
10204 return res;
10205}
10206
10207Py_UNICODE*
10208Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10209{
10210 Py_UNICODE *u = s1;
10211 while ((*u++ = *s2++));
10212 return s1;
10213}
10214
10215Py_UNICODE*
10216Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10217{
10218 Py_UNICODE *u = s1;
10219 while ((*u++ = *s2++))
10220 if (n-- == 0)
10221 break;
10222 return s1;
10223}
10224
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010225Py_UNICODE*
10226Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10227{
10228 Py_UNICODE *u1 = s1;
10229 u1 += Py_UNICODE_strlen(u1);
10230 Py_UNICODE_strcpy(u1, s2);
10231 return s1;
10232}
10233
Martin v. Löwis5b222132007-06-10 09:51:05 +000010234int
10235Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10236{
10237 while (*s1 && *s2 && *s1 == *s2)
10238 s1++, s2++;
10239 if (*s1 && *s2)
10240 return (*s1 < *s2) ? -1 : +1;
10241 if (*s1)
10242 return 1;
10243 if (*s2)
10244 return -1;
10245 return 0;
10246}
10247
Victor Stinneref8d95c2010-08-16 22:03:11 +000010248int
10249Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10250{
10251 register Py_UNICODE u1, u2;
10252 for (; n != 0; n--) {
10253 u1 = *s1;
10254 u2 = *s2;
10255 if (u1 != u2)
10256 return (u1 < u2) ? -1 : +1;
10257 if (u1 == '\0')
10258 return 0;
10259 s1++;
10260 s2++;
10261 }
10262 return 0;
10263}
10264
Martin v. Löwis5b222132007-06-10 09:51:05 +000010265Py_UNICODE*
10266Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10267{
10268 const Py_UNICODE *p;
10269 for (p = s; *p; p++)
10270 if (*p == c)
10271 return (Py_UNICODE*)p;
10272 return NULL;
10273}
10274
Victor Stinner331ea922010-08-10 16:37:20 +000010275Py_UNICODE*
10276Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10277{
10278 const Py_UNICODE *p;
10279 p = s + Py_UNICODE_strlen(s);
10280 while (p != s) {
10281 p--;
10282 if (*p == c)
10283 return (Py_UNICODE*)p;
10284 }
10285 return NULL;
10286}
10287
Victor Stinner71133ff2010-09-01 23:43:53 +000010288Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010289PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010290{
10291 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10292 Py_UNICODE *copy;
10293 Py_ssize_t size;
10294
10295 /* Ensure we won't overflow the size. */
10296 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10297 PyErr_NoMemory();
10298 return NULL;
10299 }
10300 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10301 size *= sizeof(Py_UNICODE);
10302 copy = PyMem_Malloc(size);
10303 if (copy == NULL) {
10304 PyErr_NoMemory();
10305 return NULL;
10306 }
10307 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10308 return copy;
10309}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010310
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Module-level functions of _string; each takes exactly one argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}        /* sentinel */
};
10321
/* Module definition of _string. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
10333
10334PyMODINIT_FUNC
10335PyInit__string(void)
10336{
10337 return PyModule_Create(&_string_module);
10338}
10339
10340
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010341#ifdef __cplusplus
10342}
10343#endif