blob: f66773e26a64e17b2375dcc4cd8e20c4996ec6ed [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Upper bound on the number of Unicode objects parked on the free list */

#define PyUnicode_MAXFREELIST       1024

/* Threshold for the free list "keep alive" optimization.

   A Unicode object that is put on the free list keeps its character
   buffer allocated as long as its length is below this limit.  This
   cuts down on malloc() traffic for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Walter Dörwald16807132007-05-25 13:52:07 +000095/* This dictionary holds all interned unicode strings. Note that references
96 to strings in this dictionary are *not* counted in the string's ob_refcnt.
97 When the interned string reaches a refcnt of 0 the string deallocation
98 function will delete the reference from this dictionary.
99
100 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000101 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000102*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000106static PyUnicodeObject *free_list;
107static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
/* Fast detection of the most frequent whitespace characters: a lookup
   table indexed by ordinal, covering 0x00..0x7F (one row per 16 entries). */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00..0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10..0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20..0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30..0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40..0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50..0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60..0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70..0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
146
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147static PyObject *unicode_encode_call_errorhandler(const char *errors,
148 PyObject **errorHandler,const char *encoding, const char *reason,
149 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
150 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
151
Victor Stinner31be90b2010-04-22 19:38:16 +0000152static void raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
157
/* Same for linebreaks: a lookup table indexed by ordinal, covering
   0x00..0x7F (one row per 16 entries).  The table is only ever read,
   so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00..0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10..0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20..0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30..0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40..0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50..0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60..0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70..0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
204
/* the linebreak mask is set up by _PyUnicode_Init() below */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
228Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
229{
230 /* calculate simple bloom-style bitmask for a given unicode string */
231
Antoine Pitrouf068f942010-01-13 14:19:12 +0000232 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233 Py_ssize_t i;
234
235 mask = 0;
236 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000237 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000238
239 return mask;
240}
241
242Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
243{
244 Py_ssize_t i;
245
246 for (i = 0; i < setlen; i++)
247 if (set[i] == chr)
248 return 1;
249
250 return 0;
251}
252
/* True if chr occurs in set[0..setlen-1]; the bloom mask is tested first
   so that most non-members are rejected without scanning the set.  The
   expansion is fully parenthesized so the macro can safely be negated or
   combined with other operators.  Note that chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
255
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256/* --- Unicode Object ----------------------------------------------------- */
257
258static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000260 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261{
262 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000263
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000264 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000266 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268 /* Resizing shared object (unicode_empty or single character
269 objects) in-place is not allowed. Use PyUnicode_Resize()
270 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000271
Benjamin Peterson14339b62009-01-31 16:36:08 +0000272 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000273 (unicode->length == 1 &&
274 unicode->str[0] < 256U &&
275 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000277 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 return -1;
279 }
280
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281 /* We allocate one more byte to make sure the string is Ux0000 terminated.
282 The overallocation is also used by fastsearch, which assumes that it's
283 safe to look at str[length] (without making any assumptions about what
284 it contains). */
285
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000287 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 PyErr_NoMemory();
292 return -1;
293 }
294 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296
Benjamin Peterson29060642009-01-31 22:14:21 +0000297 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000300 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 }
302 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000303
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 return 0;
305}
306
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Thomas Wouters477c8d52006-05-27 19:21:47 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000376 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000377 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000379
Benjamin Peterson29060642009-01-31 22:14:21 +0000380 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000381 /* XXX UNREF/NEWREF interface should be more symmetrical */
382 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000384 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386}
387
388static
Guido van Rossum9475a232001-10-05 20:51:39 +0000389void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390{
Walter Dörwald16807132007-05-25 13:52:07 +0000391 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000392 case SSTATE_NOT_INTERNED:
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_MORTAL:
396 /* revive dead object temporarily for DelItem */
397 Py_REFCNT(unicode) = 3;
398 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
399 Py_FatalError(
400 "deletion of interned string failed");
401 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000402
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_INTERNED_IMMORTAL:
404 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 default:
407 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000408 }
409
Guido van Rossum604ddf82001-12-06 20:03:56 +0000410 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000412 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
414 PyObject_DEL(unicode->str);
415 unicode->str = NULL;
416 unicode->length = 0;
417 }
418 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000419 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 }
421 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000422 *(PyUnicodeObject **)unicode = free_list;
423 free_list = unicode;
424 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 }
426 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000427 PyObject_DEL(unicode->str);
428 Py_XDECREF(unicode->defenc);
429 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430 }
431}
432
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000433static
434int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435{
436 register PyUnicodeObject *v;
437
438 /* Argument checks */
439 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000440 PyErr_BadInternalCall();
441 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000443 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000444 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000445 PyErr_BadInternalCall();
446 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 }
448
449 /* Resizing unicode_empty and single character objects is not
450 possible since these are being shared. We simply return a fresh
451 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000452 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000453 (v == unicode_empty || v->length == 1)) {
454 PyUnicodeObject *w = _PyUnicode_New(length);
455 if (w == NULL)
456 return -1;
457 Py_UNICODE_COPY(w->str, v->str,
458 length < v->length ? length : v->length);
459 Py_DECREF(*unicode);
460 *unicode = w;
461 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000462 }
463
464 /* Note that we don't have to modify *unicode for unshared Unicode
465 objects, since we can modify them in-place. */
466 return unicode_resize(v, length);
467}
468
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000469int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
470{
471 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
472}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000473
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000475 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476{
477 PyUnicodeObject *unicode;
478
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479 /* If the Unicode data is known at construction time, we can apply
480 some optimizations which share commonly used objects. */
481 if (u != NULL) {
482
Benjamin Peterson29060642009-01-31 22:14:21 +0000483 /* Optimization for empty strings */
484 if (size == 0 && unicode_empty != NULL) {
485 Py_INCREF(unicode_empty);
486 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000487 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000488
489 /* Single character Unicode objects in the Latin-1 range are
490 shared when using this constructor */
491 if (size == 1 && *u < 256) {
492 unicode = unicode_latin1[*u];
493 if (!unicode) {
494 unicode = _PyUnicode_New(1);
495 if (!unicode)
496 return NULL;
497 unicode->str[0] = *u;
498 unicode_latin1[*u] = unicode;
499 }
500 Py_INCREF(unicode);
501 return (PyObject *)unicode;
502 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000503 }
Tim Petersced69f82003-09-16 20:30:58 +0000504
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 unicode = _PyUnicode_New(size);
506 if (!unicode)
507 return NULL;
508
509 /* Copy the Unicode data into the new object */
510 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000511 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512
513 return (PyObject *)unicode;
514}
515
Walter Dörwaldd2034312007-05-18 16:29:38 +0000516PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000517{
518 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000519
Benjamin Peterson14339b62009-01-31 16:36:08 +0000520 if (size < 0) {
521 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000522 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000523 return NULL;
524 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000525
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000526 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000527 some optimizations which share commonly used objects.
528 Also, this means the input must be UTF-8, so fall back to the
529 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 if (u != NULL) {
531
Benjamin Peterson29060642009-01-31 22:14:21 +0000532 /* Optimization for empty strings */
533 if (size == 0 && unicode_empty != NULL) {
534 Py_INCREF(unicode_empty);
535 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000536 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000537
538 /* Single characters are shared when using this constructor.
539 Restrict to ASCII, since the input must be UTF-8. */
540 if (size == 1 && Py_CHARMASK(*u) < 128) {
541 unicode = unicode_latin1[Py_CHARMASK(*u)];
542 if (!unicode) {
543 unicode = _PyUnicode_New(1);
544 if (!unicode)
545 return NULL;
546 unicode->str[0] = Py_CHARMASK(*u);
547 unicode_latin1[Py_CHARMASK(*u)] = unicode;
548 }
549 Py_INCREF(unicode);
550 return (PyObject *)unicode;
551 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000552
553 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000554 }
555
Walter Dörwald55507312007-05-18 13:12:10 +0000556 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000557 if (!unicode)
558 return NULL;
559
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000560 return (PyObject *)unicode;
561}
562
Walter Dörwaldd2034312007-05-18 16:29:38 +0000563PyObject *PyUnicode_FromString(const char *u)
564{
565 size_t size = strlen(u);
566 if (size > PY_SSIZE_T_MAX) {
567 PyErr_SetString(PyExc_OverflowError, "input too long");
568 return NULL;
569 }
570
571 return PyUnicode_FromStringAndSize(u, size);
572}
573
Guido van Rossumd57fd912000-03-10 22:53:23 +0000574#ifdef HAVE_WCHAR_H
575
/* Py_UNICODE is 2 bytes wide but wchar_t is known to be 4 bytes wide:
   wchar_t values above 0xFFFF have to be split into surrogate pairs. */
#if Py_UNICODE_SIZE == 2
# if defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
#  define CONVERT_WCHAR_TO_SURROGATES
# endif
#endif
579
580#ifdef CONVERT_WCHAR_TO_SURROGATES
581
582/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
583 to convert from UTF32 to UTF16. */
584
585PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
586 Py_ssize_t size)
587{
588 PyUnicodeObject *unicode;
589 register Py_ssize_t i;
590 Py_ssize_t alloc;
591 const wchar_t *orig_w;
592
593 if (w == NULL) {
594 if (size == 0)
595 return PyUnicode_FromStringAndSize(NULL, 0);
596 PyErr_BadInternalCall();
597 return NULL;
598 }
599
600 if (size == -1) {
601 size = wcslen(w);
602 }
603
604 alloc = size;
605 orig_w = w;
606 for (i = size; i > 0; i--) {
607 if (*w > 0xFFFF)
608 alloc++;
609 w++;
610 }
611 w = orig_w;
612 unicode = _PyUnicode_New(alloc);
613 if (!unicode)
614 return NULL;
615
616 /* Copy the wchar_t data into the new object */
617 {
618 register Py_UNICODE *u;
619 u = PyUnicode_AS_UNICODE(unicode);
620 for (i = size; i > 0; i--) {
621 if (*w > 0xFFFF) {
622 wchar_t ordinal = *w++;
623 ordinal -= 0x10000;
624 *u++ = 0xD800 | (ordinal >> 10);
625 *u++ = 0xDC00 | (ordinal & 0x3FF);
626 }
627 else
628 *u++ = *w++;
629 }
630 }
631 return (PyObject *)unicode;
632}
633
634#else
635
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638{
639 PyUnicodeObject *unicode;
640
641 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000642 if (size == 0)
643 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000644 PyErr_BadInternalCall();
645 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646 }
647
Martin v. Löwis790465f2008-04-05 20:41:37 +0000648 if (size == -1) {
649 size = wcslen(w);
650 }
651
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652 unicode = _PyUnicode_New(size);
653 if (!unicode)
654 return NULL;
655
656 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000657#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000659#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000660 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000661 register Py_UNICODE *u;
662 register Py_ssize_t i;
663 u = PyUnicode_AS_UNICODE(unicode);
664 for (i = size; i > 0; i--)
665 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 }
667#endif
668
669 return (PyObject *)unicode;
670}
671
Mark Dickinson081dfee2009-03-18 14:47:41 +0000672#endif /* CONVERT_WCHAR_TO_SURROGATES */
673
674#undef CONVERT_WCHAR_TO_SURROGATES
675
Walter Dörwald346737f2007-05-31 10:44:43 +0000676static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000677makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
678 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000679{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000680 *fmt++ = '%';
681 if (width) {
682 if (zeropad)
683 *fmt++ = '0';
684 fmt += sprintf(fmt, "%d", width);
685 }
686 if (precision)
687 fmt += sprintf(fmt, ".%d", precision);
688 if (longflag)
689 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000690 else if (longlongflag) {
691 /* longlongflag should only ever be nonzero on machines with
692 HAVE_LONG_LONG defined */
693#ifdef HAVE_LONG_LONG
694 char *f = PY_FORMAT_LONG_LONG;
695 while (*f)
696 *fmt++ = *f++;
697#else
698 /* we shouldn't ever get here */
699 assert(0);
700 *fmt++ = 'l';
701#endif
702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000703 else if (size_tflag) {
704 char *f = PY_FORMAT_SIZE_T;
705 while (*f)
706 *fmt++ = *f++;
707 }
708 *fmt++ = c;
709 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000710}
711
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
713
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000714/* size of fixed-size buffer for formatting single arguments */
715#define ITEM_BUFFER_LEN 21
716/* maximum number of characters required for output of %ld. 21 characters
717 allows for 64-bit integers (in decimal) and an optional sign. */
718#define MAX_LONG_CHARS 21
719/* maximum number of characters required for output of %lld.
720 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
721 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
722#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
723
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724PyObject *
725PyUnicode_FromFormatV(const char *format, va_list vargs)
726{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000727 va_list count;
728 Py_ssize_t callcount = 0;
729 PyObject **callresults = NULL;
730 PyObject **callresult = NULL;
731 Py_ssize_t n = 0;
732 int width = 0;
733 int precision = 0;
734 int zeropad;
735 const char* f;
736 Py_UNICODE *s;
737 PyObject *string;
738 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000739 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000740 /* use abuffer instead of buffer, if we need more space
741 * (which can happen if there's a format specifier with width). */
742 char *abuffer = NULL;
743 char *realbuffer;
744 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000745 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000748 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000749 /* step 1: count the number of %S/%R/%A/%s format specifications
750 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
751 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
752 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000753 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000754 if (*f == '%') {
755 if (*(f+1)=='%')
756 continue;
757 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
758 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000759 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000761 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000762 ;
763 if (*f == 's')
764 ++callcount;
765 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000766 else if (128 <= (unsigned char)*f) {
767 PyErr_Format(PyExc_ValueError,
768 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000769 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000771 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000772 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 }
774 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000775 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000776 if (callcount) {
777 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
778 if (!callresults) {
779 PyErr_NoMemory();
780 return NULL;
781 }
782 callresult = callresults;
783 }
784 /* step 3: figure out how large a buffer we need */
785 for (f = format; *f; f++) {
786 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000787#ifdef HAVE_LONG_LONG
788 int longlongflag = 0;
789#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000790 const char* p = f;
791 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000792 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000794 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000796
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
798 * they don't affect the amount of space we reserve.
799 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800 if (*f == 'l') {
801 if (f[1] == 'd' || f[1] == 'u') {
802 ++f;
803 }
804#ifdef HAVE_LONG_LONG
805 else if (f[1] == 'l' &&
806 (f[2] == 'd' || f[2] == 'u')) {
807 longlongflag = 1;
808 f += 2;
809 }
810#endif
811 }
812 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000813 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000814 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000815
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 switch (*f) {
817 case 'c':
818 (void)va_arg(count, int);
819 /* fall through... */
820 case '%':
821 n++;
822 break;
823 case 'd': case 'u': case 'i': case 'x':
824 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000825#ifdef HAVE_LONG_LONG
826 if (longlongflag) {
827 if (width < MAX_LONG_LONG_CHARS)
828 width = MAX_LONG_LONG_CHARS;
829 }
830 else
831#endif
832 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
833 including sign. Decimal takes the most space. This
834 isn't enough for octal. If a width is specified we
835 need more (which we allocate later). */
836 if (width < MAX_LONG_CHARS)
837 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000838 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000839 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (abuffersize < width)
841 abuffersize = width;
842 break;
843 case 's':
844 {
845 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000846 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000847 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
848 if (!str)
849 goto fail;
850 n += PyUnicode_GET_SIZE(str);
851 /* Remember the str and switch to the next slot */
852 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 break;
854 }
855 case 'U':
856 {
857 PyObject *obj = va_arg(count, PyObject *);
858 assert(obj && PyUnicode_Check(obj));
859 n += PyUnicode_GET_SIZE(obj);
860 break;
861 }
862 case 'V':
863 {
864 PyObject *obj = va_arg(count, PyObject *);
865 const char *str = va_arg(count, const char *);
866 assert(obj || str);
867 assert(!obj || PyUnicode_Check(obj));
868 if (obj)
869 n += PyUnicode_GET_SIZE(obj);
870 else
871 n += strlen(str);
872 break;
873 }
874 case 'S':
875 {
876 PyObject *obj = va_arg(count, PyObject *);
877 PyObject *str;
878 assert(obj);
879 str = PyObject_Str(obj);
880 if (!str)
881 goto fail;
882 n += PyUnicode_GET_SIZE(str);
883 /* Remember the str and switch to the next slot */
884 *callresult++ = str;
885 break;
886 }
887 case 'R':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *repr;
891 assert(obj);
892 repr = PyObject_Repr(obj);
893 if (!repr)
894 goto fail;
895 n += PyUnicode_GET_SIZE(repr);
896 /* Remember the repr and switch to the next slot */
897 *callresult++ = repr;
898 break;
899 }
900 case 'A':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *ascii;
904 assert(obj);
905 ascii = PyObject_ASCII(obj);
906 if (!ascii)
907 goto fail;
908 n += PyUnicode_GET_SIZE(ascii);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = ascii;
911 break;
912 }
913 case 'p':
914 (void) va_arg(count, int);
915 /* maximum 64-bit pointer representation:
916 * 0xffffffffffffffff
917 * so 19 characters is enough.
918 * XXX I count 18 -- what's the extra for?
919 */
920 n += 19;
921 break;
922 default:
923 /* if we stumble upon an unknown
924 formatting code, copy the rest of
925 the format string to the output
926 string. (we cannot just skip the
927 code, since there's no way to know
928 what's in the argument list) */
929 n += strlen(p);
930 goto expand;
931 }
932 } else
933 n++;
934 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000935 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000936 if (abuffersize > ITEM_BUFFER_LEN) {
937 /* add 1 for sprintf's trailing null byte */
938 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000939 if (!abuffer) {
940 PyErr_NoMemory();
941 goto fail;
942 }
943 realbuffer = abuffer;
944 }
945 else
946 realbuffer = buffer;
947 /* step 4: fill the buffer */
948 /* Since we've analyzed how much space we need for the worst case,
949 we don't have to resize the string.
950 There can be no errors beyond this point. */
951 string = PyUnicode_FromUnicode(NULL, n);
952 if (!string)
953 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000954
Benjamin Peterson14339b62009-01-31 16:36:08 +0000955 s = PyUnicode_AS_UNICODE(string);
956 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000957
Benjamin Peterson14339b62009-01-31 16:36:08 +0000958 for (f = format; *f; f++) {
959 if (*f == '%') {
960 const char* p = f++;
961 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000962 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000963 int size_tflag = 0;
964 zeropad = (*f == '0');
965 /* parse the width.precision part */
966 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000967 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 width = (width*10) + *f++ - '0';
969 precision = 0;
970 if (*f == '.') {
971 f++;
David Malcolm96960882010-11-05 17:23:41 +0000972 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000973 precision = (precision*10) + *f++ - '0';
974 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 /* Handle %ld, %lu, %lld and %llu. */
976 if (*f == 'l') {
977 if (f[1] == 'd' || f[1] == 'u') {
978 longflag = 1;
979 ++f;
980 }
981#ifdef HAVE_LONG_LONG
982 else if (f[1] == 'l' &&
983 (f[2] == 'd' || f[2] == 'u')) {
984 longlongflag = 1;
985 f += 2;
986 }
987#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000988 }
989 /* handle the size_t flag. */
990 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
991 size_tflag = 1;
992 ++f;
993 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000994
Benjamin Peterson14339b62009-01-31 16:36:08 +0000995 switch (*f) {
996 case 'c':
997 *s++ = va_arg(vargs, int);
998 break;
999 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001000 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1001 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (longflag)
1003 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001004#ifdef HAVE_LONG_LONG
1005 else if (longlongflag)
1006 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1007#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 else if (size_tflag)
1009 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1010 else
1011 sprintf(realbuffer, fmt, va_arg(vargs, int));
1012 appendstring(realbuffer);
1013 break;
1014 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001015 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1016 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001017 if (longflag)
1018 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001019#ifdef HAVE_LONG_LONG
1020 else if (longlongflag)
1021 sprintf(realbuffer, fmt, va_arg(vargs,
1022 unsigned PY_LONG_LONG));
1023#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001024 else if (size_tflag)
1025 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1026 else
1027 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1028 appendstring(realbuffer);
1029 break;
1030 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001031 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001032 sprintf(realbuffer, fmt, va_arg(vargs, int));
1033 appendstring(realbuffer);
1034 break;
1035 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001036 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 sprintf(realbuffer, fmt, va_arg(vargs, int));
1038 appendstring(realbuffer);
1039 break;
1040 case 's':
1041 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001042 /* unused, since we already have the result */
1043 (void) va_arg(vargs, char *);
1044 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1045 PyUnicode_GET_SIZE(*callresult));
1046 s += PyUnicode_GET_SIZE(*callresult);
1047 /* We're done with the unicode()/repr() => forget it */
1048 Py_DECREF(*callresult);
1049 /* switch to next unicode()/repr() result */
1050 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 break;
1052 }
1053 case 'U':
1054 {
1055 PyObject *obj = va_arg(vargs, PyObject *);
1056 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1058 s += size;
1059 break;
1060 }
1061 case 'V':
1062 {
1063 PyObject *obj = va_arg(vargs, PyObject *);
1064 const char *str = va_arg(vargs, const char *);
1065 if (obj) {
1066 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1067 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1068 s += size;
1069 } else {
1070 appendstring(str);
1071 }
1072 break;
1073 }
1074 case 'S':
1075 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001076 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001077 {
1078 Py_UNICODE *ucopy;
1079 Py_ssize_t usize;
1080 Py_ssize_t upos;
1081 /* unused, since we already have the result */
1082 (void) va_arg(vargs, PyObject *);
1083 ucopy = PyUnicode_AS_UNICODE(*callresult);
1084 usize = PyUnicode_GET_SIZE(*callresult);
1085 for (upos = 0; upos<usize;)
1086 *s++ = ucopy[upos++];
1087 /* We're done with the unicode()/repr() => forget it */
1088 Py_DECREF(*callresult);
1089 /* switch to next unicode()/repr() result */
1090 ++callresult;
1091 break;
1092 }
1093 case 'p':
1094 sprintf(buffer, "%p", va_arg(vargs, void*));
1095 /* %p is ill-defined: ensure leading 0x. */
1096 if (buffer[1] == 'X')
1097 buffer[1] = 'x';
1098 else if (buffer[1] != 'x') {
1099 memmove(buffer+2, buffer, strlen(buffer)+1);
1100 buffer[0] = '0';
1101 buffer[1] = 'x';
1102 }
1103 appendstring(buffer);
1104 break;
1105 case '%':
1106 *s++ = '%';
1107 break;
1108 default:
1109 appendstring(p);
1110 goto end;
1111 }
Victor Stinner1205f272010-09-11 00:54:47 +00001112 }
Victor Stinner1205f272010-09-11 00:54:47 +00001113 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001114 *s++ = *f;
1115 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 if (callresults)
1119 PyObject_Free(callresults);
1120 if (abuffer)
1121 PyObject_Free(abuffer);
1122 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1123 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 if (callresults) {
1126 PyObject **callresult2 = callresults;
1127 while (callresult2 < callresult) {
1128 Py_DECREF(*callresult2);
1129 ++callresult2;
1130 }
1131 PyObject_Free(callresults);
1132 }
1133 if (abuffer)
1134 PyObject_Free(abuffer);
1135 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001136}
1137
1138#undef appendstring
1139
1140PyObject *
1141PyUnicode_FromFormat(const char *format, ...)
1142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 PyObject* ret;
1144 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001145
1146#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001147 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001150#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 ret = PyUnicode_FromFormatV(format, vargs);
1152 va_end(vargs);
1153 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001154}
1155
Victor Stinner5593d8a2010-10-02 11:11:27 +00001156/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1157 convert a Unicode object to a wide character string.
1158
1159 - If w is NULL: return the number of wide characters (including the nul
1160 character) required to convert the unicode object. Ignore size argument.
1161
1162 - Otherwise: return the number of wide characters (excluding the nul
1163 character) written into w. Write at most size wide characters (including
1164 the nul character). */
1165static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001166unicode_aswidechar(PyUnicodeObject *unicode,
1167 wchar_t *w,
1168 Py_ssize_t size)
1169{
1170#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001171 Py_ssize_t res;
1172 if (w != NULL) {
1173 res = PyUnicode_GET_SIZE(unicode);
1174 if (size > res)
1175 size = res + 1;
1176 else
1177 res = size;
1178 memcpy(w, unicode->str, size * sizeof(wchar_t));
1179 return res;
1180 }
1181 else
1182 return PyUnicode_GET_SIZE(unicode) + 1;
1183#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1184 register const Py_UNICODE *u;
1185 const Py_UNICODE *uend;
1186 const wchar_t *worig, *wend;
1187 Py_ssize_t nchar;
1188
Victor Stinner137c34c2010-09-29 10:25:54 +00001189 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001190 uend = u + PyUnicode_GET_SIZE(unicode);
1191 if (w != NULL) {
1192 worig = w;
1193 wend = w + size;
1194 while (u != uend && w != wend) {
1195 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1196 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1197 {
1198 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1199 u += 2;
1200 }
1201 else {
1202 *w = *u;
1203 u++;
1204 }
1205 w++;
1206 }
1207 if (w != wend)
1208 *w = L'\0';
1209 return w - worig;
1210 }
1211 else {
1212 nchar = 1; /* nul character at the end */
1213 while (u != uend) {
1214 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216 u += 2;
1217 else
1218 u++;
1219 nchar++;
1220 }
1221 }
1222 return nchar;
1223#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1224 register Py_UNICODE *u, *uend, ordinal;
1225 register Py_ssize_t i;
1226 wchar_t *worig, *wend;
1227 Py_ssize_t nchar;
1228
1229 u = PyUnicode_AS_UNICODE(unicode);
1230 uend = u + PyUnicode_GET_SIZE(u);
1231 if (w != NULL) {
1232 worig = w;
1233 wend = w + size;
1234 while (u != uend && w != wend) {
1235 ordinal = *u;
1236 if (ordinal > 0xffff) {
1237 ordinal -= 0x10000;
1238 *w++ = 0xD800 | (ordinal >> 10);
1239 *w++ = 0xDC00 | (ordinal & 0x3FF);
1240 }
1241 else
1242 *w++ = ordinal;
1243 u++;
1244 }
1245 if (w != wend)
1246 *w = 0;
1247 return w - worig;
1248 }
1249 else {
1250 nchar = 1; /* nul character */
1251 while (u != uend) {
1252 if (*u > 0xffff)
1253 nchar += 2;
1254 else
1255 nchar++;
1256 u++;
1257 }
1258 return nchar;
1259 }
1260#else
1261# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001262#endif
1263}
1264
1265Py_ssize_t
1266PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1267 wchar_t *w,
1268 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269{
1270 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001271 PyErr_BadInternalCall();
1272 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00001274 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275}
1276
Victor Stinner137c34c2010-09-29 10:25:54 +00001277wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001278PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001279 Py_ssize_t *size)
1280{
1281 wchar_t* buffer;
1282 Py_ssize_t buflen;
1283
1284 if (unicode == NULL) {
1285 PyErr_BadInternalCall();
1286 return NULL;
1287 }
1288
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001289 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001290 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001291 PyErr_NoMemory();
1292 return NULL;
1293 }
1294
Victor Stinner137c34c2010-09-29 10:25:54 +00001295 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1296 if (buffer == NULL) {
1297 PyErr_NoMemory();
1298 return NULL;
1299 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001300 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001301 if (size != NULL)
1302 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001303 return buffer;
1304}
1305
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306#endif
1307
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001308PyObject *PyUnicode_FromOrdinal(int ordinal)
1309{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001310 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001311
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001312 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001313 PyErr_SetString(PyExc_ValueError,
1314 "chr() arg not in range(0x110000)");
1315 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001316 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001317
1318#ifndef Py_UNICODE_WIDE
1319 if (ordinal > 0xffff) {
1320 ordinal -= 0x10000;
1321 s[0] = 0xD800 | (ordinal >> 10);
1322 s[1] = 0xDC00 | (ordinal & 0x3FF);
1323 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001324 }
1325#endif
1326
Hye-Shik Chang40574832004-04-06 07:24:51 +00001327 s[0] = (Py_UNICODE)ordinal;
1328 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001329}
1330
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331PyObject *PyUnicode_FromObject(register PyObject *obj)
1332{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001333 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001334 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001335 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001336 Py_INCREF(obj);
1337 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001338 }
1339 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 /* For a Unicode subtype that's not a Unicode object,
1341 return a true Unicode object with the same data. */
1342 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1343 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001344 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001345 PyErr_Format(PyExc_TypeError,
1346 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001347 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001348 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001349}
1350
1351PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001352 const char *encoding,
1353 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001354{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001355 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001356 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001357
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 PyErr_BadInternalCall();
1360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001362
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001363 /* Decoding bytes objects is the most common case and should be fast */
1364 if (PyBytes_Check(obj)) {
1365 if (PyBytes_GET_SIZE(obj) == 0) {
1366 Py_INCREF(unicode_empty);
1367 v = (PyObject *) unicode_empty;
1368 }
1369 else {
1370 v = PyUnicode_Decode(
1371 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1372 encoding, errors);
1373 }
1374 return v;
1375 }
1376
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001377 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001378 PyErr_SetString(PyExc_TypeError,
1379 "decoding str is not supported");
1380 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001381 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001382
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001383 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1384 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1385 PyErr_Format(PyExc_TypeError,
1386 "coercing to str: need bytes, bytearray "
1387 "or buffer-like object, %.80s found",
1388 Py_TYPE(obj)->tp_name);
1389 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001390 }
Tim Petersced69f82003-09-16 20:30:58 +00001391
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001392 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001393 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001394 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395 }
Tim Petersced69f82003-09-16 20:30:58 +00001396 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001397 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001398
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001399 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001400 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401}
1402
/* Copy encoding into lower, converting upper case letters to lower case and
   replacing '_' with '-', in order to catch e.g. UTF_8.

   lower_len is the capacity of lower, including room for the trailing nul.
   Return 0 on error (encoding is longer than lower_len-1), 1 on success.
   On error the contents of lower are not nul-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last usable slot: reserved for the trailing nul */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1435
1436PyObject *PyUnicode_Decode(const char *s,
1437 Py_ssize_t size,
1438 const char *encoding,
1439 const char *errors)
1440{
1441 PyObject *buffer = NULL, *unicode;
1442 Py_buffer info;
1443 char lower[11]; /* Enough for any encoding shortcut */
1444
1445 if (encoding == NULL)
1446 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001447
1448 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001449 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1450 if (strcmp(lower, "utf-8") == 0)
1451 return PyUnicode_DecodeUTF8(s, size, errors);
1452 else if ((strcmp(lower, "latin-1") == 0) ||
1453 (strcmp(lower, "iso-8859-1") == 0))
1454 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001455#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001456 else if (strcmp(lower, "mbcs") == 0)
1457 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001458#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001459 else if (strcmp(lower, "ascii") == 0)
1460 return PyUnicode_DecodeASCII(s, size, errors);
1461 else if (strcmp(lower, "utf-16") == 0)
1462 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1463 else if (strcmp(lower, "utf-32") == 0)
1464 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466
1467 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001468 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001469 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001470 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001471 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 if (buffer == NULL)
1473 goto onError;
1474 unicode = PyCodec_Decode(buffer, encoding, errors);
1475 if (unicode == NULL)
1476 goto onError;
1477 if (!PyUnicode_Check(unicode)) {
1478 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001479 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001480 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 Py_DECREF(unicode);
1482 goto onError;
1483 }
1484 Py_DECREF(buffer);
1485 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001486
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488 Py_XDECREF(buffer);
1489 return NULL;
1490}
1491
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001492PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1493 const char *encoding,
1494 const char *errors)
1495{
1496 PyObject *v;
1497
1498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 goto onError;
1501 }
1502
1503 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001505
1506 /* Decode via the codec registry */
1507 v = PyCodec_Decode(unicode, encoding, errors);
1508 if (v == NULL)
1509 goto onError;
1510 return v;
1511
Benjamin Peterson29060642009-01-31 22:14:21 +00001512 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001513 return NULL;
1514}
1515
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001516PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1517 const char *encoding,
1518 const char *errors)
1519{
1520 PyObject *v;
1521
1522 if (!PyUnicode_Check(unicode)) {
1523 PyErr_BadArgument();
1524 goto onError;
1525 }
1526
1527 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001528 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001529
1530 /* Decode via the codec registry */
1531 v = PyCodec_Decode(unicode, encoding, errors);
1532 if (v == NULL)
1533 goto onError;
1534 if (!PyUnicode_Check(v)) {
1535 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001536 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001537 Py_TYPE(v)->tp_name);
1538 Py_DECREF(v);
1539 goto onError;
1540 }
1541 return v;
1542
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001544 return NULL;
1545}
1546
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001548 Py_ssize_t size,
1549 const char *encoding,
1550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001551{
1552 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001553
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554 unicode = PyUnicode_FromUnicode(s, size);
1555 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1558 Py_DECREF(unicode);
1559 return v;
1560}
1561
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001562PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1563 const char *encoding,
1564 const char *errors)
1565{
1566 PyObject *v;
1567
1568 if (!PyUnicode_Check(unicode)) {
1569 PyErr_BadArgument();
1570 goto onError;
1571 }
1572
1573 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001575
1576 /* Encode via the codec registry */
1577 v = PyCodec_Encode(unicode, encoding, errors);
1578 if (v == NULL)
1579 goto onError;
1580 return v;
1581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001583 return NULL;
1584}
1585
Victor Stinnerad158722010-10-27 00:25:46 +00001586PyObject *
1587PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001588{
Victor Stinner313a1202010-06-11 23:56:51 +00001589#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001590 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1591 PyUnicode_GET_SIZE(unicode),
1592 NULL);
1593#elif defined(__APPLE__)
1594 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1595 PyUnicode_GET_SIZE(unicode),
1596 "surrogateescape");
1597#else
1598 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001599 return PyUnicode_AsEncodedString(unicode,
1600 Py_FileSystemDefaultEncoding,
1601 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001602 }
1603 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001604 /* locale encoding with surrogateescape */
1605 wchar_t *wchar;
1606 char *bytes;
1607 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001608 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001609
1610 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1611 if (wchar == NULL)
1612 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001613 bytes = _Py_wchar2char(wchar, &error_pos);
1614 if (bytes == NULL) {
1615 if (error_pos != (size_t)-1) {
1616 char *errmsg = strerror(errno);
1617 PyObject *exc = NULL;
1618 if (errmsg == NULL)
1619 errmsg = "Py_wchar2char() failed";
1620 raise_encode_exception(&exc,
1621 "filesystemencoding",
1622 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1623 error_pos, error_pos+1,
1624 errmsg);
1625 Py_XDECREF(exc);
1626 }
1627 else
1628 PyErr_NoMemory();
1629 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001630 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001631 }
1632 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001633
1634 bytes_obj = PyBytes_FromString(bytes);
1635 PyMem_Free(bytes);
1636 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001637 }
Victor Stinnerad158722010-10-27 00:25:46 +00001638#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001639}
1640
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1642 const char *encoding,
1643 const char *errors)
1644{
1645 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001646 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001647
Guido van Rossumd57fd912000-03-10 22:53:23 +00001648 if (!PyUnicode_Check(unicode)) {
1649 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651 }
Fred Drakee4315f52000-05-09 19:53:39 +00001652
Tim Petersced69f82003-09-16 20:30:58 +00001653 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001654 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001655
1656 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001657 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1658 if (strcmp(lower, "utf-8") == 0)
1659 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1660 PyUnicode_GET_SIZE(unicode),
1661 errors);
1662 else if ((strcmp(lower, "latin-1") == 0) ||
1663 (strcmp(lower, "iso-8859-1") == 0))
1664 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1665 PyUnicode_GET_SIZE(unicode),
1666 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001667#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001668 else if (strcmp(lower, "mbcs") == 0)
1669 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1670 PyUnicode_GET_SIZE(unicode),
1671 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001672#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001673 else if (strcmp(lower, "ascii") == 0)
1674 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1675 PyUnicode_GET_SIZE(unicode),
1676 errors);
1677 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001678 /* During bootstrap, we may need to find the encodings
1679 package, to load the file system encoding, and require the
1680 file system encoding in order to load the encodings
1681 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001682
Victor Stinner59e62db2010-05-15 13:14:32 +00001683 Break out of this dependency by assuming that the path to
1684 the encodings module is ASCII-only. XXX could try wcstombs
1685 instead, if the file system encoding is the locale's
1686 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001687 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001688 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1689 !PyThreadState_GET()->interp->codecs_initialized)
1690 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1691 PyUnicode_GET_SIZE(unicode),
1692 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693
1694 /* Encode via the codec registry */
1695 v = PyCodec_Encode(unicode, encoding, errors);
1696 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001697 return NULL;
1698
1699 /* The normal path */
1700 if (PyBytes_Check(v))
1701 return v;
1702
1703 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001704 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001705 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001706 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001707
1708 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1709 "encoder %s returned bytearray instead of bytes",
1710 encoding);
1711 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001712 Py_DECREF(v);
1713 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001714 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001715
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001716 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1717 Py_DECREF(v);
1718 return b;
1719 }
1720
1721 PyErr_Format(PyExc_TypeError,
1722 "encoder did not return a bytes object (type=%.400s)",
1723 Py_TYPE(v)->tp_name);
1724 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001725 return NULL;
1726}
1727
1728PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1729 const char *encoding,
1730 const char *errors)
1731{
1732 PyObject *v;
1733
1734 if (!PyUnicode_Check(unicode)) {
1735 PyErr_BadArgument();
1736 goto onError;
1737 }
1738
1739 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001740 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001741
1742 /* Encode via the codec registry */
1743 v = PyCodec_Encode(unicode, encoding, errors);
1744 if (v == NULL)
1745 goto onError;
1746 if (!PyUnicode_Check(v)) {
1747 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001748 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001749 Py_TYPE(v)->tp_name);
1750 Py_DECREF(v);
1751 goto onError;
1752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001754
Benjamin Peterson29060642009-01-31 22:14:21 +00001755 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 return NULL;
1757}
1758
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001759PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001760 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001761{
1762 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001763 if (v)
1764 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001765 if (errors != NULL)
1766 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001767 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001768 PyUnicode_GET_SIZE(unicode),
1769 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001770 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001771 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001772 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001773 return v;
1774}
1775
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001776PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001777PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001778 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001779 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1780}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001781
Christian Heimes5894ba72007-11-04 11:43:14 +00001782PyObject*
1783PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1784{
Victor Stinnerad158722010-10-27 00:25:46 +00001785#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1786 return PyUnicode_DecodeMBCS(s, size, NULL);
1787#elif defined(__APPLE__)
1788 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1789#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001790 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1791 can be undefined. If it is case, decode using UTF-8. The following assumes
1792 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1793 bootstrapping process where the codecs aren't ready yet.
1794 */
1795 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001796 return PyUnicode_Decode(s, size,
1797 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001798 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001799 }
1800 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001801 /* locale encoding with surrogateescape */
1802 wchar_t *wchar;
1803 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001804 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001805
1806 if (s[size] != '\0' || size != strlen(s)) {
1807 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1808 return NULL;
1809 }
1810
Victor Stinner168e1172010-10-16 23:16:16 +00001811 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001812 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001813 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001814
Victor Stinner168e1172010-10-16 23:16:16 +00001815 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001816 PyMem_Free(wchar);
1817 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001818 }
Victor Stinnerad158722010-10-27 00:25:46 +00001819#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001820}
1821
Martin v. Löwis011e8422009-05-05 04:43:17 +00001822
1823int
1824PyUnicode_FSConverter(PyObject* arg, void* addr)
1825{
1826 PyObject *output = NULL;
1827 Py_ssize_t size;
1828 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001829 if (arg == NULL) {
1830 Py_DECREF(*(PyObject**)addr);
1831 return 1;
1832 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001833 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001834 output = arg;
1835 Py_INCREF(output);
1836 }
1837 else {
1838 arg = PyUnicode_FromObject(arg);
1839 if (!arg)
1840 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001841 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001842 Py_DECREF(arg);
1843 if (!output)
1844 return 0;
1845 if (!PyBytes_Check(output)) {
1846 Py_DECREF(output);
1847 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1848 return 0;
1849 }
1850 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001851 size = PyBytes_GET_SIZE(output);
1852 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001853 if (size != strlen(data)) {
1854 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1855 Py_DECREF(output);
1856 return 0;
1857 }
1858 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001859 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001860}
1861
1862
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001863int
1864PyUnicode_FSDecoder(PyObject* arg, void* addr)
1865{
1866 PyObject *output = NULL;
1867 Py_ssize_t size;
1868 void *data;
1869 if (arg == NULL) {
1870 Py_DECREF(*(PyObject**)addr);
1871 return 1;
1872 }
1873 if (PyUnicode_Check(arg)) {
1874 output = arg;
1875 Py_INCREF(output);
1876 }
1877 else {
1878 arg = PyBytes_FromObject(arg);
1879 if (!arg)
1880 return 0;
1881 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1882 PyBytes_GET_SIZE(arg));
1883 Py_DECREF(arg);
1884 if (!output)
1885 return 0;
1886 if (!PyUnicode_Check(output)) {
1887 Py_DECREF(output);
1888 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1889 return 0;
1890 }
1891 }
1892 size = PyUnicode_GET_SIZE(output);
1893 data = PyUnicode_AS_UNICODE(output);
1894 if (size != Py_UNICODE_strlen(data)) {
1895 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1896 Py_DECREF(output);
1897 return 0;
1898 }
1899 *(PyObject**)addr = output;
1900 return Py_CLEANUP_SUPPORTED;
1901}
1902
1903
Martin v. Löwis5b222132007-06-10 09:51:05 +00001904char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001905_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001906{
Christian Heimesf3863112007-11-22 07:46:41 +00001907 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001908 if (!PyUnicode_Check(unicode)) {
1909 PyErr_BadArgument();
1910 return NULL;
1911 }
Christian Heimesf3863112007-11-22 07:46:41 +00001912 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1913 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001914 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001915 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001916 *psize = PyBytes_GET_SIZE(bytes);
1917 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001918}
1919
1920char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001921_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001922{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001923 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001924}
1925
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1927{
1928 if (!PyUnicode_Check(unicode)) {
1929 PyErr_BadArgument();
1930 goto onError;
1931 }
1932 return PyUnicode_AS_UNICODE(unicode);
1933
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 return NULL;
1936}
1937
Martin v. Löwis18e16552006-02-15 17:27:45 +00001938Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939{
1940 if (!PyUnicode_Check(unicode)) {
1941 PyErr_BadArgument();
1942 goto onError;
1943 }
1944 return PyUnicode_GET_SIZE(unicode);
1945
Benjamin Peterson29060642009-01-31 22:14:21 +00001946 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 return -1;
1948}
1949
/* Name of the default encoding; always the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1954
Victor Stinner554f3f02010-06-16 23:33:54 +00001955/* create or adjust a UnicodeDecodeError */
1956static void
1957make_decode_exception(PyObject **exceptionObject,
1958 const char *encoding,
1959 const char *input, Py_ssize_t length,
1960 Py_ssize_t startpos, Py_ssize_t endpos,
1961 const char *reason)
1962{
1963 if (*exceptionObject == NULL) {
1964 *exceptionObject = PyUnicodeDecodeError_Create(
1965 encoding, input, length, startpos, endpos, reason);
1966 }
1967 else {
1968 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1969 goto onError;
1970 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1971 goto onError;
1972 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1973 goto onError;
1974 }
1975 return;
1976
1977onError:
1978 Py_DECREF(*exceptionObject);
1979 *exceptionObject = NULL;
1980}
1981
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001982/* error handling callback helper:
1983 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001984 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 and adjust various state variables.
1986 return 0 on success, -1 on error
1987*/
1988
1989static
1990int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001991 const char *encoding, const char *reason,
1992 const char **input, const char **inend, Py_ssize_t *startinpos,
1993 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1994 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001995{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001996 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001997
1998 PyObject *restuple = NULL;
1999 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002000 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002001 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002002 Py_ssize_t requiredsize;
2003 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002005 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002006 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002007 int res = -1;
2008
2009 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002010 *errorHandler = PyCodec_LookupError(errors);
2011 if (*errorHandler == NULL)
2012 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 }
2014
Victor Stinner554f3f02010-06-16 23:33:54 +00002015 make_decode_exception(exceptionObject,
2016 encoding,
2017 *input, *inend - *input,
2018 *startinpos, *endinpos,
2019 reason);
2020 if (*exceptionObject == NULL)
2021 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002022
2023 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2024 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002025 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002027 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002028 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 }
2030 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002032
2033 /* Copy back the bytes variables, which might have been modified by the
2034 callback */
2035 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2036 if (!inputobj)
2037 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002038 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002039 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002040 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002041 *input = PyBytes_AS_STRING(inputobj);
2042 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002043 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002044 /* we can DECREF safely, as the exception has another reference,
2045 so the object won't go away. */
2046 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002049 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002050 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002051 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2052 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002053 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054
2055 /* need more space? (at least enough for what we
2056 have+the replacement+the rest of the string (starting
2057 at the new input position), so we won't have to check space
2058 when there are no errors in the rest of the string) */
2059 repptr = PyUnicode_AS_UNICODE(repunicode);
2060 repsize = PyUnicode_GET_SIZE(repunicode);
2061 requiredsize = *outpos + repsize + insize-newpos;
2062 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002063 if (requiredsize<2*outsize)
2064 requiredsize = 2*outsize;
2065 if (_PyUnicode_Resize(output, requiredsize) < 0)
2066 goto onError;
2067 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002068 }
2069 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002070 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071 Py_UNICODE_COPY(*outptr, repptr, repsize);
2072 *outptr += repsize;
2073 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 /* we made it! */
2076 res = 0;
2077
Benjamin Peterson29060642009-01-31 22:14:21 +00002078 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 Py_XDECREF(restuple);
2080 return res;
2081}
2082
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002083/* --- UTF-7 Codec -------------------------------------------------------- */
2084
Antoine Pitrou244651a2009-05-04 18:56:13 +00002085/* See RFC2152 for details. We encode conservatively and decode liberally. */
2086
/* Base-64 helper macros for the UTF-7 codec.  All of them may evaluate
   their argument more than once. */

/* IS_BASE64: non-zero when c is a letter, a digit, '+' or '/'. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of c.  Only meaningful when
   IS_BASE64(c) holds; any other input falls through to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n; higher bits
   are masked off. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every value up to 127
 * qualifies except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
2117
/* Classification of the 128 ASCII codes for the UTF-7 encoder, which
 * treats Set D, Set O, whitespace and everything else differently (see
 * RFC2152).  Values stored in the table:
 *
 *   0 : "Set D"       alphanumeric and '(),-./:?
 *   1 : "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  ht nl cr sp
 *   3 : special (must be base64 encoded)
 *                     everything else, i.e. +\~ and the non-printing
 *                     codes 0-8 11-12 14-31 127
 */

static
char utf7_category[128] = {
    /* nul soh stx etx eot enq ack bel  bs  ht  nl  vt  np  cr  so  si */
        3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    /* dle dc1 dc2 dc3 dc4 nak syn etb can  em sub esc  fs  gs  rs  us */
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    /*  sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
        2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    /*   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    /*   @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
        1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    /*   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
        1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /*   p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~ del */
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: non-zero when character c may be written to the output
 * as itself.  Only codes 1..127 can qualify.  Set D always does;
 * directWS additionally admits whitespace and directO additionally
 * admits Set O.  RFC2152 leaves both choices to the application, hence
 * the two flags.  c may be evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002164PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002165 Py_ssize_t size,
2166 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002167{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002168 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2169}
2170
Antoine Pitrou244651a2009-05-04 18:56:13 +00002171/* The decoder. The only state we preserve is our read position,
2172 * i.e. how many characters we have consumed. So if we end in the
2173 * middle of a shift sequence we have to back off the read position
2174 * and the output to the beginning of the sequence, otherwise we lose
2175 * all the shift state (seen bits, number of bits seen, high
2176 * surrogate). */
2177
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002178PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002179 Py_ssize_t size,
2180 const char *errors,
2181 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002182{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002184 Py_ssize_t startinpos;
2185 Py_ssize_t endinpos;
2186 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002187 const char *e;
2188 PyUnicodeObject *unicode;
2189 Py_UNICODE *p;
2190 const char *errmsg = "";
2191 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002192 Py_UNICODE *shiftOutStart;
2193 unsigned int base64bits = 0;
2194 unsigned long base64buffer = 0;
2195 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002196 PyObject *errorHandler = NULL;
2197 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002198
2199 unicode = _PyUnicode_New(size);
2200 if (!unicode)
2201 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002202 if (size == 0) {
2203 if (consumed)
2204 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002205 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002206 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002207
2208 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002209 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002210 e = s + size;
2211
2212 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002213 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002214 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002215 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002216
Antoine Pitrou244651a2009-05-04 18:56:13 +00002217 if (inShift) { /* in a base-64 section */
2218 if (IS_BASE64(ch)) { /* consume a base-64 character */
2219 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2220 base64bits += 6;
2221 s++;
2222 if (base64bits >= 16) {
2223 /* we have enough bits for a UTF-16 value */
2224 Py_UNICODE outCh = (Py_UNICODE)
2225 (base64buffer >> (base64bits-16));
2226 base64bits -= 16;
2227 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2228 if (surrogate) {
2229 /* expecting a second surrogate */
2230 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2231#ifdef Py_UNICODE_WIDE
2232 *p++ = (((surrogate & 0x3FF)<<10)
2233 | (outCh & 0x3FF)) + 0x10000;
2234#else
2235 *p++ = surrogate;
2236 *p++ = outCh;
2237#endif
2238 surrogate = 0;
2239 }
2240 else {
2241 surrogate = 0;
2242 errmsg = "second surrogate missing";
2243 goto utf7Error;
2244 }
2245 }
2246 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2247 /* first surrogate */
2248 surrogate = outCh;
2249 }
2250 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2251 errmsg = "unexpected second surrogate";
2252 goto utf7Error;
2253 }
2254 else {
2255 *p++ = outCh;
2256 }
2257 }
2258 }
2259 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002260 inShift = 0;
2261 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002262 if (surrogate) {
2263 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002264 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002265 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002266 if (base64bits > 0) { /* left-over bits */
2267 if (base64bits >= 6) {
2268 /* We've seen at least one base-64 character */
2269 errmsg = "partial character in shift sequence";
2270 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002271 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002272 else {
2273 /* Some bits remain; they should be zero */
2274 if (base64buffer != 0) {
2275 errmsg = "non-zero padding bits in shift sequence";
2276 goto utf7Error;
2277 }
2278 }
2279 }
2280 if (ch != '-') {
2281 /* '-' is absorbed; other terminating
2282 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002283 *p++ = ch;
2284 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002285 }
2286 }
2287 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002288 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002289 s++; /* consume '+' */
2290 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002291 s++;
2292 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 }
2294 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002295 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002296 shiftOutStart = p;
2297 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002298 }
2299 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002300 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002301 *p++ = ch;
2302 s++;
2303 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002304 else {
2305 startinpos = s-starts;
2306 s++;
2307 errmsg = "unexpected special character";
2308 goto utf7Error;
2309 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002310 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002311utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 outpos = p-PyUnicode_AS_UNICODE(unicode);
2313 endinpos = s-starts;
2314 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002315 errors, &errorHandler,
2316 "utf7", errmsg,
2317 &starts, &e, &startinpos, &endinpos, &exc, &s,
2318 &unicode, &outpos, &p))
2319 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002320 }
2321
Antoine Pitrou244651a2009-05-04 18:56:13 +00002322 /* end of string */
2323
2324 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2325 /* if we're in an inconsistent state, that's an error */
2326 if (surrogate ||
2327 (base64bits >= 6) ||
2328 (base64bits > 0 && base64buffer != 0)) {
2329 outpos = p-PyUnicode_AS_UNICODE(unicode);
2330 endinpos = size;
2331 if (unicode_decode_call_errorhandler(
2332 errors, &errorHandler,
2333 "utf7", "unterminated shift sequence",
2334 &starts, &e, &startinpos, &endinpos, &exc, &s,
2335 &unicode, &outpos, &p))
2336 goto onError;
2337 if (s < e)
2338 goto restart;
2339 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002340 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002341
2342 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002343 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002344 if (inShift) {
2345 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002346 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002347 }
2348 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002349 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002350 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002351 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002352
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002353 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002354 goto onError;
2355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002358 return (PyObject *)unicode;
2359
Benjamin Peterson29060642009-01-31 22:14:21 +00002360 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 Py_XDECREF(errorHandler);
2362 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002363 Py_DECREF(unicode);
2364 return NULL;
2365}
2366
2367
2368PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002369 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002370 int base64SetO,
2371 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002372 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002373{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002374 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002375 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002376 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002377 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002378 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002379 unsigned int base64bits = 0;
2380 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002381 char * out;
2382 char * start;
2383
2384 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002385 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002386
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002387 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002388 return PyErr_NoMemory();
2389
Antoine Pitrou244651a2009-05-04 18:56:13 +00002390 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002391 if (v == NULL)
2392 return NULL;
2393
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002394 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002395 for (;i < size; ++i) {
2396 Py_UNICODE ch = s[i];
2397
Antoine Pitrou244651a2009-05-04 18:56:13 +00002398 if (inShift) {
2399 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2400 /* shifting out */
2401 if (base64bits) { /* output remaining bits */
2402 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2403 base64buffer = 0;
2404 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002405 }
2406 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002407 /* Characters not in the BASE64 set implicitly unshift the sequence
2408 so no '-' is required, except if the character is itself a '-' */
2409 if (IS_BASE64(ch) || ch == '-') {
2410 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002411 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002412 *out++ = (char) ch;
2413 }
2414 else {
2415 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002416 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002418 else { /* not in a shift sequence */
2419 if (ch == '+') {
2420 *out++ = '+';
2421 *out++ = '-';
2422 }
2423 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2424 *out++ = (char) ch;
2425 }
2426 else {
2427 *out++ = '+';
2428 inShift = 1;
2429 goto encode_char;
2430 }
2431 }
2432 continue;
2433encode_char:
2434#ifdef Py_UNICODE_WIDE
2435 if (ch >= 0x10000) {
2436 /* code first surrogate */
2437 base64bits += 16;
2438 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2439 while (base64bits >= 6) {
2440 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2441 base64bits -= 6;
2442 }
2443 /* prepare second surrogate */
2444 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2445 }
2446#endif
2447 base64bits += 16;
2448 base64buffer = (base64buffer << 16) | ch;
2449 while (base64bits >= 6) {
2450 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2451 base64bits -= 6;
2452 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002453 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002454 if (base64bits)
2455 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2456 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002457 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002458 if (_PyBytes_Resize(&v, out - start) < 0)
2459 return NULL;
2460 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002461}
2462
Antoine Pitrou244651a2009-05-04 18:56:13 +00002463#undef IS_BASE64
2464#undef FROM_BASE64
2465#undef TO_BASE64
2466#undef DECODE_DIRECT
2467#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002468
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469/* --- UTF-8 Codec -------------------------------------------------------- */
2470
/* Map a UTF-8 lead byte to the total length, in bytes, of the sequence
   it introduces.  Zero means the byte is not a legal lead byte
   (continuation bytes 80-BF, overlong prefixes C0-C1, and F5-FF).
   See RFC 3629 for details.  The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2492
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 Py_ssize_t size,
2495 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496{
Walter Dörwald69652032004-09-07 20:24:22 +00002497 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2498}
2499
Antoine Pitrouab868312009-01-10 15:40:25 +00002500/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2501#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2502
2503/* Mask to quickly check whether a C 'long' contains a
2504 non-ASCII, UTF8-encoded char. */
2505#if (SIZEOF_LONG == 8)
2506# define ASCII_CHAR_MASK 0x8080808080808080L
2507#elif (SIZEOF_LONG == 4)
2508# define ASCII_CHAR_MASK 0x80808080L
2509#else
2510# error C 'long' size should be either 4 or 8!
2511#endif
2512
Walter Dörwald69652032004-09-07 20:24:22 +00002513PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002514 Py_ssize_t size,
2515 const char *errors,
2516 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002517{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002518 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002520 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002521 Py_ssize_t startinpos;
2522 Py_ssize_t endinpos;
2523 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002524 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 PyUnicodeObject *unicode;
2526 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002527 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002528 PyObject *errorHandler = NULL;
2529 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530
2531 /* Note: size will always be longer than the resulting Unicode
2532 character count */
2533 unicode = _PyUnicode_New(size);
2534 if (!unicode)
2535 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002536 if (size == 0) {
2537 if (consumed)
2538 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541
2542 /* Unpack UTF-8 encoded data */
2543 p = unicode->str;
2544 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002545 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546
2547 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002548 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549
2550 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002551 /* Fast path for runs of ASCII characters. Given that common UTF-8
2552 input will consist of an overwhelming majority of ASCII
2553 characters, we try to optimize for this case by checking
2554 as many characters as a C 'long' can contain.
2555 First, check if we can do an aligned read, as most CPUs have
2556 a penalty for unaligned reads.
2557 */
2558 if (!((size_t) s & LONG_PTR_MASK)) {
2559 /* Help register allocation */
2560 register const char *_s = s;
2561 register Py_UNICODE *_p = p;
2562 while (_s < aligned_end) {
2563 /* Read a whole long at a time (either 4 or 8 bytes),
2564 and do a fast unrolled copy if it only contains ASCII
2565 characters. */
2566 unsigned long data = *(unsigned long *) _s;
2567 if (data & ASCII_CHAR_MASK)
2568 break;
2569 _p[0] = (unsigned char) _s[0];
2570 _p[1] = (unsigned char) _s[1];
2571 _p[2] = (unsigned char) _s[2];
2572 _p[3] = (unsigned char) _s[3];
2573#if (SIZEOF_LONG == 8)
2574 _p[4] = (unsigned char) _s[4];
2575 _p[5] = (unsigned char) _s[5];
2576 _p[6] = (unsigned char) _s[6];
2577 _p[7] = (unsigned char) _s[7];
2578#endif
2579 _s += SIZEOF_LONG;
2580 _p += SIZEOF_LONG;
2581 }
2582 s = _s;
2583 p = _p;
2584 if (s == e)
2585 break;
2586 ch = (unsigned char)*s;
2587 }
2588 }
2589
2590 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002591 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 s++;
2593 continue;
2594 }
2595
2596 n = utf8_code_length[ch];
2597
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002598 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002599 if (consumed)
2600 break;
2601 else {
2602 errmsg = "unexpected end of data";
2603 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002604 endinpos = startinpos+1;
2605 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2606 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 goto utf8Error;
2608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 switch (n) {
2612
2613 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002614 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002615 startinpos = s-starts;
2616 endinpos = startinpos+1;
2617 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618
2619 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002620 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002621 startinpos = s-starts;
2622 endinpos = startinpos+1;
2623 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624
2625 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002626 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002627 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002628 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002629 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002630 goto utf8Error;
2631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002633 assert ((ch > 0x007F) && (ch <= 0x07FF));
2634 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 break;
2636
2637 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002638 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2639 will result in surrogates in range d800-dfff. Surrogates are
2640 not valid UTF-8 so they are rejected.
2641 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2642 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002643 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002644 (s[2] & 0xc0) != 0x80 ||
2645 ((unsigned char)s[0] == 0xE0 &&
2646 (unsigned char)s[1] < 0xA0) ||
2647 ((unsigned char)s[0] == 0xED &&
2648 (unsigned char)s[1] > 0x9F)) {
2649 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002650 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002651 endinpos = startinpos + 1;
2652
2653 /* if s[1] first two bits are 1 and 0, then the invalid
2654 continuation byte is s[2], so increment endinpos by 1,
2655 if not, s[1] is invalid and endinpos doesn't need to
2656 be incremented. */
2657 if ((s[1] & 0xC0) == 0x80)
2658 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002659 goto utf8Error;
2660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002662 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2663 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002664 break;
2665
2666 case 4:
2667 if ((s[1] & 0xc0) != 0x80 ||
2668 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002669 (s[3] & 0xc0) != 0x80 ||
2670 ((unsigned char)s[0] == 0xF0 &&
2671 (unsigned char)s[1] < 0x90) ||
2672 ((unsigned char)s[0] == 0xF4 &&
2673 (unsigned char)s[1] > 0x8F)) {
2674 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002675 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002676 endinpos = startinpos + 1;
2677 if ((s[1] & 0xC0) == 0x80) {
2678 endinpos++;
2679 if ((s[2] & 0xC0) == 0x80)
2680 endinpos++;
2681 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 goto utf8Error;
2683 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002684 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002685 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2686 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2687
Fredrik Lundh8f455852001-06-27 18:59:43 +00002688#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002690#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002691 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002692
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002693 /* translate from 10000..10FFFF to 0..FFFF */
2694 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002695
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002696 /* high surrogate = top 10 bits added to D800 */
2697 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002698
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002699 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002700 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002701#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 }
2704 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002705 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002706
Benjamin Peterson29060642009-01-31 22:14:21 +00002707 utf8Error:
2708 outpos = p-PyUnicode_AS_UNICODE(unicode);
2709 if (unicode_decode_call_errorhandler(
2710 errors, &errorHandler,
2711 "utf8", errmsg,
2712 &starts, &e, &startinpos, &endinpos, &exc, &s,
2713 &unicode, &outpos, &p))
2714 goto onError;
2715 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 }
Walter Dörwald69652032004-09-07 20:24:22 +00002717 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719
2720 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002721 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 goto onError;
2723
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 Py_XDECREF(errorHandler);
2725 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 return (PyObject *)unicode;
2727
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 Py_DECREF(unicode);
2732 return NULL;
2733}
2734
Antoine Pitrouab868312009-01-10 15:40:25 +00002735#undef ASCII_CHAR_MASK
2736
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002737#ifdef __APPLE__
2738
2739/* Simplified UTF-8 decoder using surrogateescape error handler,
2740 used to decode the command line arguments on Mac OS X. */
2741
2742wchar_t*
2743_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2744{
2745 int n;
2746 const char *e;
2747 wchar_t *unicode, *p;
2748
2749 /* Note: size will always be longer than the resulting Unicode
2750 character count */
2751 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2752 PyErr_NoMemory();
2753 return NULL;
2754 }
2755 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2756 if (!unicode)
2757 return NULL;
2758
2759 /* Unpack UTF-8 encoded data */
2760 p = unicode;
2761 e = s + size;
2762 while (s < e) {
2763 Py_UCS4 ch = (unsigned char)*s;
2764
2765 if (ch < 0x80) {
2766 *p++ = (wchar_t)ch;
2767 s++;
2768 continue;
2769 }
2770
2771 n = utf8_code_length[ch];
2772 if (s + n > e) {
2773 goto surrogateescape;
2774 }
2775
2776 switch (n) {
2777 case 0:
2778 case 1:
2779 goto surrogateescape;
2780
2781 case 2:
2782 if ((s[1] & 0xc0) != 0x80)
2783 goto surrogateescape;
2784 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2785 assert ((ch > 0x007F) && (ch <= 0x07FF));
2786 *p++ = (wchar_t)ch;
2787 break;
2788
2789 case 3:
2790 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2791 will result in surrogates in range d800-dfff. Surrogates are
2792 not valid UTF-8 so they are rejected.
2793 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2794 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2795 if ((s[1] & 0xc0) != 0x80 ||
2796 (s[2] & 0xc0) != 0x80 ||
2797 ((unsigned char)s[0] == 0xE0 &&
2798 (unsigned char)s[1] < 0xA0) ||
2799 ((unsigned char)s[0] == 0xED &&
2800 (unsigned char)s[1] > 0x9F)) {
2801
2802 goto surrogateescape;
2803 }
2804 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2805 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2806 *p++ = (Py_UNICODE)ch;
2807 break;
2808
2809 case 4:
2810 if ((s[1] & 0xc0) != 0x80 ||
2811 (s[2] & 0xc0) != 0x80 ||
2812 (s[3] & 0xc0) != 0x80 ||
2813 ((unsigned char)s[0] == 0xF0 &&
2814 (unsigned char)s[1] < 0x90) ||
2815 ((unsigned char)s[0] == 0xF4 &&
2816 (unsigned char)s[1] > 0x8F)) {
2817 goto surrogateescape;
2818 }
2819 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2820 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2821 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2822
2823#if SIZEOF_WCHAR_T == 4
2824 *p++ = (wchar_t)ch;
2825#else
2826 /* compute and append the two surrogates: */
2827
2828 /* translate from 10000..10FFFF to 0..FFFF */
2829 ch -= 0x10000;
2830
2831 /* high surrogate = top 10 bits added to D800 */
2832 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2833
2834 /* low surrogate = bottom 10 bits added to DC00 */
2835 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2836#endif
2837 break;
2838 }
2839 s += n;
2840 continue;
2841
2842 surrogateescape:
2843 *p++ = 0xDC00 + ch;
2844 s++;
2845 }
2846 *p = L'\0';
2847 return unicode;
2848}
2849
2850#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002851
Tim Peters602f7402002-04-27 18:03:26 +00002852/* Allocation strategy: if the string is short, convert into a stack buffer
2853 and allocate exactly as much space needed at the end. Else allocate the
2854 maximum possible needed (4 result bytes per Unicode character), and return
2855 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002856*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002857PyObject *
2858PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 Py_ssize_t size,
2860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861{
Tim Peters602f7402002-04-27 18:03:26 +00002862#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002863
Guido van Rossum98297ee2007-11-06 21:34:58 +00002864 Py_ssize_t i; /* index into s of next input byte */
2865 PyObject *result; /* result string object */
2866 char *p; /* next free byte in output buffer */
2867 Py_ssize_t nallocated; /* number of result bytes allocated */
2868 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002869 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002870 PyObject *errorHandler = NULL;
2871 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002872
Tim Peters602f7402002-04-27 18:03:26 +00002873 assert(s != NULL);
2874 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875
Tim Peters602f7402002-04-27 18:03:26 +00002876 if (size <= MAX_SHORT_UNICHARS) {
2877 /* Write into the stack buffer; nallocated can't overflow.
2878 * At the end, we'll allocate exactly as much heap space as it
2879 * turns out we need.
2880 */
2881 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002882 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002883 p = stackbuf;
2884 }
2885 else {
2886 /* Overallocate on the heap, and give the excess back at the end. */
2887 nallocated = size * 4;
2888 if (nallocated / 4 != size) /* overflow! */
2889 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002890 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002891 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002892 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002893 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002894 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002895
Tim Peters602f7402002-04-27 18:03:26 +00002896 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002897 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002898
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002899 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002900 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002902
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002904 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002905 *p++ = (char)(0xc0 | (ch >> 6));
2906 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002907 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002908#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002909 /* Special case: check for high and low surrogate */
2910 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2911 Py_UCS4 ch2 = s[i];
2912 /* Combine the two surrogates to form a UCS4 value */
2913 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2914 i++;
2915
2916 /* Encode UCS4 Unicode ordinals */
2917 *p++ = (char)(0xf0 | (ch >> 18));
2918 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002919 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2920 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002921 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002922#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002923 Py_ssize_t newpos;
2924 PyObject *rep;
2925 Py_ssize_t repsize, k;
2926 rep = unicode_encode_call_errorhandler
2927 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2928 s, size, &exc, i-1, i, &newpos);
2929 if (!rep)
2930 goto error;
2931
2932 if (PyBytes_Check(rep))
2933 repsize = PyBytes_GET_SIZE(rep);
2934 else
2935 repsize = PyUnicode_GET_SIZE(rep);
2936
2937 if (repsize > 4) {
2938 Py_ssize_t offset;
2939
2940 if (result == NULL)
2941 offset = p - stackbuf;
2942 else
2943 offset = p - PyBytes_AS_STRING(result);
2944
2945 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2946 /* integer overflow */
2947 PyErr_NoMemory();
2948 goto error;
2949 }
2950 nallocated += repsize - 4;
2951 if (result != NULL) {
2952 if (_PyBytes_Resize(&result, nallocated) < 0)
2953 goto error;
2954 } else {
2955 result = PyBytes_FromStringAndSize(NULL, nallocated);
2956 if (result == NULL)
2957 goto error;
2958 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2959 }
2960 p = PyBytes_AS_STRING(result) + offset;
2961 }
2962
2963 if (PyBytes_Check(rep)) {
2964 char *prep = PyBytes_AS_STRING(rep);
2965 for(k = repsize; k > 0; k--)
2966 *p++ = *prep++;
2967 } else /* rep is unicode */ {
2968 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2969 Py_UNICODE c;
2970
2971 for(k=0; k<repsize; k++) {
2972 c = prep[k];
2973 if (0x80 <= c) {
2974 raise_encode_exception(&exc, "utf-8", s, size,
2975 i-1, i, "surrogates not allowed");
2976 goto error;
2977 }
2978 *p++ = (char)prep[k];
2979 }
2980 }
2981 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002982#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002983 }
Victor Stinner445a6232010-04-22 20:01:57 +00002984#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002985 } else if (ch < 0x10000) {
2986 *p++ = (char)(0xe0 | (ch >> 12));
2987 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2988 *p++ = (char)(0x80 | (ch & 0x3f));
2989 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002990 /* Encode UCS4 Unicode ordinals */
2991 *p++ = (char)(0xf0 | (ch >> 18));
2992 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2993 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2994 *p++ = (char)(0x80 | (ch & 0x3f));
2995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002997
Guido van Rossum98297ee2007-11-06 21:34:58 +00002998 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002999 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003000 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003001 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003002 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003003 }
3004 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003005 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003006 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003007 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003008 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003009 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003010 Py_XDECREF(errorHandler);
3011 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003012 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003013 error:
3014 Py_XDECREF(errorHandler);
3015 Py_XDECREF(exc);
3016 Py_XDECREF(result);
3017 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003018
Tim Peters602f7402002-04-27 18:03:26 +00003019#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020}
3021
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3023{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 if (!PyUnicode_Check(unicode)) {
3025 PyErr_BadArgument();
3026 return NULL;
3027 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003028 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 PyUnicode_GET_SIZE(unicode),
3030 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031}
3032
Walter Dörwald41980ca2007-08-16 21:55:45 +00003033/* --- UTF-32 Codec ------------------------------------------------------- */
3034
3035PyObject *
3036PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 Py_ssize_t size,
3038 const char *errors,
3039 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003040{
3041 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3042}
3043
3044PyObject *
3045PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 Py_ssize_t size,
3047 const char *errors,
3048 int *byteorder,
3049 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003050{
3051 const char *starts = s;
3052 Py_ssize_t startinpos;
3053 Py_ssize_t endinpos;
3054 Py_ssize_t outpos;
3055 PyUnicodeObject *unicode;
3056 Py_UNICODE *p;
3057#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003058 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003059 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003060#else
3061 const int pairs = 0;
3062#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003063 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003064 int bo = 0; /* assume native ordering by default */
3065 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003066 /* Offsets from q for retrieving bytes in the right order. */
3067#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3068 int iorder[] = {0, 1, 2, 3};
3069#else
3070 int iorder[] = {3, 2, 1, 0};
3071#endif
3072 PyObject *errorHandler = NULL;
3073 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003074
Walter Dörwald41980ca2007-08-16 21:55:45 +00003075 q = (unsigned char *)s;
3076 e = q + size;
3077
3078 if (byteorder)
3079 bo = *byteorder;
3080
3081 /* Check for BOM marks (U+FEFF) in the input and adjust current
3082 byte order setting accordingly. In native mode, the leading BOM
3083 mark is skipped, in all other modes, it is copied to the output
3084 stream as-is (giving a ZWNBSP character). */
3085 if (bo == 0) {
3086 if (size >= 4) {
3087 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003089#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003090 if (bom == 0x0000FEFF) {
3091 q += 4;
3092 bo = -1;
3093 }
3094 else if (bom == 0xFFFE0000) {
3095 q += 4;
3096 bo = 1;
3097 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003098#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003099 if (bom == 0x0000FEFF) {
3100 q += 4;
3101 bo = 1;
3102 }
3103 else if (bom == 0xFFFE0000) {
3104 q += 4;
3105 bo = -1;
3106 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003107#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003108 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003109 }
3110
3111 if (bo == -1) {
3112 /* force LE */
3113 iorder[0] = 0;
3114 iorder[1] = 1;
3115 iorder[2] = 2;
3116 iorder[3] = 3;
3117 }
3118 else if (bo == 1) {
3119 /* force BE */
3120 iorder[0] = 3;
3121 iorder[1] = 2;
3122 iorder[2] = 1;
3123 iorder[3] = 0;
3124 }
3125
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003126 /* On narrow builds we split characters outside the BMP into two
3127 codepoints => count how much extra space we need. */
3128#ifndef Py_UNICODE_WIDE
3129 for (qq = q; qq < e; qq += 4)
3130 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3131 pairs++;
3132#endif
3133
3134 /* This might be one to much, because of a BOM */
3135 unicode = _PyUnicode_New((size+3)/4+pairs);
3136 if (!unicode)
3137 return NULL;
3138 if (size == 0)
3139 return (PyObject *)unicode;
3140
3141 /* Unpack UTF-32 encoded data */
3142 p = unicode->str;
3143
Walter Dörwald41980ca2007-08-16 21:55:45 +00003144 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 Py_UCS4 ch;
3146 /* remaining bytes at the end? (size should be divisible by 4) */
3147 if (e-q<4) {
3148 if (consumed)
3149 break;
3150 errmsg = "truncated data";
3151 startinpos = ((const char *)q)-starts;
3152 endinpos = ((const char *)e)-starts;
3153 goto utf32Error;
3154 /* The remaining input chars are ignored if the callback
3155 chooses to skip the input */
3156 }
3157 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3158 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003159
Benjamin Peterson29060642009-01-31 22:14:21 +00003160 if (ch >= 0x110000)
3161 {
3162 errmsg = "codepoint not in range(0x110000)";
3163 startinpos = ((const char *)q)-starts;
3164 endinpos = startinpos+4;
3165 goto utf32Error;
3166 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003167#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 if (ch >= 0x10000)
3169 {
3170 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3171 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3172 }
3173 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003174#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003175 *p++ = ch;
3176 q += 4;
3177 continue;
3178 utf32Error:
3179 outpos = p-PyUnicode_AS_UNICODE(unicode);
3180 if (unicode_decode_call_errorhandler(
3181 errors, &errorHandler,
3182 "utf32", errmsg,
3183 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3184 &unicode, &outpos, &p))
3185 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003186 }
3187
3188 if (byteorder)
3189 *byteorder = bo;
3190
3191 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003192 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003193
3194 /* Adjust length */
3195 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3196 goto onError;
3197
3198 Py_XDECREF(errorHandler);
3199 Py_XDECREF(exc);
3200 return (PyObject *)unicode;
3201
Benjamin Peterson29060642009-01-31 22:14:21 +00003202 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003203 Py_DECREF(unicode);
3204 Py_XDECREF(errorHandler);
3205 Py_XDECREF(exc);
3206 return NULL;
3207}
3208
3209PyObject *
3210PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003211 Py_ssize_t size,
3212 const char *errors,
3213 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003214{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003215 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003216 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003217 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003218#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003219 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003220#else
3221 const int pairs = 0;
3222#endif
3223 /* Offsets from p for storing byte pairs in the right order. */
3224#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3225 int iorder[] = {0, 1, 2, 3};
3226#else
3227 int iorder[] = {3, 2, 1, 0};
3228#endif
3229
Benjamin Peterson29060642009-01-31 22:14:21 +00003230#define STORECHAR(CH) \
3231 do { \
3232 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3233 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3234 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3235 p[iorder[0]] = (CH) & 0xff; \
3236 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003237 } while(0)
3238
3239 /* In narrow builds we can output surrogate pairs as one codepoint,
3240 so we need less space. */
3241#ifndef Py_UNICODE_WIDE
3242 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3244 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3245 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003246#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003247 nsize = (size - pairs + (byteorder == 0));
3248 bytesize = nsize * 4;
3249 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003251 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003252 if (v == NULL)
3253 return NULL;
3254
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003255 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003256 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003258 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003259 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003260
3261 if (byteorder == -1) {
3262 /* force LE */
3263 iorder[0] = 0;
3264 iorder[1] = 1;
3265 iorder[2] = 2;
3266 iorder[3] = 3;
3267 }
3268 else if (byteorder == 1) {
3269 /* force BE */
3270 iorder[0] = 3;
3271 iorder[1] = 2;
3272 iorder[2] = 1;
3273 iorder[3] = 0;
3274 }
3275
3276 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003278#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003279 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3280 Py_UCS4 ch2 = *s;
3281 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3282 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3283 s++;
3284 size--;
3285 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003286 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003287#endif
3288 STORECHAR(ch);
3289 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003290
3291 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003292 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003293#undef STORECHAR
3294}
3295
3296PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3297{
3298 if (!PyUnicode_Check(unicode)) {
3299 PyErr_BadArgument();
3300 return NULL;
3301 }
3302 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 PyUnicode_GET_SIZE(unicode),
3304 NULL,
3305 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003306}
3307
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308/* --- UTF-16 Codec ------------------------------------------------------- */
3309
Tim Peters772747b2001-08-09 22:21:55 +00003310PyObject *
3311PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 Py_ssize_t size,
3313 const char *errors,
3314 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315{
Walter Dörwald69652032004-09-07 20:24:22 +00003316 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3317}
3318
Antoine Pitrouab868312009-01-10 15:40:25 +00003319/* Two masks for fast checking of whether a C 'long' may contain
3320 UTF16-encoded surrogate characters. This is an efficient heuristic,
3321 assuming that non-surrogate characters with a code point >= 0x8000 are
3322 rare in most input.
3323 FAST_CHAR_MASK is used when the input is in native byte ordering,
3324 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003325*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003326#if (SIZEOF_LONG == 8)
3327# define FAST_CHAR_MASK 0x8000800080008000L
3328# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3329#elif (SIZEOF_LONG == 4)
3330# define FAST_CHAR_MASK 0x80008000L
3331# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3332#else
3333# error C 'long' size should be either 4 or 8!
3334#endif
3335
Walter Dörwald69652032004-09-07 20:24:22 +00003336PyObject *
3337PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 Py_ssize_t size,
3339 const char *errors,
3340 int *byteorder,
3341 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003342{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003344 Py_ssize_t startinpos;
3345 Py_ssize_t endinpos;
3346 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 PyUnicodeObject *unicode;
3348 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003349 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003350 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003351 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003352 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003353 /* Offsets from q for retrieving byte pairs in the right order. */
3354#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3355 int ihi = 1, ilo = 0;
3356#else
3357 int ihi = 0, ilo = 1;
3358#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 PyObject *errorHandler = NULL;
3360 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361
3362 /* Note: size will always be longer than the resulting Unicode
3363 character count */
3364 unicode = _PyUnicode_New(size);
3365 if (!unicode)
3366 return NULL;
3367 if (size == 0)
3368 return (PyObject *)unicode;
3369
3370 /* Unpack UTF-16 encoded data */
3371 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003372 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003373 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374
3375 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003376 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003378 /* Check for BOM marks (U+FEFF) in the input and adjust current
3379 byte order setting accordingly. In native mode, the leading BOM
3380 mark is skipped, in all other modes, it is copied to the output
3381 stream as-is (giving a ZWNBSP character). */
3382 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003383 if (size >= 2) {
3384 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003385#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003386 if (bom == 0xFEFF) {
3387 q += 2;
3388 bo = -1;
3389 }
3390 else if (bom == 0xFFFE) {
3391 q += 2;
3392 bo = 1;
3393 }
Tim Petersced69f82003-09-16 20:30:58 +00003394#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 if (bom == 0xFEFF) {
3396 q += 2;
3397 bo = 1;
3398 }
3399 else if (bom == 0xFFFE) {
3400 q += 2;
3401 bo = -1;
3402 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003403#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406
Tim Peters772747b2001-08-09 22:21:55 +00003407 if (bo == -1) {
3408 /* force LE */
3409 ihi = 1;
3410 ilo = 0;
3411 }
3412 else if (bo == 1) {
3413 /* force BE */
3414 ihi = 0;
3415 ilo = 1;
3416 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003417#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3418 native_ordering = ilo < ihi;
3419#else
3420 native_ordering = ilo > ihi;
3421#endif
Tim Peters772747b2001-08-09 22:21:55 +00003422
Antoine Pitrouab868312009-01-10 15:40:25 +00003423 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003424 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003426 /* First check for possible aligned read of a C 'long'. Unaligned
3427 reads are more expensive, better to defer to another iteration. */
3428 if (!((size_t) q & LONG_PTR_MASK)) {
3429 /* Fast path for runs of non-surrogate chars. */
3430 register const unsigned char *_q = q;
3431 Py_UNICODE *_p = p;
3432 if (native_ordering) {
3433 /* Native ordering is simple: as long as the input cannot
3434 possibly contain a surrogate char, do an unrolled copy
3435 of several 16-bit code points to the target object.
3436 The non-surrogate check is done on several input bytes
3437 at a time (as many as a C 'long' can contain). */
3438 while (_q < aligned_end) {
3439 unsigned long data = * (unsigned long *) _q;
3440 if (data & FAST_CHAR_MASK)
3441 break;
3442 _p[0] = ((unsigned short *) _q)[0];
3443 _p[1] = ((unsigned short *) _q)[1];
3444#if (SIZEOF_LONG == 8)
3445 _p[2] = ((unsigned short *) _q)[2];
3446 _p[3] = ((unsigned short *) _q)[3];
3447#endif
3448 _q += SIZEOF_LONG;
3449 _p += SIZEOF_LONG / 2;
3450 }
3451 }
3452 else {
3453 /* Byteswapped ordering is similar, but we must decompose
3454 the copy bytewise, and take care of zero'ing out the
3455 upper bytes if the target object is in 32-bit units
3456 (that is, in UCS-4 builds). */
3457 while (_q < aligned_end) {
3458 unsigned long data = * (unsigned long *) _q;
3459 if (data & SWAPPED_FAST_CHAR_MASK)
3460 break;
3461 /* Zero upper bytes in UCS-4 builds */
3462#if (Py_UNICODE_SIZE > 2)
3463 _p[0] = 0;
3464 _p[1] = 0;
3465#if (SIZEOF_LONG == 8)
3466 _p[2] = 0;
3467 _p[3] = 0;
3468#endif
3469#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003470 /* Issue #4916; UCS-4 builds on big endian machines must
3471 fill the two last bytes of each 4-byte unit. */
3472#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3473# define OFF 2
3474#else
3475# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003476#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003477 ((unsigned char *) _p)[OFF + 1] = _q[0];
3478 ((unsigned char *) _p)[OFF + 0] = _q[1];
3479 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3480 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3481#if (SIZEOF_LONG == 8)
3482 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3483 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3484 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3485 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3486#endif
3487#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003488 _q += SIZEOF_LONG;
3489 _p += SIZEOF_LONG / 2;
3490 }
3491 }
3492 p = _p;
3493 q = _q;
3494 if (q >= e)
3495 break;
3496 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003497 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498
Benjamin Peterson14339b62009-01-31 16:36:08 +00003499 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003500
3501 if (ch < 0xD800 || ch > 0xDFFF) {
3502 *p++ = ch;
3503 continue;
3504 }
3505
3506 /* UTF-16 code pair: */
3507 if (q > e) {
3508 errmsg = "unexpected end of data";
3509 startinpos = (((const char *)q) - 2) - starts;
3510 endinpos = ((const char *)e) + 1 - starts;
3511 goto utf16Error;
3512 }
3513 if (0xD800 <= ch && ch <= 0xDBFF) {
3514 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3515 q += 2;
3516 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003517#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003518 *p++ = ch;
3519 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003520#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003522#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 continue;
3524 }
3525 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003526 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003527 startinpos = (((const char *)q)-4)-starts;
3528 endinpos = startinpos+2;
3529 goto utf16Error;
3530 }
3531
Benjamin Peterson14339b62009-01-31 16:36:08 +00003532 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 errmsg = "illegal encoding";
3534 startinpos = (((const char *)q)-2)-starts;
3535 endinpos = startinpos+2;
3536 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003537
Benjamin Peterson29060642009-01-31 22:14:21 +00003538 utf16Error:
3539 outpos = p - PyUnicode_AS_UNICODE(unicode);
3540 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003541 errors,
3542 &errorHandler,
3543 "utf16", errmsg,
3544 &starts,
3545 (const char **)&e,
3546 &startinpos,
3547 &endinpos,
3548 &exc,
3549 (const char **)&q,
3550 &unicode,
3551 &outpos,
3552 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003555 /* remaining byte at the end? (size should be even) */
3556 if (e == q) {
3557 if (!consumed) {
3558 errmsg = "truncated data";
3559 startinpos = ((const char *)q) - starts;
3560 endinpos = ((const char *)e) + 1 - starts;
3561 outpos = p - PyUnicode_AS_UNICODE(unicode);
3562 if (unicode_decode_call_errorhandler(
3563 errors,
3564 &errorHandler,
3565 "utf16", errmsg,
3566 &starts,
3567 (const char **)&e,
3568 &startinpos,
3569 &endinpos,
3570 &exc,
3571 (const char **)&q,
3572 &unicode,
3573 &outpos,
3574 &p))
3575 goto onError;
3576 /* The remaining input chars are ignored if the callback
3577 chooses to skip the input */
3578 }
3579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580
3581 if (byteorder)
3582 *byteorder = bo;
3583
Walter Dörwald69652032004-09-07 20:24:22 +00003584 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003585 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003586
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003588 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 goto onError;
3590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 Py_XDECREF(errorHandler);
3592 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 return (PyObject *)unicode;
3594
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 Py_XDECREF(errorHandler);
3598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 return NULL;
3600}
3601
Antoine Pitrouab868312009-01-10 15:40:25 +00003602#undef FAST_CHAR_MASK
3603#undef SWAPPED_FAST_CHAR_MASK
3604
Tim Peters772747b2001-08-09 22:21:55 +00003605PyObject *
3606PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003607 Py_ssize_t size,
3608 const char *errors,
3609 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003611 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003612 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003613 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003614#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003615 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003616#else
3617 const int pairs = 0;
3618#endif
Tim Peters772747b2001-08-09 22:21:55 +00003619 /* Offsets from p for storing byte pairs in the right order. */
3620#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3621 int ihi = 1, ilo = 0;
3622#else
3623 int ihi = 0, ilo = 1;
3624#endif
3625
Benjamin Peterson29060642009-01-31 22:14:21 +00003626#define STORECHAR(CH) \
3627 do { \
3628 p[ihi] = ((CH) >> 8) & 0xff; \
3629 p[ilo] = (CH) & 0xff; \
3630 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003631 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003633#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003634 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 if (s[i] >= 0x10000)
3636 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003637#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003638 /* 2 * (size + pairs + (byteorder == 0)) */
3639 if (size > PY_SSIZE_T_MAX ||
3640 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003642 nsize = size + pairs + (byteorder == 0);
3643 bytesize = nsize * 2;
3644 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003646 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 if (v == NULL)
3648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003650 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003653 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003654 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003655
3656 if (byteorder == -1) {
3657 /* force LE */
3658 ihi = 1;
3659 ilo = 0;
3660 }
3661 else if (byteorder == 1) {
3662 /* force BE */
3663 ihi = 0;
3664 ilo = 1;
3665 }
3666
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003667 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 Py_UNICODE ch = *s++;
3669 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003670#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 if (ch >= 0x10000) {
3672 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3673 ch = 0xD800 | ((ch-0x10000) >> 10);
3674 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003675#endif
Tim Peters772747b2001-08-09 22:21:55 +00003676 STORECHAR(ch);
3677 if (ch2)
3678 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003679 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003680
3681 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003682 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003683#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684}
3685
3686PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3687{
3688 if (!PyUnicode_Check(unicode)) {
3689 PyErr_BadArgument();
3690 return NULL;
3691 }
3692 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 PyUnicode_GET_SIZE(unicode),
3694 NULL,
3695 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696}
3697
3698/* --- Unicode Escape Codec ----------------------------------------------- */
3699
/* File-local pointer to a _PyUnicode_Name_CAPI structure; it starts out
   NULL.  NOTE(review): presumably filled in on first use by the
   character-name escape handling further down -- the assignment is not
   visible in this part of the file, confirm before relying on it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003700static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003701
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 Py_ssize_t size,
3704 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003707 Py_ssize_t startinpos;
3708 Py_ssize_t endinpos;
3709 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003714 char* message;
3715 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 PyObject *errorHandler = NULL;
3717 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003718
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 /* Escaped strings will always be longer than the resulting
3720 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 length after conversion to the true value.
3722 (but if the error callback returns a long replacement string
3723 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 v = _PyUnicode_New(size);
3725 if (v == NULL)
3726 goto onError;
3727 if (size == 0)
3728 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003729
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003732
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 while (s < end) {
3734 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003735 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003736 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737
3738 /* Non-escape characters are interpreted as Unicode ordinals */
3739 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003740 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 continue;
3742 }
3743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 /* \ - Escapes */
3746 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003747 c = *s++;
3748 if (s > end)
3749 c = '\0'; /* Invalid after \ */
3750 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751
Benjamin Peterson29060642009-01-31 22:14:21 +00003752 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 case '\n': break;
3754 case '\\': *p++ = '\\'; break;
3755 case '\'': *p++ = '\''; break;
3756 case '\"': *p++ = '\"'; break;
3757 case 'b': *p++ = '\b'; break;
3758 case 'f': *p++ = '\014'; break; /* FF */
3759 case 't': *p++ = '\t'; break;
3760 case 'n': *p++ = '\n'; break;
3761 case 'r': *p++ = '\r'; break;
3762 case 'v': *p++ = '\013'; break; /* VT */
3763 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3764
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 case '0': case '1': case '2': case '3':
3767 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003768 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003769 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003770 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003771 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003772 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003774 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 break;
3776
Benjamin Peterson29060642009-01-31 22:14:21 +00003777 /* hex escapes */
3778 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003780 digits = 2;
3781 message = "truncated \\xXX escape";
3782 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003786 digits = 4;
3787 message = "truncated \\uXXXX escape";
3788 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003791 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003792 digits = 8;
3793 message = "truncated \\UXXXXXXXX escape";
3794 hexescape:
3795 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 outpos = p-PyUnicode_AS_UNICODE(v);
3797 if (s+digits>end) {
3798 endinpos = size;
3799 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003800 errors, &errorHandler,
3801 "unicodeescape", "end of string in escape sequence",
3802 &starts, &end, &startinpos, &endinpos, &exc, &s,
3803 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 goto onError;
3805 goto nextByte;
3806 }
3807 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003808 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003809 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 endinpos = (s+i+1)-starts;
3811 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003812 errors, &errorHandler,
3813 "unicodeescape", message,
3814 &starts, &end, &startinpos, &endinpos, &exc, &s,
3815 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003816 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003818 }
3819 chr = (chr<<4) & ~0xF;
3820 if (c >= '0' && c <= '9')
3821 chr += c - '0';
3822 else if (c >= 'a' && c <= 'f')
3823 chr += 10 + c - 'a';
3824 else
3825 chr += 10 + c - 'A';
3826 }
3827 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003828 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829 /* _decoding_error will have already written into the
3830 target buffer. */
3831 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003832 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003833 /* when we get here, chr is a 32-bit unicode character */
3834 if (chr <= 0xffff)
3835 /* UCS-2 character */
3836 *p++ = (Py_UNICODE) chr;
3837 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003838 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003839 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003840#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003841 *p++ = chr;
3842#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003843 chr -= 0x10000L;
3844 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003845 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003846#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003847 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848 endinpos = s-starts;
3849 outpos = p-PyUnicode_AS_UNICODE(v);
3850 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 errors, &errorHandler,
3852 "unicodeescape", "illegal Unicode character",
3853 &starts, &end, &startinpos, &endinpos, &exc, &s,
3854 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003855 goto onError;
3856 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003857 break;
3858
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003860 case 'N':
3861 message = "malformed \\N character escape";
3862 if (ucnhash_CAPI == NULL) {
3863 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003864 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003865 if (ucnhash_CAPI == NULL)
3866 goto ucnhashError;
3867 }
3868 if (*s == '{') {
3869 const char *start = s+1;
3870 /* look for the closing brace */
3871 while (*s != '}' && s < end)
3872 s++;
3873 if (s > start && s < end && *s == '}') {
3874 /* found a name. look it up in the unicode database */
3875 message = "unknown Unicode character name";
3876 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003877 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003878 goto store;
3879 }
3880 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 endinpos = s-starts;
3882 outpos = p-PyUnicode_AS_UNICODE(v);
3883 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003884 errors, &errorHandler,
3885 "unicodeescape", message,
3886 &starts, &end, &startinpos, &endinpos, &exc, &s,
3887 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003888 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003889 break;
3890
3891 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003892 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 message = "\\ at end of string";
3894 s--;
3895 endinpos = s-starts;
3896 outpos = p-PyUnicode_AS_UNICODE(v);
3897 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003898 errors, &errorHandler,
3899 "unicodeescape", message,
3900 &starts, &end, &startinpos, &endinpos, &exc, &s,
3901 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003902 goto onError;
3903 }
3904 else {
3905 *p++ = '\\';
3906 *p++ = (unsigned char)s[-1];
3907 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003908 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003910 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003913 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003915 Py_XDECREF(errorHandler);
3916 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003918
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003920 PyErr_SetString(
3921 PyExc_UnicodeError,
3922 "\\N escapes not supported (can't load unicodedata module)"
3923 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003924 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 Py_XDECREF(errorHandler);
3926 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003927 return NULL;
3928
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 Py_XDECREF(errorHandler);
3932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 return NULL;
3934}
3935
3936/* Return a Unicode-Escape string version of the Unicode object.
3937
3938 If quotes is true, the string is enclosed in u"" or u'' quotes as
3939 appropriate.
3940
3941*/
3942
Thomas Wouters477c8d52006-05-27 19:21:47 +00003943Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003944 Py_ssize_t size,
3945 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003946{
3947 /* like wcschr, but doesn't stop at NULL characters */
3948
3949 while (size-- > 0) {
3950 if (*s == ch)
3951 return s;
3952 s++;
3953 }
3954
3955 return NULL;
3956}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003957
Walter Dörwald79e913e2007-05-12 11:08:06 +00003958static const char *hexdigits = "0123456789abcdef";
3959
3960PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003961 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003963 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003966#ifdef Py_UNICODE_WIDE
3967 const Py_ssize_t expandsize = 10;
3968#else
3969 const Py_ssize_t expandsize = 6;
3970#endif
3971
Thomas Wouters89f507f2006-12-13 04:49:30 +00003972 /* XXX(nnorwitz): rather than over-allocating, it would be
3973 better to choose a different scheme. Perhaps scan the
3974 first N-chars of the string and allocate based on that size.
3975 */
3976 /* Initial allocation is based on the longest-possible unichr
3977 escape.
3978
3979 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3980 unichr, so in this case it's the longest unichr escape. In
3981 narrow (UTF-16) builds this is five chars per source unichr
3982 since there are two unichrs in the surrogate pair, so in narrow
3983 (UTF-16) builds it's not the longest unichr escape.
3984
3985 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3986 so in the narrow (UTF-16) build case it's the longest unichr
3987 escape.
3988 */
3989
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003990 if (size == 0)
3991 return PyBytes_FromStringAndSize(NULL, 0);
3992
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003993 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003995
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003996 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 2
3998 + expandsize*size
3999 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 if (repr == NULL)
4001 return NULL;
4002
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004003 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 while (size-- > 0) {
4006 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004007
Walter Dörwald79e913e2007-05-12 11:08:06 +00004008 /* Escape backslashes */
4009 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 *p++ = '\\';
4011 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004012 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004013 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004014
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004015#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004016 /* Map 21-bit characters to '\U00xxxxxx' */
4017 else if (ch >= 0x10000) {
4018 *p++ = '\\';
4019 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004020 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4021 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4022 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4023 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4024 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4025 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4026 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4027 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004029 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004030#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4032 else if (ch >= 0xD800 && ch < 0xDC00) {
4033 Py_UNICODE ch2;
4034 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004035
Benjamin Peterson29060642009-01-31 22:14:21 +00004036 ch2 = *s++;
4037 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004038 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4040 *p++ = '\\';
4041 *p++ = 'U';
4042 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4043 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4044 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4045 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4046 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4047 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4048 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4049 *p++ = hexdigits[ucs & 0x0000000F];
4050 continue;
4051 }
4052 /* Fall through: isolated surrogates are copied as-is */
4053 s--;
4054 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004055 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004056#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004057
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004059 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 *p++ = '\\';
4061 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004062 *p++ = hexdigits[(ch >> 12) & 0x000F];
4063 *p++ = hexdigits[(ch >> 8) & 0x000F];
4064 *p++ = hexdigits[(ch >> 4) & 0x000F];
4065 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004067
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004068 /* Map special whitespace to '\t', \n', '\r' */
4069 else if (ch == '\t') {
4070 *p++ = '\\';
4071 *p++ = 't';
4072 }
4073 else if (ch == '\n') {
4074 *p++ = '\\';
4075 *p++ = 'n';
4076 }
4077 else if (ch == '\r') {
4078 *p++ = '\\';
4079 *p++ = 'r';
4080 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004081
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004082 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004083 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004085 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004086 *p++ = hexdigits[(ch >> 4) & 0x000F];
4087 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004088 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004089
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 /* Copy everything else as-is */
4091 else
4092 *p++ = (char) ch;
4093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004095 assert(p - PyBytes_AS_STRING(repr) > 0);
4096 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4097 return NULL;
4098 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099}
4100
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004101PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004103 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 if (!PyUnicode_Check(unicode)) {
4105 PyErr_BadArgument();
4106 return NULL;
4107 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004108 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4109 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004110 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111}
4112
4113/* --- Raw Unicode Escape Codec ------------------------------------------- */
4114
4115PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 Py_ssize_t size,
4117 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004120 Py_ssize_t startinpos;
4121 Py_ssize_t endinpos;
4122 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 const char *end;
4126 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 PyObject *errorHandler = NULL;
4128 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 /* Escaped strings will always be longer than the resulting
4131 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 length after conversion to the true value. (But decoding error
4133 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 v = _PyUnicode_New(size);
4135 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 end = s + size;
4141 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 unsigned char c;
4143 Py_UCS4 x;
4144 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004145 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 /* Non-escape characters are interpreted as Unicode ordinals */
4148 if (*s != '\\') {
4149 *p++ = (unsigned char)*s++;
4150 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004151 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 startinpos = s-starts;
4153
4154 /* \u-escapes are only interpreted iff the number of leading
4155 backslashes if odd */
4156 bs = s;
4157 for (;s < end;) {
4158 if (*s != '\\')
4159 break;
4160 *p++ = (unsigned char)*s++;
4161 }
4162 if (((s - bs) & 1) == 0 ||
4163 s >= end ||
4164 (*s != 'u' && *s != 'U')) {
4165 continue;
4166 }
4167 p--;
4168 count = *s=='u' ? 4 : 8;
4169 s++;
4170
4171 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4172 outpos = p-PyUnicode_AS_UNICODE(v);
4173 for (x = 0, i = 0; i < count; ++i, ++s) {
4174 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004175 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 endinpos = s-starts;
4177 if (unicode_decode_call_errorhandler(
4178 errors, &errorHandler,
4179 "rawunicodeescape", "truncated \\uXXXX",
4180 &starts, &end, &startinpos, &endinpos, &exc, &s,
4181 &v, &outpos, &p))
4182 goto onError;
4183 goto nextByte;
4184 }
4185 x = (x<<4) & ~0xF;
4186 if (c >= '0' && c <= '9')
4187 x += c - '0';
4188 else if (c >= 'a' && c <= 'f')
4189 x += 10 + c - 'a';
4190 else
4191 x += 10 + c - 'A';
4192 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004193 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 /* UCS-2 character */
4195 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004196 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 /* UCS-4 character. Either store directly, or as
4198 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004199#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004201#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 x -= 0x10000L;
4203 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4204 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004205#endif
4206 } else {
4207 endinpos = s-starts;
4208 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004209 if (unicode_decode_call_errorhandler(
4210 errors, &errorHandler,
4211 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 &starts, &end, &startinpos, &endinpos, &exc, &s,
4213 &v, &outpos, &p))
4214 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004215 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 nextByte:
4217 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004219 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 Py_XDECREF(errorHandler);
4222 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004224
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227 Py_XDECREF(errorHandler);
4228 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 return NULL;
4230}
4231
4232PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004235 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 char *p;
4237 char *q;
4238
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004239#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004240 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004241#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004242 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004243#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004244
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004245 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004247
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004248 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 if (repr == NULL)
4250 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004251 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004252 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004254 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 while (size-- > 0) {
4256 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004257#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 /* Map 32-bit characters to '\Uxxxxxxxx' */
4259 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004260 *p++ = '\\';
4261 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004262 *p++ = hexdigits[(ch >> 28) & 0xf];
4263 *p++ = hexdigits[(ch >> 24) & 0xf];
4264 *p++ = hexdigits[(ch >> 20) & 0xf];
4265 *p++ = hexdigits[(ch >> 16) & 0xf];
4266 *p++ = hexdigits[(ch >> 12) & 0xf];
4267 *p++ = hexdigits[(ch >> 8) & 0xf];
4268 *p++ = hexdigits[(ch >> 4) & 0xf];
4269 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004270 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004271 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004272#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004273 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4274 if (ch >= 0xD800 && ch < 0xDC00) {
4275 Py_UNICODE ch2;
4276 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004277
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 ch2 = *s++;
4279 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004280 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4282 *p++ = '\\';
4283 *p++ = 'U';
4284 *p++ = hexdigits[(ucs >> 28) & 0xf];
4285 *p++ = hexdigits[(ucs >> 24) & 0xf];
4286 *p++ = hexdigits[(ucs >> 20) & 0xf];
4287 *p++ = hexdigits[(ucs >> 16) & 0xf];
4288 *p++ = hexdigits[(ucs >> 12) & 0xf];
4289 *p++ = hexdigits[(ucs >> 8) & 0xf];
4290 *p++ = hexdigits[(ucs >> 4) & 0xf];
4291 *p++ = hexdigits[ucs & 0xf];
4292 continue;
4293 }
4294 /* Fall through: isolated surrogates are copied as-is */
4295 s--;
4296 size++;
4297 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004298#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 /* Map 16-bit characters to '\uxxxx' */
4300 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 *p++ = '\\';
4302 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004303 *p++ = hexdigits[(ch >> 12) & 0xf];
4304 *p++ = hexdigits[(ch >> 8) & 0xf];
4305 *p++ = hexdigits[(ch >> 4) & 0xf];
4306 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 /* Copy everything else as-is */
4309 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 *p++ = (char) ch;
4311 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004312 size = p - q;
4313
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004314 assert(size > 0);
4315 if (_PyBytes_Resize(&repr, size) < 0)
4316 return NULL;
4317 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318}
4319
4320PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4321{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004322 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004324 PyErr_BadArgument();
4325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004327 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4328 PyUnicode_GET_SIZE(unicode));
4329
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004330 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331}
4332
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004333/* --- Unicode Internal Codec ------------------------------------------- */
4334
4335PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 Py_ssize_t size,
4337 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004338{
4339 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004340 Py_ssize_t startinpos;
4341 Py_ssize_t endinpos;
4342 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004343 PyUnicodeObject *v;
4344 Py_UNICODE *p;
4345 const char *end;
4346 const char *reason;
4347 PyObject *errorHandler = NULL;
4348 PyObject *exc = NULL;
4349
Neal Norwitzd43069c2006-01-08 01:12:10 +00004350#ifdef Py_UNICODE_WIDE
4351 Py_UNICODE unimax = PyUnicode_GetMax();
4352#endif
4353
Thomas Wouters89f507f2006-12-13 04:49:30 +00004354 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004355 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4356 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004357 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004358 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004360 p = PyUnicode_AS_UNICODE(v);
4361 end = s + size;
4362
4363 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004364 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004365 /* We have to sanity check the raw data, otherwise doom looms for
4366 some malformed UCS-4 data. */
4367 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004368#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004369 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004370#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004371 end-s < Py_UNICODE_SIZE
4372 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004374 startinpos = s - starts;
4375 if (end-s < Py_UNICODE_SIZE) {
4376 endinpos = end-starts;
4377 reason = "truncated input";
4378 }
4379 else {
4380 endinpos = s - starts + Py_UNICODE_SIZE;
4381 reason = "illegal code point (> 0x10FFFF)";
4382 }
4383 outpos = p - PyUnicode_AS_UNICODE(v);
4384 if (unicode_decode_call_errorhandler(
4385 errors, &errorHandler,
4386 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004387 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004388 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004389 goto onError;
4390 }
4391 }
4392 else {
4393 p++;
4394 s += Py_UNICODE_SIZE;
4395 }
4396 }
4397
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004398 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004399 goto onError;
4400 Py_XDECREF(errorHandler);
4401 Py_XDECREF(exc);
4402 return (PyObject *)v;
4403
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004405 Py_XDECREF(v);
4406 Py_XDECREF(errorHandler);
4407 Py_XDECREF(exc);
4408 return NULL;
4409}
4410
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411/* --- Latin-1 Codec ------------------------------------------------------ */
4412
4413PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 Py_ssize_t size,
4415 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416{
4417 PyUnicodeObject *v;
4418 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004419 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004420
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004422 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 Py_UNICODE r = *(unsigned char*)s;
4424 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004425 }
4426
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 v = _PyUnicode_New(size);
4428 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004433 e = s + size;
4434 /* Unrolling the copy makes it much faster by reducing the looping
4435 overhead. This is similar to what many memcpy() implementations do. */
4436 unrolled_end = e - 4;
4437 while (s < unrolled_end) {
4438 p[0] = (unsigned char) s[0];
4439 p[1] = (unsigned char) s[1];
4440 p[2] = (unsigned char) s[2];
4441 p[3] = (unsigned char) s[3];
4442 s += 4;
4443 p += 4;
4444 }
4445 while (s < e)
4446 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004448
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 Py_XDECREF(v);
4451 return NULL;
4452}
4453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454/* create or adjust a UnicodeEncodeError */
4455static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 const char *encoding,
4457 const Py_UNICODE *unicode, Py_ssize_t size,
4458 Py_ssize_t startpos, Py_ssize_t endpos,
4459 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 *exceptionObject = PyUnicodeEncodeError_Create(
4463 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464 }
4465 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4467 goto onError;
4468 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4469 goto onError;
4470 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4471 goto onError;
4472 return;
4473 onError:
4474 Py_DECREF(*exceptionObject);
4475 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
4477}
4478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479/* raises a UnicodeEncodeError */
4480static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 const char *encoding,
4482 const Py_UNICODE *unicode, Py_ssize_t size,
4483 Py_ssize_t startpos, Py_ssize_t endpos,
4484 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485{
4486 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490}
4491
4492/* error handling callback helper:
4493 build arguments, call the callback and check the arguments,
4494 put the result into newpos and return the replacement string, which
4495 has to be freed by the caller */
4496static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 PyObject **errorHandler,
4498 const char *encoding, const char *reason,
4499 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4500 Py_ssize_t startpos, Py_ssize_t endpos,
4501 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004503 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504
4505 PyObject *restuple;
4506 PyObject *resunicode;
4507
4508 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 }
4513
4514 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518
4519 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004524 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 Py_DECREF(restuple);
4526 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004528 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 &resunicode, newpos)) {
4530 Py_DECREF(restuple);
4531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004533 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4534 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4535 Py_DECREF(restuple);
4536 return NULL;
4537 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004540 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4542 Py_DECREF(restuple);
4543 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 Py_INCREF(resunicode);
4546 Py_DECREF(restuple);
4547 return resunicode;
4548}
4549
4550static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 Py_ssize_t size,
4552 const char *errors,
4553 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554{
4555 /* output object */
4556 PyObject *res;
4557 /* pointers to the beginning and end+1 of input */
4558 const Py_UNICODE *startp = p;
4559 const Py_UNICODE *endp = p + size;
4560 /* pointer to the beginning of the unencodable characters */
4561 /* const Py_UNICODE *badp = NULL; */
4562 /* pointer into the output */
4563 char *str;
4564 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004565 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004566 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4567 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 PyObject *errorHandler = NULL;
4569 PyObject *exc = NULL;
4570 /* the following variable is used for caching string comparisons
4571 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4572 int known_errorHandler = -1;
4573
4574 /* allocate enough for a simple encoding without
4575 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004576 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004577 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004578 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004580 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004581 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 ressize = size;
4583
4584 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 /* can we encode this? */
4588 if (c<limit) {
4589 /* no overflow check, because we know that the space is enough */
4590 *str++ = (char)c;
4591 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004592 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 else {
4594 Py_ssize_t unicodepos = p-startp;
4595 Py_ssize_t requiredsize;
4596 PyObject *repunicode;
4597 Py_ssize_t repsize;
4598 Py_ssize_t newpos;
4599 Py_ssize_t respos;
4600 Py_UNICODE *uni2;
4601 /* startpos for collecting unencodable chars */
4602 const Py_UNICODE *collstart = p;
4603 const Py_UNICODE *collend = p;
4604 /* find all unecodable characters */
4605 while ((collend < endp) && ((*collend)>=limit))
4606 ++collend;
4607 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4608 if (known_errorHandler==-1) {
4609 if ((errors==NULL) || (!strcmp(errors, "strict")))
4610 known_errorHandler = 1;
4611 else if (!strcmp(errors, "replace"))
4612 known_errorHandler = 2;
4613 else if (!strcmp(errors, "ignore"))
4614 known_errorHandler = 3;
4615 else if (!strcmp(errors, "xmlcharrefreplace"))
4616 known_errorHandler = 4;
4617 else
4618 known_errorHandler = 0;
4619 }
4620 switch (known_errorHandler) {
4621 case 1: /* strict */
4622 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4623 goto onError;
4624 case 2: /* replace */
4625 while (collstart++<collend)
4626 *str++ = '?'; /* fall through */
4627 case 3: /* ignore */
4628 p = collend;
4629 break;
4630 case 4: /* xmlcharrefreplace */
4631 respos = str - PyBytes_AS_STRING(res);
4632 /* determine replacement size (temporarily (mis)uses p) */
4633 for (p = collstart, repsize = 0; p < collend; ++p) {
4634 if (*p<10)
4635 repsize += 2+1+1;
4636 else if (*p<100)
4637 repsize += 2+2+1;
4638 else if (*p<1000)
4639 repsize += 2+3+1;
4640 else if (*p<10000)
4641 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004642#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 else
4644 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004645#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 else if (*p<100000)
4647 repsize += 2+5+1;
4648 else if (*p<1000000)
4649 repsize += 2+6+1;
4650 else
4651 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004652#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004653 }
4654 requiredsize = respos+repsize+(endp-collend);
4655 if (requiredsize > ressize) {
4656 if (requiredsize<2*ressize)
4657 requiredsize = 2*ressize;
4658 if (_PyBytes_Resize(&res, requiredsize))
4659 goto onError;
4660 str = PyBytes_AS_STRING(res) + respos;
4661 ressize = requiredsize;
4662 }
4663 /* generate replacement (temporarily (mis)uses p) */
4664 for (p = collstart; p < collend; ++p) {
4665 str += sprintf(str, "&#%d;", (int)*p);
4666 }
4667 p = collend;
4668 break;
4669 default:
4670 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4671 encoding, reason, startp, size, &exc,
4672 collstart-startp, collend-startp, &newpos);
4673 if (repunicode == NULL)
4674 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004675 if (PyBytes_Check(repunicode)) {
4676 /* Directly copy bytes result to output. */
4677 repsize = PyBytes_Size(repunicode);
4678 if (repsize > 1) {
4679 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004680 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004681 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4682 Py_DECREF(repunicode);
4683 goto onError;
4684 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004685 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004686 ressize += repsize-1;
4687 }
4688 memcpy(str, PyBytes_AsString(repunicode), repsize);
4689 str += repsize;
4690 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004691 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004692 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004693 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 /* need more space? (at least enough for what we
4695 have+the replacement+the rest of the string, so
4696 we won't have to check space for encodable characters) */
4697 respos = str - PyBytes_AS_STRING(res);
4698 repsize = PyUnicode_GET_SIZE(repunicode);
4699 requiredsize = respos+repsize+(endp-collend);
4700 if (requiredsize > ressize) {
4701 if (requiredsize<2*ressize)
4702 requiredsize = 2*ressize;
4703 if (_PyBytes_Resize(&res, requiredsize)) {
4704 Py_DECREF(repunicode);
4705 goto onError;
4706 }
4707 str = PyBytes_AS_STRING(res) + respos;
4708 ressize = requiredsize;
4709 }
4710 /* check if there is anything unencodable in the replacement
4711 and copy it to the output */
4712 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4713 c = *uni2;
4714 if (c >= limit) {
4715 raise_encode_exception(&exc, encoding, startp, size,
4716 unicodepos, unicodepos+1, reason);
4717 Py_DECREF(repunicode);
4718 goto onError;
4719 }
4720 *str = (char)c;
4721 }
4722 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004723 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004725 }
4726 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004727 /* Resize if we allocated to much */
4728 size = str - PyBytes_AS_STRING(res);
4729 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004730 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004731 if (_PyBytes_Resize(&res, size) < 0)
4732 goto onError;
4733 }
4734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 Py_XDECREF(errorHandler);
4736 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004737 return res;
4738
4739 onError:
4740 Py_XDECREF(res);
4741 Py_XDECREF(errorHandler);
4742 Py_XDECREF(exc);
4743 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744}
4745
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004747 Py_ssize_t size,
4748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751}
4752
4753PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4754{
4755 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 PyErr_BadArgument();
4757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 }
4759 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004760 PyUnicode_GET_SIZE(unicode),
4761 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762}
4763
4764/* --- 7-bit ASCII Codec -------------------------------------------------- */
4765
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 Py_ssize_t size,
4768 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 PyUnicodeObject *v;
4772 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004773 Py_ssize_t startinpos;
4774 Py_ssize_t endinpos;
4775 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 const char *e;
4777 PyObject *errorHandler = NULL;
4778 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004779
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004781 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 Py_UNICODE r = *(unsigned char*)s;
4783 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004784 }
Tim Petersced69f82003-09-16 20:30:58 +00004785
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 v = _PyUnicode_New(size);
4787 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 e = s + size;
4793 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 register unsigned char c = (unsigned char)*s;
4795 if (c < 128) {
4796 *p++ = c;
4797 ++s;
4798 }
4799 else {
4800 startinpos = s-starts;
4801 endinpos = startinpos + 1;
4802 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4803 if (unicode_decode_call_errorhandler(
4804 errors, &errorHandler,
4805 "ascii", "ordinal not in range(128)",
4806 &starts, &e, &startinpos, &endinpos, &exc, &s,
4807 &v, &outpos, &p))
4808 goto onError;
4809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004811 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4813 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 Py_XDECREF(errorHandler);
4815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004817
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 Py_XDECREF(errorHandler);
4821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 return NULL;
4823}
4824
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004826 Py_ssize_t size,
4827 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830}
4831
4832PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4833{
4834 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 PyErr_BadArgument();
4836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 }
4838 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 PyUnicode_GET_SIZE(unicode),
4840 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841}
4842
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004843#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004844
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004845/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004846
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004847#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004848#define NEED_RETRY
4849#endif
4850
4851/* XXX This code is limited to "true" double-byte encodings, as
4852 a) it assumes an incomplete character consists of a single byte, and
4853 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004855
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when one of the
   following holds for the position CharPrev() reports for it: CharPrev()
   returns the byte itself, the previous position is not a lead byte, or
   the previous position lies exactly two bytes back. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4866
4867/*
4868 * Decode MBCS string into unicode object. If 'final' is set, converts
4869 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4870 */
4871static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 const char *s, /* MBCS string */
4873 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004874 int final,
4875 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004876{
4877 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004878 Py_ssize_t n;
4879 DWORD usize;
4880 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004881
4882 assert(size >= 0);
4883
Victor Stinner554f3f02010-06-16 23:33:54 +00004884 /* check and handle 'errors' arg */
4885 if (errors==NULL || strcmp(errors, "strict")==0)
4886 flags = MB_ERR_INVALID_CHARS;
4887 else if (strcmp(errors, "ignore")==0)
4888 flags = 0;
4889 else {
4890 PyErr_Format(PyExc_ValueError,
4891 "mbcs encoding does not support errors='%s'",
4892 errors);
4893 return -1;
4894 }
4895
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004896 /* Skip trailing lead-byte unless 'final' is set */
4897 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004899
4900 /* First get the size of the result */
4901 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004902 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4903 if (usize==0)
4904 goto mbcs_decode_error;
4905 } else
4906 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004907
4908 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004909 /* Create unicode object */
4910 *v = _PyUnicode_New(usize);
4911 if (*v == NULL)
4912 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004913 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004914 }
4915 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 /* Extend unicode object */
4917 n = PyUnicode_GET_SIZE(*v);
4918 if (_PyUnicode_Resize(v, n + usize) < 0)
4919 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004920 }
4921
4922 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004923 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004925 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4926 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004928 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004929 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004930
4931mbcs_decode_error:
4932 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4933 we raise a UnicodeDecodeError - else it is a 'generic'
4934 windows error
4935 */
4936 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4937 /* Ideally, we should get reason from FormatMessage - this
4938 is the Windows 2000 English version of the message
4939 */
4940 PyObject *exc = NULL;
4941 const char *reason = "No mapping for the Unicode character exists "
4942 "in the target multi-byte code page.";
4943 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4944 if (exc != NULL) {
4945 PyCodec_StrictErrors(exc);
4946 Py_DECREF(exc);
4947 }
4948 } else {
4949 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4950 }
4951 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004952}
4953
4954PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 Py_ssize_t size,
4956 const char *errors,
4957 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004958{
4959 PyUnicodeObject *v = NULL;
4960 int done;
4961
4962 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004964
4965#ifdef NEED_RETRY
4966 retry:
4967 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004968 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004969 else
4970#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004971 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004972
4973 if (done < 0) {
4974 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004976 }
4977
4978 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004980
4981#ifdef NEED_RETRY
4982 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 s += done;
4984 size -= done;
4985 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004986 }
4987#endif
4988
4989 return (PyObject *)v;
4990}
4991
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004992PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 Py_ssize_t size,
4994 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004995{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004996 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4997}
4998
4999/*
5000 * Convert unicode into string object (MBCS).
5001 * Returns 0 if succeed, -1 otherwise.
5002 */
5003static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005005 int size, /* size of unicode */
5006 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005007{
Victor Stinner554f3f02010-06-16 23:33:54 +00005008 BOOL usedDefaultChar = FALSE;
5009 BOOL *pusedDefaultChar;
5010 int mbcssize;
5011 Py_ssize_t n;
5012 PyObject *exc = NULL;
5013 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005014
5015 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005016
Victor Stinner554f3f02010-06-16 23:33:54 +00005017 /* check and handle 'errors' arg */
5018 if (errors==NULL || strcmp(errors, "strict")==0) {
5019 flags = WC_NO_BEST_FIT_CHARS;
5020 pusedDefaultChar = &usedDefaultChar;
5021 } else if (strcmp(errors, "replace")==0) {
5022 flags = 0;
5023 pusedDefaultChar = NULL;
5024 } else {
5025 PyErr_Format(PyExc_ValueError,
5026 "mbcs encoding does not support errors='%s'",
5027 errors);
5028 return -1;
5029 }
5030
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005031 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005032 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005033 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5034 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 if (mbcssize == 0) {
5036 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5037 return -1;
5038 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005039 /* If we used a default char, then we failed! */
5040 if (pusedDefaultChar && *pusedDefaultChar)
5041 goto mbcs_encode_error;
5042 } else {
5043 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005044 }
5045
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005046 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 /* Create string object */
5048 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5049 if (*repr == NULL)
5050 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005051 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005052 }
5053 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 /* Extend string object */
5055 n = PyBytes_Size(*repr);
5056 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5057 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005058 }
5059
5060 /* Do the conversion */
5061 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005063 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5064 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5066 return -1;
5067 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005068 if (pusedDefaultChar && *pusedDefaultChar)
5069 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005070 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005071 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005072
5073mbcs_encode_error:
5074 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5075 Py_XDECREF(exc);
5076 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005077}
5078
5079PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 Py_ssize_t size,
5081 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005082{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005083 PyObject *repr = NULL;
5084 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005085
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005086#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005088 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005089 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005090 else
5091#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005092 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005093
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005094 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 Py_XDECREF(repr);
5096 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005097 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005098
5099#ifdef NEED_RETRY
5100 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 p += INT_MAX;
5102 size -= INT_MAX;
5103 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005104 }
5105#endif
5106
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005107 return repr;
5108}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005109
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005110PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5111{
5112 if (!PyUnicode_Check(unicode)) {
5113 PyErr_BadArgument();
5114 return NULL;
5115 }
5116 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 PyUnicode_GET_SIZE(unicode),
5118 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005119}
5120
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005121#undef NEED_RETRY
5122
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005123#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005124
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125/* --- Character Mapping Codec -------------------------------------------- */
5126
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 Py_ssize_t size,
5129 PyObject *mapping,
5130 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005133 Py_ssize_t startinpos;
5134 Py_ssize_t endinpos;
5135 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005136 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 PyUnicodeObject *v;
5138 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005139 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 PyObject *errorHandler = NULL;
5141 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005142 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 /* Default to Latin-1 */
5146 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148
5149 v = _PyUnicode_New(size);
5150 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005155 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005156 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 mapstring = PyUnicode_AS_UNICODE(mapping);
5158 maplen = PyUnicode_GET_SIZE(mapping);
5159 while (s < e) {
5160 unsigned char ch = *s;
5161 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 if (ch < maplen)
5164 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 if (x == 0xfffe) {
5167 /* undefined mapping */
5168 outpos = p-PyUnicode_AS_UNICODE(v);
5169 startinpos = s-starts;
5170 endinpos = startinpos+1;
5171 if (unicode_decode_call_errorhandler(
5172 errors, &errorHandler,
5173 "charmap", "character maps to <undefined>",
5174 &starts, &e, &startinpos, &endinpos, &exc, &s,
5175 &v, &outpos, &p)) {
5176 goto onError;
5177 }
5178 continue;
5179 }
5180 *p++ = x;
5181 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005182 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005183 }
5184 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 while (s < e) {
5186 unsigned char ch = *s;
5187 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005188
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5190 w = PyLong_FromLong((long)ch);
5191 if (w == NULL)
5192 goto onError;
5193 x = PyObject_GetItem(mapping, w);
5194 Py_DECREF(w);
5195 if (x == NULL) {
5196 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5197 /* No mapping found means: mapping is undefined. */
5198 PyErr_Clear();
5199 x = Py_None;
5200 Py_INCREF(x);
5201 } else
5202 goto onError;
5203 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005204
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 /* Apply mapping */
5206 if (PyLong_Check(x)) {
5207 long value = PyLong_AS_LONG(x);
5208 if (value < 0 || value > 65535) {
5209 PyErr_SetString(PyExc_TypeError,
5210 "character mapping must be in range(65536)");
5211 Py_DECREF(x);
5212 goto onError;
5213 }
5214 *p++ = (Py_UNICODE)value;
5215 }
5216 else if (x == Py_None) {
5217 /* undefined mapping */
5218 outpos = p-PyUnicode_AS_UNICODE(v);
5219 startinpos = s-starts;
5220 endinpos = startinpos+1;
5221 if (unicode_decode_call_errorhandler(
5222 errors, &errorHandler,
5223 "charmap", "character maps to <undefined>",
5224 &starts, &e, &startinpos, &endinpos, &exc, &s,
5225 &v, &outpos, &p)) {
5226 Py_DECREF(x);
5227 goto onError;
5228 }
5229 Py_DECREF(x);
5230 continue;
5231 }
5232 else if (PyUnicode_Check(x)) {
5233 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005234
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 if (targetsize == 1)
5236 /* 1-1 mapping */
5237 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 else if (targetsize > 1) {
5240 /* 1-n mapping */
5241 if (targetsize > extrachars) {
5242 /* resize first */
5243 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5244 Py_ssize_t needed = (targetsize - extrachars) + \
5245 (targetsize << 2);
5246 extrachars += needed;
5247 /* XXX overflow detection missing */
5248 if (_PyUnicode_Resize(&v,
5249 PyUnicode_GET_SIZE(v) + needed) < 0) {
5250 Py_DECREF(x);
5251 goto onError;
5252 }
5253 p = PyUnicode_AS_UNICODE(v) + oldpos;
5254 }
5255 Py_UNICODE_COPY(p,
5256 PyUnicode_AS_UNICODE(x),
5257 targetsize);
5258 p += targetsize;
5259 extrachars -= targetsize;
5260 }
5261 /* 1-0 mapping: skip the character */
5262 }
5263 else {
5264 /* wrong return value */
5265 PyErr_SetString(PyExc_TypeError,
5266 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005267 Py_DECREF(x);
5268 goto onError;
5269 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 Py_DECREF(x);
5271 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 }
5274 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005277 Py_XDECREF(errorHandler);
5278 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005280
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005282 Py_XDECREF(errorHandler);
5283 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 Py_XDECREF(v);
5285 return NULL;
5286}
5287
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005288/* Charmap encoding: the lookup table */
5289
5290struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 PyObject_HEAD
5292 unsigned char level1[32];
5293 int count2, count3;
5294 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005295};
5296
5297static PyObject*
5298encoding_map_size(PyObject *obj, PyObject* args)
5299{
5300 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005301 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005303}
5304
5305static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005306 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 PyDoc_STR("Return the size (in bytes) of this object") },
5308 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005309};
5310
5311static void
5312encoding_map_dealloc(PyObject* o)
5313{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005314 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005315}
5316
5317static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005318 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 "EncodingMap", /*tp_name*/
5320 sizeof(struct encoding_map), /*tp_basicsize*/
5321 0, /*tp_itemsize*/
5322 /* methods */
5323 encoding_map_dealloc, /*tp_dealloc*/
5324 0, /*tp_print*/
5325 0, /*tp_getattr*/
5326 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005327 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 0, /*tp_repr*/
5329 0, /*tp_as_number*/
5330 0, /*tp_as_sequence*/
5331 0, /*tp_as_mapping*/
5332 0, /*tp_hash*/
5333 0, /*tp_call*/
5334 0, /*tp_str*/
5335 0, /*tp_getattro*/
5336 0, /*tp_setattro*/
5337 0, /*tp_as_buffer*/
5338 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5339 0, /*tp_doc*/
5340 0, /*tp_traverse*/
5341 0, /*tp_clear*/
5342 0, /*tp_richcompare*/
5343 0, /*tp_weaklistoffset*/
5344 0, /*tp_iter*/
5345 0, /*tp_iternext*/
5346 encoding_map_methods, /*tp_methods*/
5347 0, /*tp_members*/
5348 0, /*tp_getset*/
5349 0, /*tp_base*/
5350 0, /*tp_dict*/
5351 0, /*tp_descr_get*/
5352 0, /*tp_descr_set*/
5353 0, /*tp_dictoffset*/
5354 0, /*tp_init*/
5355 0, /*tp_alloc*/
5356 0, /*tp_new*/
5357 0, /*tp_free*/
5358 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005359};
5360
5361PyObject*
5362PyUnicode_BuildEncodingMap(PyObject* string)
5363{
5364 Py_UNICODE *decode;
5365 PyObject *result;
5366 struct encoding_map *mresult;
5367 int i;
5368 int need_dict = 0;
5369 unsigned char level1[32];
5370 unsigned char level2[512];
5371 unsigned char *mlevel1, *mlevel2, *mlevel3;
5372 int count2 = 0, count3 = 0;
5373
5374 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5375 PyErr_BadArgument();
5376 return NULL;
5377 }
5378 decode = PyUnicode_AS_UNICODE(string);
5379 memset(level1, 0xFF, sizeof level1);
5380 memset(level2, 0xFF, sizeof level2);
5381
5382 /* If there isn't a one-to-one mapping of NULL to \0,
5383 or if there are non-BMP characters, we need to use
5384 a mapping dictionary. */
5385 if (decode[0] != 0)
5386 need_dict = 1;
5387 for (i = 1; i < 256; i++) {
5388 int l1, l2;
5389 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005390#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005391 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005392#endif
5393 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005394 need_dict = 1;
5395 break;
5396 }
5397 if (decode[i] == 0xFFFE)
5398 /* unmapped character */
5399 continue;
5400 l1 = decode[i] >> 11;
5401 l2 = decode[i] >> 7;
5402 if (level1[l1] == 0xFF)
5403 level1[l1] = count2++;
5404 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005405 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005406 }
5407
5408 if (count2 >= 0xFF || count3 >= 0xFF)
5409 need_dict = 1;
5410
5411 if (need_dict) {
5412 PyObject *result = PyDict_New();
5413 PyObject *key, *value;
5414 if (!result)
5415 return NULL;
5416 for (i = 0; i < 256; i++) {
5417 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005418 key = PyLong_FromLong(decode[i]);
5419 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005420 if (!key || !value)
5421 goto failed1;
5422 if (PyDict_SetItem(result, key, value) == -1)
5423 goto failed1;
5424 Py_DECREF(key);
5425 Py_DECREF(value);
5426 }
5427 return result;
5428 failed1:
5429 Py_XDECREF(key);
5430 Py_XDECREF(value);
5431 Py_DECREF(result);
5432 return NULL;
5433 }
5434
5435 /* Create a three-level trie */
5436 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5437 16*count2 + 128*count3 - 1);
5438 if (!result)
5439 return PyErr_NoMemory();
5440 PyObject_Init(result, &EncodingMapType);
5441 mresult = (struct encoding_map*)result;
5442 mresult->count2 = count2;
5443 mresult->count3 = count3;
5444 mlevel1 = mresult->level1;
5445 mlevel2 = mresult->level23;
5446 mlevel3 = mresult->level23 + 16*count2;
5447 memcpy(mlevel1, level1, 32);
5448 memset(mlevel2, 0xFF, 16*count2);
5449 memset(mlevel3, 0, 128*count3);
5450 count3 = 0;
5451 for (i = 1; i < 256; i++) {
5452 int o1, o2, o3, i2, i3;
5453 if (decode[i] == 0xFFFE)
5454 /* unmapped character */
5455 continue;
5456 o1 = decode[i]>>11;
5457 o2 = (decode[i]>>7) & 0xF;
5458 i2 = 16*mlevel1[o1] + o2;
5459 if (mlevel2[i2] == 0xFF)
5460 mlevel2[i2] = count3++;
5461 o3 = decode[i] & 0x7F;
5462 i3 = 128*mlevel2[i2] + o3;
5463 mlevel3[i3] = i;
5464 }
5465 return result;
5466}
5467
5468static int
5469encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5470{
5471 struct encoding_map *map = (struct encoding_map*)mapping;
5472 int l1 = c>>11;
5473 int l2 = (c>>7) & 0xF;
5474 int l3 = c & 0x7F;
5475 int i;
5476
5477#ifdef Py_UNICODE_WIDE
5478 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005480 }
5481#endif
5482 if (c == 0)
5483 return 0;
5484 /* level 1*/
5485 i = map->level1[l1];
5486 if (i == 0xFF) {
5487 return -1;
5488 }
5489 /* level 2*/
5490 i = map->level23[16*i+l2];
5491 if (i == 0xFF) {
5492 return -1;
5493 }
5494 /* level 3 */
5495 i = map->level23[16*map->count2 + 128*i + l3];
5496 if (i == 0) {
5497 return -1;
5498 }
5499 return i;
5500}
5501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502/* Lookup the character ch in the mapping. If the character
5503 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005504 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506{
Christian Heimes217cfd12007-12-02 14:31:20 +00005507 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 PyObject *x;
5509
5510 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 x = PyObject_GetItem(mapping, w);
5513 Py_DECREF(w);
5514 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5516 /* No mapping found means: mapping is undefined. */
5517 PyErr_Clear();
5518 x = Py_None;
5519 Py_INCREF(x);
5520 return x;
5521 } else
5522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005524 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005526 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 long value = PyLong_AS_LONG(x);
5528 if (value < 0 || value > 255) {
5529 PyErr_SetString(PyExc_TypeError,
5530 "character mapping must be in range(256)");
5531 Py_DECREF(x);
5532 return NULL;
5533 }
5534 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005536 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 /* wrong return value */
5540 PyErr_Format(PyExc_TypeError,
5541 "character mapping must return integer, bytes or None, not %.400s",
5542 x->ob_type->tp_name);
5543 Py_DECREF(x);
5544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 }
5546}
5547
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005548static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005549charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005550{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005551 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5552 /* exponentially overallocate to minimize reallocations */
5553 if (requiredsize < 2*outsize)
5554 requiredsize = 2*outsize;
5555 if (_PyBytes_Resize(outobj, requiredsize))
5556 return -1;
5557 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005558}
5559
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the encoded byte(s) were written */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005564 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565 space is available. Return a new reference to the object that
5566 was put in the output buffer, or Py_None, if the mapping was undefined
5567 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005568 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005570charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005573 PyObject *rep;
5574 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005575 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576
Christian Heimes90aa7642007-12-19 02:45:37 +00005577 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005578 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005580 if (res == -1)
5581 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 if (outsize<requiredsize)
5583 if (charmapencode_resize(outobj, outpos, requiredsize))
5584 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005585 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 outstart[(*outpos)++] = (char)res;
5587 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005588 }
5589
5590 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005593 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 Py_DECREF(rep);
5595 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005596 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 if (PyLong_Check(rep)) {
5598 Py_ssize_t requiredsize = *outpos+1;
5599 if (outsize<requiredsize)
5600 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5601 Py_DECREF(rep);
5602 return enc_EXCEPTION;
5603 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005604 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 else {
5608 const char *repchars = PyBytes_AS_STRING(rep);
5609 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5610 Py_ssize_t requiredsize = *outpos+repsize;
5611 if (outsize<requiredsize)
5612 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5613 Py_DECREF(rep);
5614 return enc_EXCEPTION;
5615 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005616 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 memcpy(outstart + *outpos, repchars, repsize);
5618 *outpos += repsize;
5619 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005621 Py_DECREF(rep);
5622 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623}
5624
5625/* handle an error in PyUnicode_EncodeCharmap
5626 Return 0 on success, -1 on error */
5627static
5628int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005629 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005631 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005632 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633{
5634 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 Py_ssize_t repsize;
5636 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 Py_UNICODE *uni2;
5638 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005639 Py_ssize_t collstartpos = *inpos;
5640 Py_ssize_t collendpos = *inpos+1;
5641 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642 char *encoding = "charmap";
5643 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005644 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 /* find all unencodable characters */
5647 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005648 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005649 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 int res = encoding_map_lookup(p[collendpos], mapping);
5651 if (res != -1)
5652 break;
5653 ++collendpos;
5654 continue;
5655 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005656
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 rep = charmapencode_lookup(p[collendpos], mapping);
5658 if (rep==NULL)
5659 return -1;
5660 else if (rep!=Py_None) {
5661 Py_DECREF(rep);
5662 break;
5663 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005664 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005666 }
5667 /* cache callback name lookup
5668 * (if not done yet, i.e. it's the first error) */
5669 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 if ((errors==NULL) || (!strcmp(errors, "strict")))
5671 *known_errorHandler = 1;
5672 else if (!strcmp(errors, "replace"))
5673 *known_errorHandler = 2;
5674 else if (!strcmp(errors, "ignore"))
5675 *known_errorHandler = 3;
5676 else if (!strcmp(errors, "xmlcharrefreplace"))
5677 *known_errorHandler = 4;
5678 else
5679 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 }
5681 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005682 case 1: /* strict */
5683 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5684 return -1;
5685 case 2: /* replace */
5686 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 x = charmapencode_output('?', mapping, res, respos);
5688 if (x==enc_EXCEPTION) {
5689 return -1;
5690 }
5691 else if (x==enc_FAILED) {
5692 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5693 return -1;
5694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005695 }
5696 /* fall through */
5697 case 3: /* ignore */
5698 *inpos = collendpos;
5699 break;
5700 case 4: /* xmlcharrefreplace */
5701 /* generate replacement (temporarily (mis)uses p) */
5702 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 char buffer[2+29+1+1];
5704 char *cp;
5705 sprintf(buffer, "&#%d;", (int)p[collpos]);
5706 for (cp = buffer; *cp; ++cp) {
5707 x = charmapencode_output(*cp, mapping, res, respos);
5708 if (x==enc_EXCEPTION)
5709 return -1;
5710 else if (x==enc_FAILED) {
5711 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5712 return -1;
5713 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005714 }
5715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005716 *inpos = collendpos;
5717 break;
5718 default:
5719 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 encoding, reason, p, size, exceptionObject,
5721 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005722 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005724 if (PyBytes_Check(repunicode)) {
5725 /* Directly copy bytes result to output. */
5726 Py_ssize_t outsize = PyBytes_Size(*res);
5727 Py_ssize_t requiredsize;
5728 repsize = PyBytes_Size(repunicode);
5729 requiredsize = *respos + repsize;
5730 if (requiredsize > outsize)
5731 /* Make room for all additional bytes. */
5732 if (charmapencode_resize(res, respos, requiredsize)) {
5733 Py_DECREF(repunicode);
5734 return -1;
5735 }
5736 memcpy(PyBytes_AsString(*res) + *respos,
5737 PyBytes_AsString(repunicode), repsize);
5738 *respos += repsize;
5739 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005740 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005741 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005743 /* generate replacement */
5744 repsize = PyUnicode_GET_SIZE(repunicode);
5745 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 x = charmapencode_output(*uni2, mapping, res, respos);
5747 if (x==enc_EXCEPTION) {
5748 return -1;
5749 }
5750 else if (x==enc_FAILED) {
5751 Py_DECREF(repunicode);
5752 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5753 return -1;
5754 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005755 }
5756 *inpos = newpos;
5757 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 }
5759 return 0;
5760}
5761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 Py_ssize_t size,
5764 PyObject *mapping,
5765 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 /* output object */
5768 PyObject *res = NULL;
5769 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005770 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005772 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 PyObject *errorHandler = NULL;
5774 PyObject *exc = NULL;
5775 /* the following variable is used for caching string comparisons
5776 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5777 * 3=ignore, 4=xmlcharrefreplace */
5778 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779
5780 /* Default to Latin-1 */
5781 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 /* allocate enough for a simple encoding without
5785 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005786 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787 if (res == NULL)
5788 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005789 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 /* try to encode it */
5794 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5795 if (x==enc_EXCEPTION) /* error */
5796 goto onError;
5797 if (x==enc_FAILED) { /* unencodable character */
5798 if (charmap_encoding_error(p, size, &inpos, mapping,
5799 &exc,
5800 &known_errorHandler, &errorHandler, errors,
5801 &res, &respos)) {
5802 goto onError;
5803 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005804 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 else
5806 /* done with this character => adjust input position */
5807 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005811 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005812 if (_PyBytes_Resize(&res, respos) < 0)
5813 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005814
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815 Py_XDECREF(exc);
5816 Py_XDECREF(errorHandler);
5817 return res;
5818
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 Py_XDECREF(res);
5821 Py_XDECREF(exc);
5822 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 return NULL;
5824}
5825
5826PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828{
5829 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 PyErr_BadArgument();
5831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 }
5833 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 PyUnicode_GET_SIZE(unicode),
5835 mapping,
5836 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837}
5838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839/* create or adjust a UnicodeTranslateError */
5840static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 const Py_UNICODE *unicode, Py_ssize_t size,
5842 Py_ssize_t startpos, Py_ssize_t endpos,
5843 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005846 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 }
5849 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5851 goto onError;
5852 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5853 goto onError;
5854 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5855 goto onError;
5856 return;
5857 onError:
5858 Py_DECREF(*exceptionObject);
5859 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 }
5861}
5862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863/* raises a UnicodeTranslateError */
5864static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 const Py_UNICODE *unicode, Py_ssize_t size,
5866 Py_ssize_t startpos, Py_ssize_t endpos,
5867 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868{
5869 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873}
5874
5875/* error handling callback helper:
5876 build arguments, call the callback and check the arguments,
5877 put the result into newpos and return the replacement string, which
5878 has to be freed by the caller */
5879static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 PyObject **errorHandler,
5881 const char *reason,
5882 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5883 Py_ssize_t startpos, Py_ssize_t endpos,
5884 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005886 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005888 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 PyObject *restuple;
5890 PyObject *resunicode;
5891
5892 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896 }
5897
5898 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902
5903 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005908 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 Py_DECREF(restuple);
5910 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 }
5912 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 &resunicode, &i_newpos)) {
5914 Py_DECREF(restuple);
5915 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005917 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005919 else
5920 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005921 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5923 Py_DECREF(restuple);
5924 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005925 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 Py_INCREF(resunicode);
5927 Py_DECREF(restuple);
5928 return resunicode;
5929}
5930
5931/* Lookup the character ch in the mapping and put the result in result,
5932 which must be decrefed by the caller.
5933 Return 0 on success, -1 on error */
5934static
5935int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5936{
Christian Heimes217cfd12007-12-02 14:31:20 +00005937 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 PyObject *x;
5939
5940 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942 x = PyObject_GetItem(mapping, w);
5943 Py_DECREF(w);
5944 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5946 /* No mapping found means: use 1:1 mapping. */
5947 PyErr_Clear();
5948 *result = NULL;
5949 return 0;
5950 } else
5951 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005952 }
5953 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 *result = x;
5955 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005957 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 long value = PyLong_AS_LONG(x);
5959 long max = PyUnicode_GetMax();
5960 if (value < 0 || value > max) {
5961 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005962 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 Py_DECREF(x);
5964 return -1;
5965 }
5966 *result = x;
5967 return 0;
5968 }
5969 else if (PyUnicode_Check(x)) {
5970 *result = x;
5971 return 0;
5972 }
5973 else {
5974 /* wrong return value */
5975 PyErr_SetString(PyExc_TypeError,
5976 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005977 Py_DECREF(x);
5978 return -1;
5979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980}
5981/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 if not reallocate and adjust various state variables.
5983 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984static
Walter Dörwald4894c302003-10-24 14:25:28 +00005985int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005988 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005989 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 /* remember old output position */
5991 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5992 /* exponentially overallocate to minimize reallocations */
5993 if (requiredsize < 2 * oldsize)
5994 requiredsize = 2 * oldsize;
5995 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5996 return -1;
5997 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 }
5999 return 0;
6000}
6001/* lookup the character, put the result in the output string and adjust
6002 various state variables. Return a new reference to the object that
6003 was put in the output buffer in *result, or Py_None, if the mapping was
6004 undefined (in which case no character was written).
6005 The called must decref result.
6006 Return 0 on success, -1 on error. */
6007static
Walter Dörwald4894c302003-10-24 14:25:28 +00006008int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6010 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011{
Walter Dörwald4894c302003-10-24 14:25:28 +00006012 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 /* not found => default to 1:1 mapping */
6016 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017 }
6018 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006020 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 /* no overflow check, because we know that the space is enough */
6022 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 }
6024 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6026 if (repsize==1) {
6027 /* no overflow check, because we know that the space is enough */
6028 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6029 }
6030 else if (repsize!=0) {
6031 /* more than one character */
6032 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6033 (insize - (curinp-startinp)) +
6034 repsize - 1;
6035 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6036 return -1;
6037 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6038 *outp += repsize;
6039 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040 }
6041 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 return 0;
6044}
6045
6046PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 Py_ssize_t size,
6048 PyObject *mapping,
6049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051 /* output object */
6052 PyObject *res = NULL;
6053 /* pointers to the beginning and end+1 of input */
6054 const Py_UNICODE *startp = p;
6055 const Py_UNICODE *endp = p + size;
6056 /* pointer into the output */
6057 Py_UNICODE *str;
6058 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006059 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 char *reason = "character maps to <undefined>";
6061 PyObject *errorHandler = NULL;
6062 PyObject *exc = NULL;
6063 /* the following variable is used for caching string comparisons
6064 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6065 * 3=ignore, 4=xmlcharrefreplace */
6066 int known_errorHandler = -1;
6067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 PyErr_BadArgument();
6070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006072
6073 /* allocate enough for a simple 1:1 translation without
6074 replacements, if we need more, we'll resize */
6075 res = PyUnicode_FromUnicode(NULL, size);
6076 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 /* try to encode it */
6084 PyObject *x = NULL;
6085 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6086 Py_XDECREF(x);
6087 goto onError;
6088 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006089 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 if (x!=Py_None) /* it worked => adjust input pointer */
6091 ++p;
6092 else { /* untranslatable character */
6093 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6094 Py_ssize_t repsize;
6095 Py_ssize_t newpos;
6096 Py_UNICODE *uni2;
6097 /* startpos for collecting untranslatable chars */
6098 const Py_UNICODE *collstart = p;
6099 const Py_UNICODE *collend = p+1;
6100 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 /* find all untranslatable characters */
6103 while (collend < endp) {
6104 if (charmaptranslate_lookup(*collend, mapping, &x))
6105 goto onError;
6106 Py_XDECREF(x);
6107 if (x!=Py_None)
6108 break;
6109 ++collend;
6110 }
6111 /* cache callback name lookup
6112 * (if not done yet, i.e. it's the first error) */
6113 if (known_errorHandler==-1) {
6114 if ((errors==NULL) || (!strcmp(errors, "strict")))
6115 known_errorHandler = 1;
6116 else if (!strcmp(errors, "replace"))
6117 known_errorHandler = 2;
6118 else if (!strcmp(errors, "ignore"))
6119 known_errorHandler = 3;
6120 else if (!strcmp(errors, "xmlcharrefreplace"))
6121 known_errorHandler = 4;
6122 else
6123 known_errorHandler = 0;
6124 }
6125 switch (known_errorHandler) {
6126 case 1: /* strict */
6127 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006128 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 case 2: /* replace */
6130 /* No need to check for space, this is a 1:1 replacement */
6131 for (coll = collstart; coll<collend; ++coll)
6132 *str++ = '?';
6133 /* fall through */
6134 case 3: /* ignore */
6135 p = collend;
6136 break;
6137 case 4: /* xmlcharrefreplace */
6138 /* generate replacement (temporarily (mis)uses p) */
6139 for (p = collstart; p < collend; ++p) {
6140 char buffer[2+29+1+1];
6141 char *cp;
6142 sprintf(buffer, "&#%d;", (int)*p);
6143 if (charmaptranslate_makespace(&res, &str,
6144 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6145 goto onError;
6146 for (cp = buffer; *cp; ++cp)
6147 *str++ = *cp;
6148 }
6149 p = collend;
6150 break;
6151 default:
6152 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6153 reason, startp, size, &exc,
6154 collstart-startp, collend-startp, &newpos);
6155 if (repunicode == NULL)
6156 goto onError;
6157 /* generate replacement */
6158 repsize = PyUnicode_GET_SIZE(repunicode);
6159 if (charmaptranslate_makespace(&res, &str,
6160 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6161 Py_DECREF(repunicode);
6162 goto onError;
6163 }
6164 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6165 *str++ = *uni2;
6166 p = startp + newpos;
6167 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006169 }
6170 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 /* Resize if we allocated to much */
6172 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006173 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 if (PyUnicode_Resize(&res, respos) < 0)
6175 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 }
6177 Py_XDECREF(exc);
6178 Py_XDECREF(errorHandler);
6179 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006182 Py_XDECREF(res);
6183 Py_XDECREF(exc);
6184 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 return NULL;
6186}
6187
6188PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 PyObject *mapping,
6190 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191{
6192 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006193
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 str = PyUnicode_FromObject(str);
6195 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 PyUnicode_GET_SIZE(str),
6199 mapping,
6200 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 Py_DECREF(str);
6202 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 Py_XDECREF(str);
6206 return NULL;
6207}
Tim Petersced69f82003-09-16 20:30:58 +00006208
Guido van Rossum9e896b32000-04-05 20:11:21 +00006209/* --- Decimal Encoder ---------------------------------------------------- */
6210
6211int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 Py_ssize_t length,
6213 char *output,
6214 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006215{
6216 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217 PyObject *errorHandler = NULL;
6218 PyObject *exc = NULL;
6219 const char *encoding = "decimal";
6220 const char *reason = "invalid decimal Unicode string";
6221 /* the following variable is used for caching string comparisons
6222 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6223 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006224
6225 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 PyErr_BadArgument();
6227 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006228 }
6229
6230 p = s;
6231 end = s + length;
6232 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 register Py_UNICODE ch = *p;
6234 int decimal;
6235 PyObject *repunicode;
6236 Py_ssize_t repsize;
6237 Py_ssize_t newpos;
6238 Py_UNICODE *uni2;
6239 Py_UNICODE *collstart;
6240 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006241
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006243 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 ++p;
6245 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006246 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 decimal = Py_UNICODE_TODECIMAL(ch);
6248 if (decimal >= 0) {
6249 *output++ = '0' + decimal;
6250 ++p;
6251 continue;
6252 }
6253 if (0 < ch && ch < 256) {
6254 *output++ = (char)ch;
6255 ++p;
6256 continue;
6257 }
6258 /* All other characters are considered unencodable */
6259 collstart = p;
6260 collend = p+1;
6261 while (collend < end) {
6262 if ((0 < *collend && *collend < 256) ||
6263 !Py_UNICODE_ISSPACE(*collend) ||
6264 Py_UNICODE_TODECIMAL(*collend))
6265 break;
6266 }
6267 /* cache callback name lookup
6268 * (if not done yet, i.e. it's the first error) */
6269 if (known_errorHandler==-1) {
6270 if ((errors==NULL) || (!strcmp(errors, "strict")))
6271 known_errorHandler = 1;
6272 else if (!strcmp(errors, "replace"))
6273 known_errorHandler = 2;
6274 else if (!strcmp(errors, "ignore"))
6275 known_errorHandler = 3;
6276 else if (!strcmp(errors, "xmlcharrefreplace"))
6277 known_errorHandler = 4;
6278 else
6279 known_errorHandler = 0;
6280 }
6281 switch (known_errorHandler) {
6282 case 1: /* strict */
6283 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6284 goto onError;
6285 case 2: /* replace */
6286 for (p = collstart; p < collend; ++p)
6287 *output++ = '?';
6288 /* fall through */
6289 case 3: /* ignore */
6290 p = collend;
6291 break;
6292 case 4: /* xmlcharrefreplace */
6293 /* generate replacement (temporarily (mis)uses p) */
6294 for (p = collstart; p < collend; ++p)
6295 output += sprintf(output, "&#%d;", (int)*p);
6296 p = collend;
6297 break;
6298 default:
6299 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6300 encoding, reason, s, length, &exc,
6301 collstart-s, collend-s, &newpos);
6302 if (repunicode == NULL)
6303 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006304 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006305 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006306 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6307 Py_DECREF(repunicode);
6308 goto onError;
6309 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 /* generate replacement */
6311 repsize = PyUnicode_GET_SIZE(repunicode);
6312 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6313 Py_UNICODE ch = *uni2;
6314 if (Py_UNICODE_ISSPACE(ch))
6315 *output++ = ' ';
6316 else {
6317 decimal = Py_UNICODE_TODECIMAL(ch);
6318 if (decimal >= 0)
6319 *output++ = '0' + decimal;
6320 else if (0 < ch && ch < 256)
6321 *output++ = (char)ch;
6322 else {
6323 Py_DECREF(repunicode);
6324 raise_encode_exception(&exc, encoding,
6325 s, length, collstart-s, collend-s, reason);
6326 goto onError;
6327 }
6328 }
6329 }
6330 p = s + newpos;
6331 Py_DECREF(repunicode);
6332 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006333 }
6334 /* 0-terminate the output string */
6335 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336 Py_XDECREF(exc);
6337 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006338 return 0;
6339
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006341 Py_XDECREF(exc);
6342 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006343 return -1;
6344}
6345
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346/* --- Helpers ------------------------------------------------------------ */
6347
Eric Smith8c663262007-08-25 02:26:07 +00006348#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006349#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006350
Thomas Wouters477c8d52006-05-27 19:21:47 +00006351#include "stringlib/count.h"
6352#include "stringlib/find.h"
6353#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006354#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006355
Eric Smith5807c412008-05-11 21:00:57 +00006356#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006357#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006358#include "stringlib/localeutil.h"
6359
/* Helper macro to fix up start/end slice values in place.

   end is clipped to len when it is larger; a negative end or start is
   taken relative to len and clamped to 0 when still negative.  start and
   end must be modifiable lvalues.  The body is wrapped in
   do { ... } while (0) so that the macro expands to exactly one
   statement and is safe inside an unbraced if/else; it must be followed
   by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006374
Martin v. Löwis18e16552006-02-15 17:27:45 +00006375Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006376 PyObject *substr,
6377 Py_ssize_t start,
6378 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006380 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006381 PyUnicodeObject* str_obj;
6382 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006383
Thomas Wouters477c8d52006-05-27 19:21:47 +00006384 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6385 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006387 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6388 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 Py_DECREF(str_obj);
6390 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
Tim Petersced69f82003-09-16 20:30:58 +00006392
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006393 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006394 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006395 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6396 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006397 );
6398
6399 Py_DECREF(sub_obj);
6400 Py_DECREF(str_obj);
6401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 return result;
6403}
6404
Martin v. Löwis18e16552006-02-15 17:27:45 +00006405Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006406 PyObject *sub,
6407 Py_ssize_t start,
6408 Py_ssize_t end,
6409 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006411 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006412
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006414 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006416 sub = PyUnicode_FromObject(sub);
6417 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 Py_DECREF(str);
6419 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 }
Tim Petersced69f82003-09-16 20:30:58 +00006421
Thomas Wouters477c8d52006-05-27 19:21:47 +00006422 if (direction > 0)
6423 result = stringlib_find_slice(
6424 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6425 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6426 start, end
6427 );
6428 else
6429 result = stringlib_rfind_slice(
6430 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6431 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6432 start, end
6433 );
6434
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006436 Py_DECREF(sub);
6437
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 return result;
6439}
6440
Tim Petersced69f82003-09-16 20:30:58 +00006441static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 PyUnicodeObject *substring,
6444 Py_ssize_t start,
6445 Py_ssize_t end,
6446 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 if (substring->length == 0)
6449 return 1;
6450
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006451 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 end -= substring->length;
6453 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455
6456 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 if (Py_UNICODE_MATCH(self, end, substring))
6458 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 } else {
6460 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 }
6463
6464 return 0;
6465}
6466
Martin v. Löwis18e16552006-02-15 17:27:45 +00006467Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 PyObject *substr,
6469 Py_ssize_t start,
6470 Py_ssize_t end,
6471 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006473 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006474
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 str = PyUnicode_FromObject(str);
6476 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 substr = PyUnicode_FromObject(substr);
6479 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006480 Py_DECREF(str);
6481 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 }
Tim Petersced69f82003-09-16 20:30:58 +00006483
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 (PyUnicodeObject *)substr,
6486 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 Py_DECREF(str);
6488 Py_DECREF(substr);
6489 return result;
6490}
6491
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492/* Apply fixfct filter to the Unicode object self and return a
6493 reference to the modified object */
6494
Tim Petersced69f82003-09-16 20:30:58 +00006495static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498{
6499
6500 PyUnicodeObject *u;
6501
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006502 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006505
6506 Py_UNICODE_COPY(u->str, self->str, self->length);
6507
Tim Peters7a29bd52001-09-12 03:03:31 +00006508 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 /* fixfct should return TRUE if it modified the buffer. If
6510 FALSE, return a reference to the original buffer instead
6511 (to save space, not time) */
6512 Py_INCREF(self);
6513 Py_DECREF(u);
6514 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 }
6516 return (PyObject*) u;
6517}
6518
Tim Petersced69f82003-09-16 20:30:58 +00006519static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520int fixupper(PyUnicodeObject *self)
6521{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006522 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 Py_UNICODE *s = self->str;
6524 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006525
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006528
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 ch = Py_UNICODE_TOUPPER(*s);
6530 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 *s = ch;
6533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 s++;
6535 }
6536
6537 return status;
6538}
6539
Tim Petersced69f82003-09-16 20:30:58 +00006540static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541int fixlower(PyUnicodeObject *self)
6542{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006543 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 Py_UNICODE *s = self->str;
6545 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006546
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006549
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 ch = Py_UNICODE_TOLOWER(*s);
6551 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 *s = ch;
6554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 s++;
6556 }
6557
6558 return status;
6559}
6560
Tim Petersced69f82003-09-16 20:30:58 +00006561static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562int fixswapcase(PyUnicodeObject *self)
6563{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006564 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 Py_UNICODE *s = self->str;
6566 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006567
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 while (len-- > 0) {
6569 if (Py_UNICODE_ISUPPER(*s)) {
6570 *s = Py_UNICODE_TOLOWER(*s);
6571 status = 1;
6572 } else if (Py_UNICODE_ISLOWER(*s)) {
6573 *s = Py_UNICODE_TOUPPER(*s);
6574 status = 1;
6575 }
6576 s++;
6577 }
6578
6579 return status;
6580}
6581
Tim Petersced69f82003-09-16 20:30:58 +00006582static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583int fixcapitalize(PyUnicodeObject *self)
6584{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006585 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006586 Py_UNICODE *s = self->str;
6587 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006588
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006589 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006591 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 *s = Py_UNICODE_TOUPPER(*s);
6593 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006595 s++;
6596 while (--len > 0) {
6597 if (Py_UNICODE_ISUPPER(*s)) {
6598 *s = Py_UNICODE_TOLOWER(*s);
6599 status = 1;
6600 }
6601 s++;
6602 }
6603 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604}
6605
6606static
6607int fixtitle(PyUnicodeObject *self)
6608{
6609 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6610 register Py_UNICODE *e;
6611 int previous_is_cased;
6612
6613 /* Shortcut for single character strings */
6614 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6616 if (*p != ch) {
6617 *p = ch;
6618 return 1;
6619 }
6620 else
6621 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 }
Tim Petersced69f82003-09-16 20:30:58 +00006623
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 e = p + PyUnicode_GET_SIZE(self);
6625 previous_is_cased = 0;
6626 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006628
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 if (previous_is_cased)
6630 *p = Py_UNICODE_TOLOWER(ch);
6631 else
6632 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006633
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 if (Py_UNICODE_ISLOWER(ch) ||
6635 Py_UNICODE_ISUPPER(ch) ||
6636 Py_UNICODE_ISTITLE(ch))
6637 previous_is_cased = 1;
6638 else
6639 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 }
6641 return 1;
6642}
6643
Tim Peters8ce9f162004-08-27 01:49:32 +00006644PyObject *
6645PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646{
Skip Montanaro6543b452004-09-16 03:28:13 +00006647 const Py_UNICODE blank = ' ';
6648 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006649 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006650 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006651 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6652 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006653 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6654 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006655 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006656 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657
Tim Peters05eba1f2004-08-27 21:32:02 +00006658 fseq = PySequence_Fast(seq, "");
6659 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006660 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006661 }
6662
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006663 /* NOTE: the following code can't call back into Python code,
6664 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006665 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006666
Tim Peters05eba1f2004-08-27 21:32:02 +00006667 seqlen = PySequence_Fast_GET_SIZE(fseq);
6668 /* If empty sequence, return u"". */
6669 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006670 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6671 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006672 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006673 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006674 /* If singleton sequence with an exact Unicode, return that. */
6675 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 item = items[0];
6677 if (PyUnicode_CheckExact(item)) {
6678 Py_INCREF(item);
6679 res = (PyUnicodeObject *)item;
6680 goto Done;
6681 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006682 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006683 else {
6684 /* Set up sep and seplen */
6685 if (separator == NULL) {
6686 sep = &blank;
6687 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006688 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006689 else {
6690 if (!PyUnicode_Check(separator)) {
6691 PyErr_Format(PyExc_TypeError,
6692 "separator: expected str instance,"
6693 " %.80s found",
6694 Py_TYPE(separator)->tp_name);
6695 goto onError;
6696 }
6697 sep = PyUnicode_AS_UNICODE(separator);
6698 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006699 }
6700 }
6701
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006702 /* There are at least two things to join, or else we have a subclass
6703 * of str in the sequence.
6704 * Do a pre-pass to figure out the total amount of space we'll
6705 * need (sz), and see whether all argument are strings.
6706 */
6707 sz = 0;
6708 for (i = 0; i < seqlen; i++) {
6709 const Py_ssize_t old_sz = sz;
6710 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 if (!PyUnicode_Check(item)) {
6712 PyErr_Format(PyExc_TypeError,
6713 "sequence item %zd: expected str instance,"
6714 " %.80s found",
6715 i, Py_TYPE(item)->tp_name);
6716 goto onError;
6717 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006718 sz += PyUnicode_GET_SIZE(item);
6719 if (i != 0)
6720 sz += seplen;
6721 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6722 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006724 goto onError;
6725 }
6726 }
Tim Petersced69f82003-09-16 20:30:58 +00006727
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006728 res = _PyUnicode_New(sz);
6729 if (res == NULL)
6730 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006731
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006732 /* Catenate everything. */
6733 res_p = PyUnicode_AS_UNICODE(res);
6734 for (i = 0; i < seqlen; ++i) {
6735 Py_ssize_t itemlen;
6736 item = items[i];
6737 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 /* Copy item, and maybe the separator. */
6739 if (i) {
6740 Py_UNICODE_COPY(res_p, sep, seplen);
6741 res_p += seplen;
6742 }
6743 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6744 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006745 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006746
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006748 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 return (PyObject *)res;
6750
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006752 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006753 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 return NULL;
6755}
6756
Tim Petersced69f82003-09-16 20:30:58 +00006757static
6758PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 Py_ssize_t left,
6760 Py_ssize_t right,
6761 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762{
6763 PyUnicodeObject *u;
6764
6765 if (left < 0)
6766 left = 0;
6767 if (right < 0)
6768 right = 0;
6769
Tim Peters7a29bd52001-09-12 03:03:31 +00006770 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 Py_INCREF(self);
6772 return self;
6773 }
6774
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006775 if (left > PY_SSIZE_T_MAX - self->length ||
6776 right > PY_SSIZE_T_MAX - (left + self->length)) {
6777 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6778 return NULL;
6779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 u = _PyUnicode_New(left + self->length + right);
6781 if (u) {
6782 if (left)
6783 Py_UNICODE_FILL(u->str, fill, left);
6784 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6785 if (right)
6786 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6787 }
6788
6789 return u;
6790}
6791
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006792PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795
6796 string = PyUnicode_FromObject(string);
6797 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006800 list = stringlib_splitlines(
6801 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6802 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
6804 Py_DECREF(string);
6805 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806}
6807
Tim Petersced69f82003-09-16 20:30:58 +00006808static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 PyUnicodeObject *substring,
6811 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006814 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006817 return stringlib_split_whitespace(
6818 (PyObject*) self, self->str, self->length, maxcount
6819 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006821 return stringlib_split(
6822 (PyObject*) self, self->str, self->length,
6823 substring->str, substring->length,
6824 maxcount
6825 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826}
6827
Tim Petersced69f82003-09-16 20:30:58 +00006828static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006829PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 PyUnicodeObject *substring,
6831 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006832{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006833 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006834 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006835
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006836 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006837 return stringlib_rsplit_whitespace(
6838 (PyObject*) self, self->str, self->length, maxcount
6839 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006840
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006841 return stringlib_rsplit(
6842 (PyObject*) self, self->str, self->length,
6843 substring->str, substring->length,
6844 maxcount
6845 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006846}
6847
6848static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 PyUnicodeObject *str1,
6851 PyUnicodeObject *str2,
6852 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
6854 PyUnicodeObject *u;
6855
6856 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006858 else if (maxcount == 0 || self->length == 0)
6859 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860
Thomas Wouters477c8d52006-05-27 19:21:47 +00006861 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006862 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006863 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006864 if (str1->length == 0)
6865 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006866 if (str1->length == 1) {
6867 /* replace characters */
6868 Py_UNICODE u1, u2;
6869 if (!findchar(self->str, self->length, str1->str[0]))
6870 goto nothing;
6871 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6872 if (!u)
6873 return NULL;
6874 Py_UNICODE_COPY(u->str, self->str, self->length);
6875 u1 = str1->str[0];
6876 u2 = str2->str[0];
6877 for (i = 0; i < u->length; i++)
6878 if (u->str[i] == u1) {
6879 if (--maxcount < 0)
6880 break;
6881 u->str[i] = u2;
6882 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006884 i = stringlib_find(
6885 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006887 if (i < 0)
6888 goto nothing;
6889 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6890 if (!u)
6891 return NULL;
6892 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006893
6894 /* change everything in-place, starting with this one */
6895 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6896 i += str1->length;
6897
6898 while ( --maxcount > 0) {
6899 i = stringlib_find(self->str+i, self->length-i,
6900 str1->str, str1->length,
6901 i);
6902 if (i == -1)
6903 break;
6904 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6905 i += str1->length;
6906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006909
6910 Py_ssize_t n, i, j, e;
6911 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 Py_UNICODE *p;
6913
6914 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006915 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6916 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006917 if (n == 0)
6918 goto nothing;
6919 /* new_size = self->length + n * (str2->length - str1->length)); */
6920 delta = (str2->length - str1->length);
6921 if (delta == 0) {
6922 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006924 product = n * (str2->length - str1->length);
6925 if ((product / (str2->length - str1->length)) != n) {
6926 PyErr_SetString(PyExc_OverflowError,
6927 "replace string is too long");
6928 return NULL;
6929 }
6930 new_size = self->length + product;
6931 if (new_size < 0) {
6932 PyErr_SetString(PyExc_OverflowError,
6933 "replace string is too long");
6934 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 }
6936 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006937 u = _PyUnicode_New(new_size);
6938 if (!u)
6939 return NULL;
6940 i = 0;
6941 p = u->str;
6942 e = self->length - str1->length;
6943 if (str1->length > 0) {
6944 while (n-- > 0) {
6945 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006946 j = stringlib_find(self->str+i, self->length-i,
6947 str1->str, str1->length,
6948 i);
6949 if (j == -1)
6950 break;
6951 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006952 /* copy unchanged part [i:j] */
6953 Py_UNICODE_COPY(p, self->str+i, j-i);
6954 p += j - i;
6955 }
6956 /* copy substitution string */
6957 if (str2->length > 0) {
6958 Py_UNICODE_COPY(p, str2->str, str2->length);
6959 p += str2->length;
6960 }
6961 i = j + str1->length;
6962 }
6963 if (i < self->length)
6964 /* copy tail [i:] */
6965 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6966 } else {
6967 /* interleave */
6968 while (n > 0) {
6969 Py_UNICODE_COPY(p, str2->str, str2->length);
6970 p += str2->length;
6971 if (--n <= 0)
6972 break;
6973 *p++ = self->str[i++];
6974 }
6975 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006979
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00006981 /* nothing to replace; return original string (when possible) */
6982 if (PyUnicode_CheckExact(self)) {
6983 Py_INCREF(self);
6984 return (PyObject *) self;
6985 }
6986 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987}
6988
6989/* --- Unicode Object Methods --------------------------------------------- */
6990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006991PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993\n\
6994Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006995characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996
6997static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006998unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007000 return fixup(self, fixtitle);
7001}
7002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007003PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005\n\
7006Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007007have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008
7009static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007010unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 return fixup(self, fixcapitalize);
7013}
7014
#if 0
/* Disabled: this whole section is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize every word in place in the
   resulting list, then join the words again with the default separator.
   Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old entry, do it by hand. */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
7052
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007053/* Argument converter. Coerces to a single unicode character */
7054
7055static int
7056convert_uc(PyObject *obj, void *addr)
7057{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007058 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7059 PyObject *uniobj;
7060 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007061
Benjamin Peterson14339b62009-01-31 16:36:08 +00007062 uniobj = PyUnicode_FromObject(obj);
7063 if (uniobj == NULL) {
7064 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007065 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007066 return 0;
7067 }
7068 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7069 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007071 Py_DECREF(uniobj);
7072 return 0;
7073 }
7074 unistr = PyUnicode_AS_UNICODE(uniobj);
7075 *fillcharloc = unistr[0];
7076 Py_DECREF(uniobj);
7077 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007078}
7079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007080PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007083Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007084done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085
7086static PyObject *
7087unicode_center(PyUnicodeObject *self, PyObject *args)
7088{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007089 Py_ssize_t marg, left;
7090 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007091 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092
Thomas Woutersde017742006-02-16 19:34:37 +00007093 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094 return NULL;
7095
Tim Peters7a29bd52001-09-12 03:03:31 +00007096 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 Py_INCREF(self);
7098 return (PyObject*) self;
7099 }
7100
7101 marg = width - self->length;
7102 left = marg / 2 + (marg & width & 1);
7103
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007104 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105}
7106
Marc-André Lemburge5034372000-08-08 08:04:29 +00007107#if 0
7108
7109/* This code should go into some future Unicode collation support
7110 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007111 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007112
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007113/* speedy UTF-16 code point order comparison */
7114/* gleaned from: */
7115/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7116
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007117static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007118{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007119 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007120 0, 0, 0, 0, 0, 0, 0, 0,
7121 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007122 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007123};
7124
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125static int
7126unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7127{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007128 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007129
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 Py_UNICODE *s1 = str1->str;
7131 Py_UNICODE *s2 = str2->str;
7132
7133 len1 = str1->length;
7134 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007135
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007137 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007138
7139 c1 = *s1++;
7140 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007141
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 if (c1 > (1<<11) * 26)
7143 c1 += utf16Fixup[c1>>11];
7144 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007145 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007146 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007147
7148 if (c1 != c2)
7149 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007150
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007151 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 }
7153
7154 return (len1 < len2) ? -1 : (len1 != len2);
7155}
7156
Marc-André Lemburge5034372000-08-08 08:04:29 +00007157#else
7158
7159static int
7160unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7161{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007162 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007163
7164 Py_UNICODE *s1 = str1->str;
7165 Py_UNICODE *s2 = str2->str;
7166
7167 len1 = str1->length;
7168 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007169
Marc-André Lemburge5034372000-08-08 08:04:29 +00007170 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007171 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007172
Fredrik Lundh45714e92001-06-26 16:39:36 +00007173 c1 = *s1++;
7174 c2 = *s2++;
7175
7176 if (c1 != c2)
7177 return (c1 < c2) ? -1 : 1;
7178
Marc-André Lemburge5034372000-08-08 08:04:29 +00007179 len1--; len2--;
7180 }
7181
7182 return (len1 < len2) ? -1 : (len1 != len2);
7183}
7184
7185#endif
7186
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007190 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7191 return unicode_compare((PyUnicodeObject *)left,
7192 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007193 PyErr_Format(PyExc_TypeError,
7194 "Can't compare %.100s and %.100s",
7195 left->ob_type->tp_name,
7196 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197 return -1;
7198}
7199
Martin v. Löwis5b222132007-06-10 09:51:05 +00007200int
7201PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7202{
7203 int i;
7204 Py_UNICODE *id;
7205 assert(PyUnicode_Check(uni));
7206 id = PyUnicode_AS_UNICODE(uni);
7207 /* Compare Unicode string and source character set string */
7208 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 if (id[i] != str[i])
7210 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007211 /* This check keeps Python strings that end in '\0' from comparing equal
7212 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007213 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007215 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007217 return 0;
7218}
7219
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007220
Benjamin Peterson29060642009-01-31 22:14:21 +00007221#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007222 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007223
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007224PyObject *PyUnicode_RichCompare(PyObject *left,
7225 PyObject *right,
7226 int op)
7227{
7228 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007229
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007230 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7231 PyObject *v;
7232 if (((PyUnicodeObject *) left)->length !=
7233 ((PyUnicodeObject *) right)->length) {
7234 if (op == Py_EQ) {
7235 Py_INCREF(Py_False);
7236 return Py_False;
7237 }
7238 if (op == Py_NE) {
7239 Py_INCREF(Py_True);
7240 return Py_True;
7241 }
7242 }
7243 if (left == right)
7244 result = 0;
7245 else
7246 result = unicode_compare((PyUnicodeObject *)left,
7247 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007248
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007249 /* Convert the return value to a Boolean */
7250 switch (op) {
7251 case Py_EQ:
7252 v = TEST_COND(result == 0);
7253 break;
7254 case Py_NE:
7255 v = TEST_COND(result != 0);
7256 break;
7257 case Py_LE:
7258 v = TEST_COND(result <= 0);
7259 break;
7260 case Py_GE:
7261 v = TEST_COND(result >= 0);
7262 break;
7263 case Py_LT:
7264 v = TEST_COND(result == -1);
7265 break;
7266 case Py_GT:
7267 v = TEST_COND(result == 1);
7268 break;
7269 default:
7270 PyErr_BadArgument();
7271 return NULL;
7272 }
7273 Py_INCREF(v);
7274 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007275 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007276
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007277 Py_INCREF(Py_NotImplemented);
7278 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007279}
7280
Guido van Rossum403d68b2000-03-13 15:55:09 +00007281int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007282 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007283{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007284 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007285 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007286
7287 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007288 sub = PyUnicode_FromObject(element);
7289 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007290 PyErr_Format(PyExc_TypeError,
7291 "'in <string>' requires string as left operand, not %s",
7292 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007293 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007294 }
7295
Thomas Wouters477c8d52006-05-27 19:21:47 +00007296 str = PyUnicode_FromObject(container);
7297 if (!str) {
7298 Py_DECREF(sub);
7299 return -1;
7300 }
7301
7302 result = stringlib_contains_obj(str, sub);
7303
7304 Py_DECREF(str);
7305 Py_DECREF(sub);
7306
Guido van Rossum403d68b2000-03-13 15:55:09 +00007307 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007308}
7309
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310/* Concat to string or Unicode object giving a new Unicode object. */
7311
7312PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314{
7315 PyUnicodeObject *u = NULL, *v = NULL, *w;
7316
7317 /* Coerce the two arguments */
7318 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7319 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7322 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324
7325 /* Shortcuts */
7326 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 Py_DECREF(v);
7328 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007329 }
7330 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 Py_DECREF(u);
7332 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333 }
7334
7335 /* Concat the two Unicode strings */
7336 w = _PyUnicode_New(u->length + v->length);
7337 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 Py_UNICODE_COPY(w->str, u->str, u->length);
7340 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7341
7342 Py_DECREF(u);
7343 Py_DECREF(v);
7344 return (PyObject *)w;
7345
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 Py_XDECREF(u);
7348 Py_XDECREF(v);
7349 return NULL;
7350}
7351
Walter Dörwald1ab83302007-05-18 17:15:44 +00007352void
7353PyUnicode_Append(PyObject **pleft, PyObject *right)
7354{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007355 PyObject *new;
7356 if (*pleft == NULL)
7357 return;
7358 if (right == NULL || !PyUnicode_Check(*pleft)) {
7359 Py_DECREF(*pleft);
7360 *pleft = NULL;
7361 return;
7362 }
7363 new = PyUnicode_Concat(*pleft, right);
7364 Py_DECREF(*pleft);
7365 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007366}
7367
7368void
7369PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7370{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007371 PyUnicode_Append(pleft, right);
7372 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007373}
7374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007375PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007378Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007379string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007380interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381
7382static PyObject *
7383unicode_count(PyUnicodeObject *self, PyObject *args)
7384{
7385 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007386 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007387 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 PyObject *result;
7389
Guido van Rossumb8872e62000-05-09 14:14:27 +00007390 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392 return NULL;
7393
7394 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007395 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007398
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007399 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007400 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007401 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007402 substring->str, substring->length,
7403 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007404 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
7406 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007407
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408 return result;
7409}
7410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007411PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007412 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007414Encode S using the codec registered for encoding. Default encoding\n\
7415is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007416handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007417a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7418'xmlcharrefreplace' as well as any other name registered with\n\
7419codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420
7421static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007422unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007424 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425 char *encoding = NULL;
7426 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007427
Benjamin Peterson308d6372009-09-18 21:42:35 +00007428 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7429 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007431 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007432}
7433
Georg Brandl02524622010-12-02 18:06:51 +00007434PyDoc_STRVAR(transform__doc__,
7435 "S.transform(encoding, errors='strict') -> str\n\
7436\n\
7437Transform S using the codec registered for encoding. errors may be given\n\
7438to set a different error handling scheme.");
7439
7440static PyObject *
7441unicode_transform(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7442{
7443 static char *kwlist[] = {"encoding", "errors", 0};
7444 char *encoding = NULL;
7445 char *errors = NULL;
7446
7447 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:transform",
7448 kwlist, &encoding, &errors))
7449 return NULL;
7450 return PyUnicode_AsEncodedUnicode((PyObject *)self, encoding, errors);
7451}
7452
7453PyDoc_STRVAR(untransform__doc__,
7454 "S.untransform(encoding, errors='strict') -> str\n\
7455\n\
7456Reverse-transform S using the codec registered for encoding. errors may be\n\
7457given to set a different error handling scheme.");
7458
7459static PyObject *
7460unicode_untransform(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7461{
7462 static char *kwlist[] = {"encoding", "errors", 0};
7463 char *encoding = NULL;
7464 char *errors = NULL;
7465
7466 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:untransform",
7467 kwlist, &encoding, &errors))
7468 return NULL;
7469 return PyUnicode_AsDecodedUnicode((PyObject *)self, encoding, errors);
7470}
7471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007472PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474\n\
7475Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007476If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477
7478static PyObject*
7479unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7480{
7481 Py_UNICODE *e;
7482 Py_UNICODE *p;
7483 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007484 Py_UNICODE *qe;
7485 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 PyUnicodeObject *u;
7487 int tabsize = 8;
7488
7489 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491
Thomas Wouters7e474022000-07-16 12:04:32 +00007492 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007493 i = 0; /* chars up to and including most recent \n or \r */
7494 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7495 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 for (p = self->str; p < e; p++)
7497 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 if (tabsize > 0) {
7499 incr = tabsize - (j % tabsize); /* cannot overflow */
7500 if (j > PY_SSIZE_T_MAX - incr)
7501 goto overflow1;
7502 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007503 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 if (j > PY_SSIZE_T_MAX - 1)
7507 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 j++;
7509 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 if (i > PY_SSIZE_T_MAX - j)
7511 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007513 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 }
7515 }
7516
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007517 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007519
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 /* Second pass: create output string and fill it */
7521 u = _PyUnicode_New(i + j);
7522 if (!u)
7523 return NULL;
7524
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007525 j = 0; /* same as in first pass */
7526 q = u->str; /* next output char */
7527 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
7529 for (p = self->str; p < e; p++)
7530 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 if (tabsize > 0) {
7532 i = tabsize - (j % tabsize);
7533 j += i;
7534 while (i--) {
7535 if (q >= qe)
7536 goto overflow2;
7537 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007538 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007540 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 else {
7542 if (q >= qe)
7543 goto overflow2;
7544 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007545 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 if (*p == '\n' || *p == '\r')
7547 j = 0;
7548 }
7549
7550 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007551
7552 overflow2:
7553 Py_DECREF(u);
7554 overflow1:
7555 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557}
7558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561\n\
7562Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007563such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564arguments start and end are interpreted as in slice notation.\n\
7565\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567
7568static PyObject *
7569unicode_find(PyUnicodeObject *self, PyObject *args)
7570{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007571 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007572 Py_ssize_t start;
7573 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007574 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575
Christian Heimes9cd17752007-11-18 19:35:23 +00007576 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578
Thomas Wouters477c8d52006-05-27 19:21:47 +00007579 result = stringlib_find_slice(
7580 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7581 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7582 start, end
7583 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
7585 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007586
Christian Heimes217cfd12007-12-02 14:31:20 +00007587 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588}
7589
7590static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007591unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592{
7593 if (index < 0 || index >= self->length) {
7594 PyErr_SetString(PyExc_IndexError, "string index out of range");
7595 return NULL;
7596 }
7597
7598 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7599}
7600
Guido van Rossumc2504932007-09-18 19:42:40 +00007601/* Believe it or not, this produces the same value for ASCII strings
7602 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007603static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007604unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605{
Guido van Rossumc2504932007-09-18 19:42:40 +00007606 Py_ssize_t len;
7607 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007608 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007609
7610 if (self->hash != -1)
7611 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007612 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007613 p = self->str;
7614 x = *p << 7;
7615 while (--len >= 0)
7616 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007617 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007618 if (x == -1)
7619 x = -2;
7620 self->hash = x;
7621 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622}
7623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007624PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007627Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628
7629static PyObject *
7630unicode_index(PyUnicodeObject *self, PyObject *args)
7631{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007632 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007633 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007634 Py_ssize_t start;
7635 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636
Christian Heimes9cd17752007-11-18 19:35:23 +00007637 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639
Thomas Wouters477c8d52006-05-27 19:21:47 +00007640 result = stringlib_find_slice(
7641 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7642 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7643 start, end
7644 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645
7646 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007647
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648 if (result < 0) {
7649 PyErr_SetString(PyExc_ValueError, "substring not found");
7650 return NULL;
7651 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007652
Christian Heimes217cfd12007-12-02 14:31:20 +00007653 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654}
7655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007656PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007659Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007660at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661
7662static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007663unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664{
7665 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7666 register const Py_UNICODE *e;
7667 int cased;
7668
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669 /* Shortcut for single character strings */
7670 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007673 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007674 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007676
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 e = p + PyUnicode_GET_SIZE(self);
7678 cased = 0;
7679 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007681
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7683 return PyBool_FromLong(0);
7684 else if (!cased && Py_UNICODE_ISLOWER(ch))
7685 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007687 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688}
7689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007690PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007693Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007694at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695
7696static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007697unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698{
7699 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7700 register const Py_UNICODE *e;
7701 int cased;
7702
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 /* Shortcut for single character strings */
7704 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007707 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007708 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007710
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 e = p + PyUnicode_GET_SIZE(self);
7712 cased = 0;
7713 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007715
Benjamin Peterson29060642009-01-31 22:14:21 +00007716 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7717 return PyBool_FromLong(0);
7718 else if (!cased && Py_UNICODE_ISUPPER(ch))
7719 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007721 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722}
7723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007724PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007727Return True if S is a titlecased string and there is at least one\n\
7728character in S, i.e. upper- and titlecase characters may only\n\
7729follow uncased characters and lowercase characters only cased ones.\n\
7730Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
7732static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007733unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734{
7735 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7736 register const Py_UNICODE *e;
7737 int cased, previous_is_cased;
7738
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 /* Shortcut for single character strings */
7740 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7742 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007744 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007745 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007747
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 e = p + PyUnicode_GET_SIZE(self);
7749 cased = 0;
7750 previous_is_cased = 0;
7751 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007753
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7755 if (previous_is_cased)
7756 return PyBool_FromLong(0);
7757 previous_is_cased = 1;
7758 cased = 1;
7759 }
7760 else if (Py_UNICODE_ISLOWER(ch)) {
7761 if (!previous_is_cased)
7762 return PyBool_FromLong(0);
7763 previous_is_cased = 1;
7764 cased = 1;
7765 }
7766 else
7767 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007769 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770}
7771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007772PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007775Return True if all characters in S are whitespace\n\
7776and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777
7778static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007779unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780{
7781 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7782 register const Py_UNICODE *e;
7783
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 /* Shortcut for single character strings */
7785 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 Py_UNICODE_ISSPACE(*p))
7787 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007789 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007790 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007792
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 e = p + PyUnicode_GET_SIZE(self);
7794 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 if (!Py_UNICODE_ISSPACE(*p))
7796 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007798 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799}
7800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007801PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007803\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007804Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007805and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007806
7807static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007808unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007809{
7810 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7811 register const Py_UNICODE *e;
7812
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007813 /* Shortcut for single character strings */
7814 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 Py_UNICODE_ISALPHA(*p))
7816 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007817
7818 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007819 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007821
7822 e = p + PyUnicode_GET_SIZE(self);
7823 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 if (!Py_UNICODE_ISALPHA(*p))
7825 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007826 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007827 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007828}
7829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007830PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007832\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007833Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007834and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007835
7836static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007837unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007838{
7839 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7840 register const Py_UNICODE *e;
7841
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007842 /* Shortcut for single character strings */
7843 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 Py_UNICODE_ISALNUM(*p))
7845 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007846
7847 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007848 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007850
7851 e = p + PyUnicode_GET_SIZE(self);
7852 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 if (!Py_UNICODE_ISALNUM(*p))
7854 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007855 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007856 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007857}
7858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007859PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007862Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007863False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864
7865static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007866unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867{
7868 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7869 register const Py_UNICODE *e;
7870
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 /* Shortcut for single character strings */
7872 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 Py_UNICODE_ISDECIMAL(*p))
7874 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007876 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007877 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007879
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 e = p + PyUnicode_GET_SIZE(self);
7881 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 if (!Py_UNICODE_ISDECIMAL(*p))
7883 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007885 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886}
7887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007888PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007891Return True if all characters in S are digits\n\
7892and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893
7894static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007895unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896{
7897 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7898 register const Py_UNICODE *e;
7899
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900 /* Shortcut for single character strings */
7901 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 Py_UNICODE_ISDIGIT(*p))
7903 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007905 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007906 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007908
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 e = p + PyUnicode_GET_SIZE(self);
7910 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 if (!Py_UNICODE_ISDIGIT(*p))
7912 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007914 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915}
7916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007917PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007920Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007921False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922
7923static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007924unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925{
7926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7927 register const Py_UNICODE *e;
7928
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929 /* Shortcut for single character strings */
7930 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 Py_UNICODE_ISNUMERIC(*p))
7932 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007934 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007935 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007937
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938 e = p + PyUnicode_GET_SIZE(self);
7939 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 if (!Py_UNICODE_ISNUMERIC(*p))
7941 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007943 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944}
7945
Martin v. Löwis47383402007-08-15 07:32:56 +00007946int
7947PyUnicode_IsIdentifier(PyObject *self)
7948{
7949 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7950 register const Py_UNICODE *e;
7951
7952 /* Special case for empty strings */
7953 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007955
7956 /* PEP 3131 says that the first character must be in
7957 XID_Start and subsequent characters in XID_Continue,
7958 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007959 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007960 letters, digits, underscore). However, given the current
7961 definition of XID_Start and XID_Continue, it is sufficient
7962 to check just for these, except that _ must be allowed
7963 as starting an identifier. */
7964 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7965 return 0;
7966
7967 e = p + PyUnicode_GET_SIZE(self);
7968 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 if (!_PyUnicode_IsXidContinue(*p))
7970 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007971 }
7972 return 1;
7973}
7974
7975PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007977\n\
7978Return True if S is a valid identifier according\n\
7979to the language definition.");
7980
7981static PyObject*
7982unicode_isidentifier(PyObject *self)
7983{
7984 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7985}
7986
Georg Brandl559e5d72008-06-11 18:37:52 +00007987PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007989\n\
7990Return True if all characters in S are considered\n\
7991printable in repr() or S is empty, False otherwise.");
7992
7993static PyObject*
7994unicode_isprintable(PyObject *self)
7995{
7996 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7997 register const Py_UNICODE *e;
7998
7999 /* Shortcut for single character strings */
8000 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8001 Py_RETURN_TRUE;
8002 }
8003
8004 e = p + PyUnicode_GET_SIZE(self);
8005 for (; p < e; p++) {
8006 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8007 Py_RETURN_FALSE;
8008 }
8009 }
8010 Py_RETURN_TRUE;
8011}
8012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008013PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008014 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015\n\
8016Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008017iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018
8019static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008020unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008022 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023}
8024
Martin v. Löwis18e16552006-02-15 17:27:45 +00008025static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026unicode_length(PyUnicodeObject *self)
8027{
8028 return self->length;
8029}
8030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008031PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008034Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008035done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036
8037static PyObject *
8038unicode_ljust(PyUnicodeObject *self, PyObject *args)
8039{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008040 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008041 Py_UNICODE fillchar = ' ';
8042
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008043 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 return NULL;
8045
Tim Peters7a29bd52001-09-12 03:03:31 +00008046 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 Py_INCREF(self);
8048 return (PyObject*) self;
8049 }
8050
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008051 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052}
8053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008054PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008057Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058
8059static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008060unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 return fixup(self, fixlower);
8063}
8064
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above.
   Both the strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   "|O:" (three characters) skipped.  The argument is parenthesized so
   that any expression can be passed safely. */
#define STRIPNAME(i) (stripformat[(i)]+3)
8073
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008074/* externally visible for str.strip(unicode) */
8075PyObject *
8076_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8077{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008078 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8079 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8080 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8081 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8082 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008083
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008085
Benjamin Peterson14339b62009-01-31 16:36:08 +00008086 i = 0;
8087 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8089 i++;
8090 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008091 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008092
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093 j = len;
8094 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 do {
8096 j--;
8097 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8098 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008099 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008100
Benjamin Peterson14339b62009-01-31 16:36:08 +00008101 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 Py_INCREF(self);
8103 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104 }
8105 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008107}
8108
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109
8110static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008111do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008113 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8114 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008115
Benjamin Peterson14339b62009-01-31 16:36:08 +00008116 i = 0;
8117 if (striptype != RIGHTSTRIP) {
8118 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8119 i++;
8120 }
8121 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008122
Benjamin Peterson14339b62009-01-31 16:36:08 +00008123 j = len;
8124 if (striptype != LEFTSTRIP) {
8125 do {
8126 j--;
8127 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8128 j++;
8129 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008130
Benjamin Peterson14339b62009-01-31 16:36:08 +00008131 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8132 Py_INCREF(self);
8133 return (PyObject*)self;
8134 }
8135 else
8136 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137}
8138
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008139
8140static PyObject *
8141do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008143 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008144
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8146 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008147
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 if (sep != NULL && sep != Py_None) {
8149 if (PyUnicode_Check(sep))
8150 return _PyUnicode_XStrip(self, striptype, sep);
8151 else {
8152 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 "%s arg must be None or str",
8154 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008155 return NULL;
8156 }
8157 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008158
Benjamin Peterson14339b62009-01-31 16:36:08 +00008159 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008160}
8161
8162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008163PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008165\n\
8166Return a copy of the string S with leading and trailing\n\
8167whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008168If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008169
8170static PyObject *
8171unicode_strip(PyUnicodeObject *self, PyObject *args)
8172{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 if (PyTuple_GET_SIZE(args) == 0)
8174 return do_strip(self, BOTHSTRIP); /* Common case */
8175 else
8176 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008177}
8178
8179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008180PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008182\n\
8183Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008184If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008185
8186static PyObject *
8187unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8188{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008189 if (PyTuple_GET_SIZE(args) == 0)
8190 return do_strip(self, LEFTSTRIP); /* Common case */
8191 else
8192 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008193}
8194
8195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008196PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008198\n\
8199Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008200If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008201
8202static PyObject *
8203unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8204{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008205 if (PyTuple_GET_SIZE(args) == 0)
8206 return do_strip(self, RIGHTSTRIP); /* Common case */
8207 else
8208 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008209}
8210
8211
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008213unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214{
8215 PyUnicodeObject *u;
8216 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008217 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008218 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219
Georg Brandl222de0f2009-04-12 12:01:50 +00008220 if (len < 1) {
8221 Py_INCREF(unicode_empty);
8222 return (PyObject *)unicode_empty;
8223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224
Tim Peters7a29bd52001-09-12 03:03:31 +00008225 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 /* no repeat, return original string */
8227 Py_INCREF(str);
8228 return (PyObject*) str;
8229 }
Tim Peters8f422462000-09-09 06:13:41 +00008230
8231 /* ensure # of chars needed doesn't overflow int and # of bytes
8232 * needed doesn't overflow size_t
8233 */
8234 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008235 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008236 PyErr_SetString(PyExc_OverflowError,
8237 "repeated string is too long");
8238 return NULL;
8239 }
8240 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8241 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8242 PyErr_SetString(PyExc_OverflowError,
8243 "repeated string is too long");
8244 return NULL;
8245 }
8246 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247 if (!u)
8248 return NULL;
8249
8250 p = u->str;
8251
Georg Brandl222de0f2009-04-12 12:01:50 +00008252 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008253 Py_UNICODE_FILL(p, str->str[0], len);
8254 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008255 Py_ssize_t done = str->length; /* number of characters copied this far */
8256 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008258 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008259 Py_UNICODE_COPY(p+done, p, n);
8260 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 }
8263
8264 return (PyObject*) u;
8265}
8266
8267PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 PyObject *subobj,
8269 PyObject *replobj,
8270 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271{
8272 PyObject *self;
8273 PyObject *str1;
8274 PyObject *str2;
8275 PyObject *result;
8276
8277 self = PyUnicode_FromObject(obj);
8278 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 str1 = PyUnicode_FromObject(subobj);
8281 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 Py_DECREF(self);
8283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 }
8285 str2 = PyUnicode_FromObject(replobj);
8286 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 Py_DECREF(self);
8288 Py_DECREF(str1);
8289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 }
Tim Petersced69f82003-09-16 20:30:58 +00008291 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 (PyUnicodeObject *)str1,
8293 (PyUnicodeObject *)str2,
8294 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 Py_DECREF(self);
8296 Py_DECREF(str1);
8297 Py_DECREF(str2);
8298 return result;
8299}
8300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008301PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008302 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303\n\
8304Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008305old replaced by new. If the optional argument count is\n\
8306given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307
8308static PyObject*
8309unicode_replace(PyUnicodeObject *self, PyObject *args)
8310{
8311 PyUnicodeObject *str1;
8312 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008313 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 PyObject *result;
8315
Martin v. Löwis18e16552006-02-15 17:27:45 +00008316 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 return NULL;
8318 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8319 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008322 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 Py_DECREF(str1);
8324 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326
8327 result = replace(self, str1, str2, maxcount);
8328
8329 Py_DECREF(str1);
8330 Py_DECREF(str2);
8331 return result;
8332}
8333
8334static
8335PyObject *unicode_repr(PyObject *unicode)
8336{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008337 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008338 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008339 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8340 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8341
8342 /* XXX(nnorwitz): rather than over-allocating, it would be
8343 better to choose a different scheme. Perhaps scan the
8344 first N-chars of the string and allocate based on that size.
8345 */
8346 /* Initial allocation is based on the longest-possible unichr
8347 escape.
8348
8349 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8350 unichr, so in this case it's the longest unichr escape. In
8351 narrow (UTF-16) builds this is five chars per source unichr
8352 since there are two unichrs in the surrogate pair, so in narrow
8353 (UTF-16) builds it's not the longest unichr escape.
8354
8355 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8356 so in the narrow (UTF-16) build case it's the longest unichr
8357 escape.
8358 */
8359
Walter Dörwald1ab83302007-05-18 17:15:44 +00008360 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008362#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008364#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008366#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008368 if (repr == NULL)
8369 return NULL;
8370
Walter Dörwald1ab83302007-05-18 17:15:44 +00008371 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008372
8373 /* Add quote */
8374 *p++ = (findchar(s, size, '\'') &&
8375 !findchar(s, size, '"')) ? '"' : '\'';
8376 while (size-- > 0) {
8377 Py_UNICODE ch = *s++;
8378
8379 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008380 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008381 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008382 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008383 continue;
8384 }
8385
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008387 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008388 *p++ = '\\';
8389 *p++ = 't';
8390 }
8391 else if (ch == '\n') {
8392 *p++ = '\\';
8393 *p++ = 'n';
8394 }
8395 else if (ch == '\r') {
8396 *p++ = '\\';
8397 *p++ = 'r';
8398 }
8399
8400 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008401 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008402 *p++ = '\\';
8403 *p++ = 'x';
8404 *p++ = hexdigits[(ch >> 4) & 0x000F];
8405 *p++ = hexdigits[ch & 0x000F];
8406 }
8407
Georg Brandl559e5d72008-06-11 18:37:52 +00008408 /* Copy ASCII characters as-is */
8409 else if (ch < 0x7F) {
8410 *p++ = ch;
8411 }
8412
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008414 else {
8415 Py_UCS4 ucs = ch;
8416
8417#ifndef Py_UNICODE_WIDE
8418 Py_UNICODE ch2 = 0;
8419 /* Get code point from surrogate pair */
8420 if (size > 0) {
8421 ch2 = *s;
8422 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008426 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008427 size--;
8428 }
8429 }
8430#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008431 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008432 (categories Z* and C* except ASCII space)
8433 */
8434 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8435 /* Map 8-bit characters to '\xhh' */
8436 if (ucs <= 0xff) {
8437 *p++ = '\\';
8438 *p++ = 'x';
8439 *p++ = hexdigits[(ch >> 4) & 0x000F];
8440 *p++ = hexdigits[ch & 0x000F];
8441 }
8442 /* Map 21-bit characters to '\U00xxxxxx' */
8443 else if (ucs >= 0x10000) {
8444 *p++ = '\\';
8445 *p++ = 'U';
8446 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8447 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8448 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8449 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8450 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8451 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8452 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8453 *p++ = hexdigits[ucs & 0x0000000F];
8454 }
8455 /* Map 16-bit characters to '\uxxxx' */
8456 else {
8457 *p++ = '\\';
8458 *p++ = 'u';
8459 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8460 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8461 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8462 *p++ = hexdigits[ucs & 0x000F];
8463 }
8464 }
8465 /* Copy characters as-is */
8466 else {
8467 *p++ = ch;
8468#ifndef Py_UNICODE_WIDE
8469 if (ucs >= 0x10000)
8470 *p++ = ch2;
8471#endif
8472 }
8473 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008474 }
8475 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008476 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008477
8478 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008479 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008480 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481}
8482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008483PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485\n\
8486Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008487such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008488arguments start and end are interpreted as in slice notation.\n\
8489\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008490Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491
8492static PyObject *
8493unicode_rfind(PyUnicodeObject *self, PyObject *args)
8494{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008495 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008496 Py_ssize_t start;
8497 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008498 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499
Christian Heimes9cd17752007-11-18 19:35:23 +00008500 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502
Thomas Wouters477c8d52006-05-27 19:21:47 +00008503 result = stringlib_rfind_slice(
8504 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8505 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8506 start, end
8507 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508
8509 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008510
Christian Heimes217cfd12007-12-02 14:31:20 +00008511 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512}
8513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008514PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008517Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518
8519static PyObject *
8520unicode_rindex(PyUnicodeObject *self, PyObject *args)
8521{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008522 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008523 Py_ssize_t start;
8524 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008525 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526
Christian Heimes9cd17752007-11-18 19:35:23 +00008527 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529
Thomas Wouters477c8d52006-05-27 19:21:47 +00008530 result = stringlib_rfind_slice(
8531 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8532 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8533 start, end
8534 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535
8536 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008537
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 if (result < 0) {
8539 PyErr_SetString(PyExc_ValueError, "substring not found");
8540 return NULL;
8541 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008542 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543}
8544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008545PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008548Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008549done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550
8551static PyObject *
8552unicode_rjust(PyUnicodeObject *self, PyObject *args)
8553{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008554 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008555 Py_UNICODE fillchar = ' ';
8556
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008557 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558 return NULL;
8559
Tim Peters7a29bd52001-09-12 03:03:31 +00008560 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 Py_INCREF(self);
8562 return (PyObject*) self;
8563 }
8564
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008565 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566}
8567
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 PyObject *sep,
8570 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571{
8572 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008573
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 s = PyUnicode_FromObject(s);
8575 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008576 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 if (sep != NULL) {
8578 sep = PyUnicode_FromObject(sep);
8579 if (sep == NULL) {
8580 Py_DECREF(s);
8581 return NULL;
8582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 }
8584
8585 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8586
8587 Py_DECREF(s);
8588 Py_XDECREF(sep);
8589 return result;
8590}
8591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008592PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594\n\
8595Return a list of the words in S, using sep as the\n\
8596delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008597splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008598whitespace string is a separator and empty strings are\n\
8599removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600
8601static PyObject*
8602unicode_split(PyUnicodeObject *self, PyObject *args)
8603{
8604 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008605 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606
Martin v. Löwis18e16552006-02-15 17:27:45 +00008607 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 return NULL;
8609
8610 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616}
8617
Thomas Wouters477c8d52006-05-27 19:21:47 +00008618PyObject *
8619PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8620{
8621 PyObject* str_obj;
8622 PyObject* sep_obj;
8623 PyObject* out;
8624
8625 str_obj = PyUnicode_FromObject(str_in);
8626 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008628 sep_obj = PyUnicode_FromObject(sep_in);
8629 if (!sep_obj) {
8630 Py_DECREF(str_obj);
8631 return NULL;
8632 }
8633
8634 out = stringlib_partition(
8635 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8636 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8637 );
8638
8639 Py_DECREF(sep_obj);
8640 Py_DECREF(str_obj);
8641
8642 return out;
8643}
8644
8645
8646PyObject *
8647PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8648{
8649 PyObject* str_obj;
8650 PyObject* sep_obj;
8651 PyObject* out;
8652
8653 str_obj = PyUnicode_FromObject(str_in);
8654 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008656 sep_obj = PyUnicode_FromObject(sep_in);
8657 if (!sep_obj) {
8658 Py_DECREF(str_obj);
8659 return NULL;
8660 }
8661
8662 out = stringlib_rpartition(
8663 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8664 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8665 );
8666
8667 Py_DECREF(sep_obj);
8668 Py_DECREF(str_obj);
8669
8670 return out;
8671}
8672
8673PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008675\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008676Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008677the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008678found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008679
8680static PyObject*
8681unicode_partition(PyUnicodeObject *self, PyObject *separator)
8682{
8683 return PyUnicode_Partition((PyObject *)self, separator);
8684}
8685
8686PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008687 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008688\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008689Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008690the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008691separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008692
8693static PyObject*
8694unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8695{
8696 return PyUnicode_RPartition((PyObject *)self, separator);
8697}
8698
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008699PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 PyObject *sep,
8701 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008702{
8703 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008704
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008705 s = PyUnicode_FromObject(s);
8706 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008707 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 if (sep != NULL) {
8709 sep = PyUnicode_FromObject(sep);
8710 if (sep == NULL) {
8711 Py_DECREF(s);
8712 return NULL;
8713 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008714 }
8715
8716 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8717
8718 Py_DECREF(s);
8719 Py_XDECREF(sep);
8720 return result;
8721}
8722
8723PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008725\n\
8726Return a list of the words in S, using sep as the\n\
8727delimiter string, starting at the end of the string and\n\
8728working to the front. If maxsplit is given, at most maxsplit\n\
8729splits are done. If sep is not specified, any whitespace string\n\
8730is a separator.");
8731
8732static PyObject*
8733unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8734{
8735 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008736 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008737
Martin v. Löwis18e16552006-02-15 17:27:45 +00008738 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008739 return NULL;
8740
8741 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008743 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008745 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008747}
8748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008749PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751\n\
8752Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008753Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008754is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755
8756static PyObject*
8757unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8758{
Guido van Rossum86662912000-04-11 15:38:46 +00008759 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760
Guido van Rossum86662912000-04-11 15:38:46 +00008761 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762 return NULL;
8763
Guido van Rossum86662912000-04-11 15:38:46 +00008764 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765}
8766
8767static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008768PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769{
Walter Dörwald346737f2007-05-31 10:44:43 +00008770 if (PyUnicode_CheckExact(self)) {
8771 Py_INCREF(self);
8772 return self;
8773 } else
8774 /* Subtype -- return genuine unicode string with the same value. */
8775 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8776 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777}
8778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008779PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781\n\
8782Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008783and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784
8785static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008786unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 return fixup(self, fixswapcase);
8789}
8790
Georg Brandlceee0772007-11-27 23:48:05 +00008791PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008793\n\
8794Return a translation table usable for str.translate().\n\
8795If there is only one argument, it must be a dictionary mapping Unicode\n\
8796ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008797Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008798If there are two arguments, they must be strings of equal length, and\n\
8799in the resulting dictionary, each character in x will be mapped to the\n\
8800character at the same position in y. If there is a third argument, it\n\
8801must be a string, whose characters will be mapped to None in the result.");
8802
8803static PyObject*
8804unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8805{
8806 PyObject *x, *y = NULL, *z = NULL;
8807 PyObject *new = NULL, *key, *value;
8808 Py_ssize_t i = 0;
8809 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008810
Georg Brandlceee0772007-11-27 23:48:05 +00008811 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8812 return NULL;
8813 new = PyDict_New();
8814 if (!new)
8815 return NULL;
8816 if (y != NULL) {
8817 /* x must be a string too, of equal length */
8818 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8819 if (!PyUnicode_Check(x)) {
8820 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8821 "be a string if there is a second argument");
8822 goto err;
8823 }
8824 if (PyUnicode_GET_SIZE(x) != ylen) {
8825 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8826 "arguments must have equal length");
8827 goto err;
8828 }
8829 /* create entries for translating chars in x to those in y */
8830 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008831 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8832 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008833 if (!key || !value)
8834 goto err;
8835 res = PyDict_SetItem(new, key, value);
8836 Py_DECREF(key);
8837 Py_DECREF(value);
8838 if (res < 0)
8839 goto err;
8840 }
8841 /* create entries for deleting chars in z */
8842 if (z != NULL) {
8843 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008844 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008845 if (!key)
8846 goto err;
8847 res = PyDict_SetItem(new, key, Py_None);
8848 Py_DECREF(key);
8849 if (res < 0)
8850 goto err;
8851 }
8852 }
8853 } else {
8854 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008855 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008856 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8857 "to maketrans it must be a dict");
8858 goto err;
8859 }
8860 /* copy entries into the new dict, converting string keys to int keys */
8861 while (PyDict_Next(x, &i, &key, &value)) {
8862 if (PyUnicode_Check(key)) {
8863 /* convert string keys to integer keys */
8864 PyObject *newkey;
8865 if (PyUnicode_GET_SIZE(key) != 1) {
8866 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8867 "table must be of length 1");
8868 goto err;
8869 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008870 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008871 if (!newkey)
8872 goto err;
8873 res = PyDict_SetItem(new, newkey, value);
8874 Py_DECREF(newkey);
8875 if (res < 0)
8876 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008877 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008878 /* just keep integer keys */
8879 if (PyDict_SetItem(new, key, value) < 0)
8880 goto err;
8881 } else {
8882 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8883 "be strings or integers");
8884 goto err;
8885 }
8886 }
8887 }
8888 return new;
8889 err:
8890 Py_DECREF(new);
8891 return NULL;
8892}
8893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008894PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896\n\
8897Return a copy of the string S, where all characters have been mapped\n\
8898through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008899Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008900Unmapped characters are left untouched. Characters mapped to None\n\
8901are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902
8903static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008904unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905{
Georg Brandlceee0772007-11-27 23:48:05 +00008906 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907}
8908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008909PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008912Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913
8914static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008915unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917 return fixup(self, fixupper);
8918}
8919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008920PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008923Pad a numeric string S with zeros on the left, to fill a field\n\
8924of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925
8926static PyObject *
8927unicode_zfill(PyUnicodeObject *self, PyObject *args)
8928{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008929 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 PyUnicodeObject *u;
8931
Martin v. Löwis18e16552006-02-15 17:27:45 +00008932 Py_ssize_t width;
8933 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 return NULL;
8935
8936 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008937 if (PyUnicode_CheckExact(self)) {
8938 Py_INCREF(self);
8939 return (PyObject*) self;
8940 }
8941 else
8942 return PyUnicode_FromUnicode(
8943 PyUnicode_AS_UNICODE(self),
8944 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 }
8947
8948 fill = width - self->length;
8949
8950 u = pad(self, fill, 0, '0');
8951
Walter Dörwald068325e2002-04-15 13:36:47 +00008952 if (u == NULL)
8953 return NULL;
8954
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 if (u->str[fill] == '+' || u->str[fill] == '-') {
8956 /* move sign to beginning of string */
8957 u->str[0] = u->str[fill];
8958 u->str[fill] = '0';
8959 }
8960
8961 return (PyObject*) u;
8962}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963
#if 0
/* Debugging aid (compiled out): report the current value of numfree
   as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;
    return PyLong_FromLong(count);
}
#endif
8971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008972PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008975Return True if S starts with the specified prefix, False otherwise.\n\
8976With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008977With optional end, stop comparing S at that position.\n\
8978prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
8980static PyObject *
8981unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008984 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008986 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008987 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008988 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008990 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8992 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008993 if (PyTuple_Check(subobj)) {
8994 Py_ssize_t i;
8995 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8996 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008998 if (substring == NULL)
8999 return NULL;
9000 result = tailmatch(self, substring, start, end, -1);
9001 Py_DECREF(substring);
9002 if (result) {
9003 Py_RETURN_TRUE;
9004 }
9005 }
9006 /* nothing matched */
9007 Py_RETURN_FALSE;
9008 }
9009 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009012 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009014 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015}
9016
9017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009018PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009021Return True if S ends with the specified suffix, False otherwise.\n\
9022With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009023With optional end, stop comparing S at that position.\n\
9024suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025
9026static PyObject *
9027unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009030 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009032 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009033 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009034 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009036 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9038 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009039 if (PyTuple_Check(subobj)) {
9040 Py_ssize_t i;
9041 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9042 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009044 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009046 result = tailmatch(self, substring, start, end, +1);
9047 Py_DECREF(substring);
9048 if (result) {
9049 Py_RETURN_TRUE;
9050 }
9051 }
9052 Py_RETURN_FALSE;
9053 }
9054 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009058 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009060 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061}
9062
Eric Smith8c663262007-08-25 02:26:07 +00009063#include "stringlib/string_format.h"
9064
9065PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009067\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009068Return a formatted version of S, using substitutions from args and kwargs.\n\
9069The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009070
Eric Smith27bbca62010-11-04 17:06:58 +00009071PyDoc_STRVAR(format_map__doc__,
9072 "S.format_map(mapping) -> str\n\
9073\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009074Return a formatted version of S, using substitutions from mapping.\n\
9075The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009076
Eric Smith4a7d76d2008-05-30 18:10:19 +00009077static PyObject *
9078unicode__format__(PyObject* self, PyObject* args)
9079{
9080 PyObject *format_spec;
9081
9082 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9083 return NULL;
9084
9085 return _PyUnicode_FormatAdvanced(self,
9086 PyUnicode_AS_UNICODE(format_spec),
9087 PyUnicode_GET_SIZE(format_spec));
9088}
9089
Eric Smith8c663262007-08-25 02:26:07 +00009090PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009092\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009093Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009094
9095static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009096unicode__sizeof__(PyUnicodeObject *v)
9097{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009098 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9099 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009100}
9101
9102PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009104
9105static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009106unicode_getnewargs(PyUnicodeObject *v)
9107{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009108 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009109}
9110
9111
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112static PyMethodDef unicode_methods[] = {
9113
9114 /* Order is according to common usage: often used methods should
9115 appear first, since lookup is done sequentially. */
9116
Georg Brandl02524622010-12-02 18:06:51 +00009117 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS,
9118 encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009119 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9120 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009121 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009122 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9123 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9124 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9125 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9126 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9127 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9128 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009129 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009130 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9131 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9132 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009133 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009134 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9135 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9136 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009137 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009138 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009139 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009140 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009141 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9142 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9143 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9144 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9145 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9146 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9147 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9148 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9149 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9150 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9151 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9152 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9153 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9154 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009155 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009156 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009157 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009158 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009159 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009160 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009161 {"maketrans", (PyCFunction) unicode_maketrans,
9162 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandl02524622010-12-02 18:06:51 +00009163 {"transform", (PyCFunction) unicode_transform, METH_VARARGS | METH_KEYWORDS,
9164 transform__doc__},
9165 {"untransform", (PyCFunction) unicode_untransform, METH_VARARGS | METH_KEYWORDS,
9166 untransform__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009167 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009168#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009169 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170#endif
9171
9172#if 0
9173 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009174 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175#endif
9176
Benjamin Peterson14339b62009-01-31 16:36:08 +00009177 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178 {NULL, NULL}
9179};
9180
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009181static PyObject *
9182unicode_mod(PyObject *v, PyObject *w)
9183{
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 if (!PyUnicode_Check(v)) {
9185 Py_INCREF(Py_NotImplemented);
9186 return Py_NotImplemented;
9187 }
9188 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009189}
9190
9191static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009192 0, /*nb_add*/
9193 0, /*nb_subtract*/
9194 0, /*nb_multiply*/
9195 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009196};
9197
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009199 (lenfunc) unicode_length, /* sq_length */
9200 PyUnicode_Concat, /* sq_concat */
9201 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9202 (ssizeargfunc) unicode_getitem, /* sq_item */
9203 0, /* sq_slice */
9204 0, /* sq_ass_item */
9205 0, /* sq_ass_slice */
9206 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207};
9208
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009209static PyObject*
9210unicode_subscript(PyUnicodeObject* self, PyObject* item)
9211{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009212 if (PyIndex_Check(item)) {
9213 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009214 if (i == -1 && PyErr_Occurred())
9215 return NULL;
9216 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009217 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009218 return unicode_getitem(self, i);
9219 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009220 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009221 Py_UNICODE* source_buf;
9222 Py_UNICODE* result_buf;
9223 PyObject* result;
9224
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009225 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009227 return NULL;
9228 }
9229
9230 if (slicelength <= 0) {
9231 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009232 } else if (start == 0 && step == 1 && slicelength == self->length &&
9233 PyUnicode_CheckExact(self)) {
9234 Py_INCREF(self);
9235 return (PyObject *)self;
9236 } else if (step == 1) {
9237 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009238 } else {
9239 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009240 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9241 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009242
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 if (result_buf == NULL)
9244 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009245
9246 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9247 result_buf[i] = source_buf[cur];
9248 }
Tim Petersced69f82003-09-16 20:30:58 +00009249
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009250 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009251 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009252 return result;
9253 }
9254 } else {
9255 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9256 return NULL;
9257 }
9258}
9259
9260static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009261 (lenfunc)unicode_length, /* mp_length */
9262 (binaryfunc)unicode_subscript, /* mp_subscript */
9263 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009264};
9265
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266
Guido van Rossumd57fd912000-03-10 22:53:23 +00009267/* Helpers for PyUnicode_Format() */
9268
9269static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009270getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009272 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009274 (*p_argidx)++;
9275 if (arglen < 0)
9276 return args;
9277 else
9278 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279 }
9280 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009281 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282 return NULL;
9283}
9284
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009285/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009287static PyObject *
9288formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009290 char *p;
9291 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009293
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294 x = PyFloat_AsDouble(v);
9295 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009296 return NULL;
9297
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009299 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009300
Eric Smith0923d1d2009-04-16 20:16:10 +00009301 p = PyOS_double_to_string(x, type, prec,
9302 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009303 if (p == NULL)
9304 return NULL;
9305 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009306 PyMem_Free(p);
9307 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308}
9309
Tim Peters38fd5b62000-09-21 05:43:11 +00009310static PyObject*
9311formatlong(PyObject *val, int flags, int prec, int type)
9312{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009313 char *buf;
9314 int len;
9315 PyObject *str; /* temporary string object. */
9316 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009317
Benjamin Peterson14339b62009-01-31 16:36:08 +00009318 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9319 if (!str)
9320 return NULL;
9321 result = PyUnicode_FromStringAndSize(buf, len);
9322 Py_DECREF(str);
9323 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009324}
9325
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326static int
9327formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009328 size_t buflen,
9329 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009331 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009332 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009333 if (PyUnicode_GET_SIZE(v) == 1) {
9334 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9335 buf[1] = '\0';
9336 return 1;
9337 }
9338#ifndef Py_UNICODE_WIDE
9339 if (PyUnicode_GET_SIZE(v) == 2) {
9340 /* Decode a valid surrogate pair */
9341 int c0 = PyUnicode_AS_UNICODE(v)[0];
9342 int c1 = PyUnicode_AS_UNICODE(v)[1];
9343 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9344 0xDC00 <= c1 && c1 <= 0xDFFF) {
9345 buf[0] = c0;
9346 buf[1] = c1;
9347 buf[2] = '\0';
9348 return 2;
9349 }
9350 }
9351#endif
9352 goto onError;
9353 }
9354 else {
9355 /* Integer input truncated to a character */
9356 long x;
9357 x = PyLong_AsLong(v);
9358 if (x == -1 && PyErr_Occurred())
9359 goto onError;
9360
9361 if (x < 0 || x > 0x10ffff) {
9362 PyErr_SetString(PyExc_OverflowError,
9363 "%c arg not in range(0x110000)");
9364 return -1;
9365 }
9366
9367#ifndef Py_UNICODE_WIDE
9368 if (x > 0xffff) {
9369 x -= 0x10000;
9370 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9371 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9372 return 2;
9373 }
9374#endif
9375 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009376 buf[1] = '\0';
9377 return 1;
9378 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009379
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009381 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009383 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384}
9385
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009386/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009387 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009388*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009389#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009390
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393{
9394 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009395 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396 int args_owned = 0;
9397 PyUnicodeObject *result = NULL;
9398 PyObject *dict = NULL;
9399 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009400
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009402 PyErr_BadInternalCall();
9403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404 }
9405 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009406 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408 fmt = PyUnicode_AS_UNICODE(uformat);
9409 fmtcnt = PyUnicode_GET_SIZE(uformat);
9410
9411 reslen = rescnt = fmtcnt + 100;
9412 result = _PyUnicode_New(reslen);
9413 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415 res = PyUnicode_AS_UNICODE(result);
9416
9417 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 arglen = PyTuple_Size(args);
9419 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420 }
9421 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009422 arglen = -1;
9423 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009425 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009426 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428
9429 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009430 if (*fmt != '%') {
9431 if (--rescnt < 0) {
9432 rescnt = fmtcnt + 100;
9433 reslen += rescnt;
9434 if (_PyUnicode_Resize(&result, reslen) < 0)
9435 goto onError;
9436 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9437 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009438 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009439 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009440 }
9441 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009442 /* Got a format specifier */
9443 int flags = 0;
9444 Py_ssize_t width = -1;
9445 int prec = -1;
9446 Py_UNICODE c = '\0';
9447 Py_UNICODE fill;
9448 int isnumok;
9449 PyObject *v = NULL;
9450 PyObject *temp = NULL;
9451 Py_UNICODE *pbuf;
9452 Py_UNICODE sign;
9453 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009454 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 fmt++;
9457 if (*fmt == '(') {
9458 Py_UNICODE *keystart;
9459 Py_ssize_t keylen;
9460 PyObject *key;
9461 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009462
Benjamin Peterson29060642009-01-31 22:14:21 +00009463 if (dict == NULL) {
9464 PyErr_SetString(PyExc_TypeError,
9465 "format requires a mapping");
9466 goto onError;
9467 }
9468 ++fmt;
9469 --fmtcnt;
9470 keystart = fmt;
9471 /* Skip over balanced parentheses */
9472 while (pcount > 0 && --fmtcnt >= 0) {
9473 if (*fmt == ')')
9474 --pcount;
9475 else if (*fmt == '(')
9476 ++pcount;
9477 fmt++;
9478 }
9479 keylen = fmt - keystart - 1;
9480 if (fmtcnt < 0 || pcount > 0) {
9481 PyErr_SetString(PyExc_ValueError,
9482 "incomplete format key");
9483 goto onError;
9484 }
9485#if 0
9486 /* keys are converted to strings using UTF-8 and
9487 then looked up since Python uses strings to hold
9488 variables names etc. in its namespaces and we
9489 wouldn't want to break common idioms. */
9490 key = PyUnicode_EncodeUTF8(keystart,
9491 keylen,
9492 NULL);
9493#else
9494 key = PyUnicode_FromUnicode(keystart, keylen);
9495#endif
9496 if (key == NULL)
9497 goto onError;
9498 if (args_owned) {
9499 Py_DECREF(args);
9500 args_owned = 0;
9501 }
9502 args = PyObject_GetItem(dict, key);
9503 Py_DECREF(key);
9504 if (args == NULL) {
9505 goto onError;
9506 }
9507 args_owned = 1;
9508 arglen = -1;
9509 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009510 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 while (--fmtcnt >= 0) {
9512 switch (c = *fmt++) {
9513 case '-': flags |= F_LJUST; continue;
9514 case '+': flags |= F_SIGN; continue;
9515 case ' ': flags |= F_BLANK; continue;
9516 case '#': flags |= F_ALT; continue;
9517 case '0': flags |= F_ZERO; continue;
9518 }
9519 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009520 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 if (c == '*') {
9522 v = getnextarg(args, arglen, &argidx);
9523 if (v == NULL)
9524 goto onError;
9525 if (!PyLong_Check(v)) {
9526 PyErr_SetString(PyExc_TypeError,
9527 "* wants int");
9528 goto onError;
9529 }
9530 width = PyLong_AsLong(v);
9531 if (width == -1 && PyErr_Occurred())
9532 goto onError;
9533 if (width < 0) {
9534 flags |= F_LJUST;
9535 width = -width;
9536 }
9537 if (--fmtcnt >= 0)
9538 c = *fmt++;
9539 }
9540 else if (c >= '0' && c <= '9') {
9541 width = c - '0';
9542 while (--fmtcnt >= 0) {
9543 c = *fmt++;
9544 if (c < '0' || c > '9')
9545 break;
9546 if ((width*10) / 10 != width) {
9547 PyErr_SetString(PyExc_ValueError,
9548 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009549 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 }
9551 width = width*10 + (c - '0');
9552 }
9553 }
9554 if (c == '.') {
9555 prec = 0;
9556 if (--fmtcnt >= 0)
9557 c = *fmt++;
9558 if (c == '*') {
9559 v = getnextarg(args, arglen, &argidx);
9560 if (v == NULL)
9561 goto onError;
9562 if (!PyLong_Check(v)) {
9563 PyErr_SetString(PyExc_TypeError,
9564 "* wants int");
9565 goto onError;
9566 }
9567 prec = PyLong_AsLong(v);
9568 if (prec == -1 && PyErr_Occurred())
9569 goto onError;
9570 if (prec < 0)
9571 prec = 0;
9572 if (--fmtcnt >= 0)
9573 c = *fmt++;
9574 }
9575 else if (c >= '0' && c <= '9') {
9576 prec = c - '0';
9577 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009578 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 if (c < '0' || c > '9')
9580 break;
9581 if ((prec*10) / 10 != prec) {
9582 PyErr_SetString(PyExc_ValueError,
9583 "prec too big");
9584 goto onError;
9585 }
9586 prec = prec*10 + (c - '0');
9587 }
9588 }
9589 } /* prec */
9590 if (fmtcnt >= 0) {
9591 if (c == 'h' || c == 'l' || c == 'L') {
9592 if (--fmtcnt >= 0)
9593 c = *fmt++;
9594 }
9595 }
9596 if (fmtcnt < 0) {
9597 PyErr_SetString(PyExc_ValueError,
9598 "incomplete format");
9599 goto onError;
9600 }
9601 if (c != '%') {
9602 v = getnextarg(args, arglen, &argidx);
9603 if (v == NULL)
9604 goto onError;
9605 }
9606 sign = 0;
9607 fill = ' ';
9608 switch (c) {
9609
9610 case '%':
9611 pbuf = formatbuf;
9612 /* presume that buffer length is at least 1 */
9613 pbuf[0] = '%';
9614 len = 1;
9615 break;
9616
9617 case 's':
9618 case 'r':
9619 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009620 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 temp = v;
9622 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009623 }
9624 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 if (c == 's')
9626 temp = PyObject_Str(v);
9627 else if (c == 'r')
9628 temp = PyObject_Repr(v);
9629 else
9630 temp = PyObject_ASCII(v);
9631 if (temp == NULL)
9632 goto onError;
9633 if (PyUnicode_Check(temp))
9634 /* nothing to do */;
9635 else {
9636 Py_DECREF(temp);
9637 PyErr_SetString(PyExc_TypeError,
9638 "%s argument has non-string str()");
9639 goto onError;
9640 }
9641 }
9642 pbuf = PyUnicode_AS_UNICODE(temp);
9643 len = PyUnicode_GET_SIZE(temp);
9644 if (prec >= 0 && len > prec)
9645 len = prec;
9646 break;
9647
9648 case 'i':
9649 case 'd':
9650 case 'u':
9651 case 'o':
9652 case 'x':
9653 case 'X':
9654 if (c == 'i')
9655 c = 'd';
9656 isnumok = 0;
9657 if (PyNumber_Check(v)) {
9658 PyObject *iobj=NULL;
9659
9660 if (PyLong_Check(v)) {
9661 iobj = v;
9662 Py_INCREF(iobj);
9663 }
9664 else {
9665 iobj = PyNumber_Long(v);
9666 }
9667 if (iobj!=NULL) {
9668 if (PyLong_Check(iobj)) {
9669 isnumok = 1;
9670 temp = formatlong(iobj, flags, prec, c);
9671 Py_DECREF(iobj);
9672 if (!temp)
9673 goto onError;
9674 pbuf = PyUnicode_AS_UNICODE(temp);
9675 len = PyUnicode_GET_SIZE(temp);
9676 sign = 1;
9677 }
9678 else {
9679 Py_DECREF(iobj);
9680 }
9681 }
9682 }
9683 if (!isnumok) {
9684 PyErr_Format(PyExc_TypeError,
9685 "%%%c format: a number is required, "
9686 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9687 goto onError;
9688 }
9689 if (flags & F_ZERO)
9690 fill = '0';
9691 break;
9692
9693 case 'e':
9694 case 'E':
9695 case 'f':
9696 case 'F':
9697 case 'g':
9698 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009699 temp = formatfloat(v, flags, prec, c);
9700 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009701 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009702 pbuf = PyUnicode_AS_UNICODE(temp);
9703 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009704 sign = 1;
9705 if (flags & F_ZERO)
9706 fill = '0';
9707 break;
9708
9709 case 'c':
9710 pbuf = formatbuf;
9711 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9712 if (len < 0)
9713 goto onError;
9714 break;
9715
9716 default:
9717 PyErr_Format(PyExc_ValueError,
9718 "unsupported format character '%c' (0x%x) "
9719 "at index %zd",
9720 (31<=c && c<=126) ? (char)c : '?',
9721 (int)c,
9722 (Py_ssize_t)(fmt - 1 -
9723 PyUnicode_AS_UNICODE(uformat)));
9724 goto onError;
9725 }
9726 if (sign) {
9727 if (*pbuf == '-' || *pbuf == '+') {
9728 sign = *pbuf++;
9729 len--;
9730 }
9731 else if (flags & F_SIGN)
9732 sign = '+';
9733 else if (flags & F_BLANK)
9734 sign = ' ';
9735 else
9736 sign = 0;
9737 }
9738 if (width < len)
9739 width = len;
9740 if (rescnt - (sign != 0) < width) {
9741 reslen -= rescnt;
9742 rescnt = width + fmtcnt + 100;
9743 reslen += rescnt;
9744 if (reslen < 0) {
9745 Py_XDECREF(temp);
9746 PyErr_NoMemory();
9747 goto onError;
9748 }
9749 if (_PyUnicode_Resize(&result, reslen) < 0) {
9750 Py_XDECREF(temp);
9751 goto onError;
9752 }
9753 res = PyUnicode_AS_UNICODE(result)
9754 + reslen - rescnt;
9755 }
9756 if (sign) {
9757 if (fill != ' ')
9758 *res++ = sign;
9759 rescnt--;
9760 if (width > len)
9761 width--;
9762 }
9763 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9764 assert(pbuf[0] == '0');
9765 assert(pbuf[1] == c);
9766 if (fill != ' ') {
9767 *res++ = *pbuf++;
9768 *res++ = *pbuf++;
9769 }
9770 rescnt -= 2;
9771 width -= 2;
9772 if (width < 0)
9773 width = 0;
9774 len -= 2;
9775 }
9776 if (width > len && !(flags & F_LJUST)) {
9777 do {
9778 --rescnt;
9779 *res++ = fill;
9780 } while (--width > len);
9781 }
9782 if (fill == ' ') {
9783 if (sign)
9784 *res++ = sign;
9785 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9786 assert(pbuf[0] == '0');
9787 assert(pbuf[1] == c);
9788 *res++ = *pbuf++;
9789 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009790 }
9791 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009792 Py_UNICODE_COPY(res, pbuf, len);
9793 res += len;
9794 rescnt -= len;
9795 while (--width >= len) {
9796 --rescnt;
9797 *res++ = ' ';
9798 }
9799 if (dict && (argidx < arglen) && c != '%') {
9800 PyErr_SetString(PyExc_TypeError,
9801 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009802 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009803 goto onError;
9804 }
9805 Py_XDECREF(temp);
9806 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807 } /* until end */
9808 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009809 PyErr_SetString(PyExc_TypeError,
9810 "not all arguments converted during string formatting");
9811 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812 }
9813
Thomas Woutersa96affe2006-03-12 00:29:36 +00009814 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009815 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009817 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818 }
9819 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820 return (PyObject *)result;
9821
Benjamin Peterson29060642009-01-31 22:14:21 +00009822 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823 Py_XDECREF(result);
9824 Py_DECREF(uformat);
9825 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009826 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827 }
9828 return NULL;
9829}
9830
Jeremy Hylton938ace62002-07-17 16:30:39 +00009831static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009832unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9833
Tim Peters6d6c1a32001-08-02 04:15:00 +00009834static PyObject *
9835unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9836{
Benjamin Peterson29060642009-01-31 22:14:21 +00009837 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009838 static char *kwlist[] = {"object", "encoding", "errors", 0};
9839 char *encoding = NULL;
9840 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009841
Benjamin Peterson14339b62009-01-31 16:36:08 +00009842 if (type != &PyUnicode_Type)
9843 return unicode_subtype_new(type, args, kwds);
9844 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009846 return NULL;
9847 if (x == NULL)
9848 return (PyObject *)_PyUnicode_New(0);
9849 if (encoding == NULL && errors == NULL)
9850 return PyObject_Str(x);
9851 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009852 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009853}
9854
Guido van Rossume023fe02001-08-30 03:12:59 +00009855static PyObject *
9856unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9857{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009858 PyUnicodeObject *tmp, *pnew;
9859 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009860
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9862 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9863 if (tmp == NULL)
9864 return NULL;
9865 assert(PyUnicode_Check(tmp));
9866 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9867 if (pnew == NULL) {
9868 Py_DECREF(tmp);
9869 return NULL;
9870 }
9871 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9872 if (pnew->str == NULL) {
9873 _Py_ForgetReference((PyObject *)pnew);
9874 PyObject_Del(pnew);
9875 Py_DECREF(tmp);
9876 return PyErr_NoMemory();
9877 }
9878 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9879 pnew->length = n;
9880 pnew->hash = tmp->hash;
9881 Py_DECREF(tmp);
9882 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009883}
9884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009885PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009886 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009887\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009888Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009889encoding defaults to the current default string encoding.\n\
9890errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009891
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009892static PyObject *unicode_iter(PyObject *seq);
9893
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009895 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 "str", /* tp_name */
9897 sizeof(PyUnicodeObject), /* tp_size */
9898 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009900 (destructor)unicode_dealloc, /* tp_dealloc */
9901 0, /* tp_print */
9902 0, /* tp_getattr */
9903 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009904 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009905 unicode_repr, /* tp_repr */
9906 &unicode_as_number, /* tp_as_number */
9907 &unicode_as_sequence, /* tp_as_sequence */
9908 &unicode_as_mapping, /* tp_as_mapping */
9909 (hashfunc) unicode_hash, /* tp_hash*/
9910 0, /* tp_call*/
9911 (reprfunc) unicode_str, /* tp_str */
9912 PyObject_GenericGetAttr, /* tp_getattro */
9913 0, /* tp_setattro */
9914 0, /* tp_as_buffer */
9915 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009916 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009917 unicode_doc, /* tp_doc */
9918 0, /* tp_traverse */
9919 0, /* tp_clear */
9920 PyUnicode_RichCompare, /* tp_richcompare */
9921 0, /* tp_weaklistoffset */
9922 unicode_iter, /* tp_iter */
9923 0, /* tp_iternext */
9924 unicode_methods, /* tp_methods */
9925 0, /* tp_members */
9926 0, /* tp_getset */
9927 &PyBaseObject_Type, /* tp_base */
9928 0, /* tp_dict */
9929 0, /* tp_descr_get */
9930 0, /* tp_descr_set */
9931 0, /* tp_dictoffset */
9932 0, /* tp_init */
9933 0, /* tp_alloc */
9934 unicode_new, /* tp_new */
9935 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936};
9937
9938/* Initialize the Unicode implementation */
9939
Thomas Wouters78890102000-07-22 19:25:51 +00009940void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009942 int i;
9943
Thomas Wouters477c8d52006-05-27 19:21:47 +00009944 /* XXX - move this array to unicodectype.c ? */
9945 Py_UNICODE linebreak[] = {
9946 0x000A, /* LINE FEED */
9947 0x000D, /* CARRIAGE RETURN */
9948 0x001C, /* FILE SEPARATOR */
9949 0x001D, /* GROUP SEPARATOR */
9950 0x001E, /* RECORD SEPARATOR */
9951 0x0085, /* NEXT LINE */
9952 0x2028, /* LINE SEPARATOR */
9953 0x2029, /* PARAGRAPH SEPARATOR */
9954 };
9955
Fred Drakee4315f52000-05-09 19:53:39 +00009956 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009957 free_list = NULL;
9958 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009960 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009961 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009962
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009963 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009964 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009965 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009966 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009967
9968 /* initialize the linebreak bloom filter */
9969 bloom_linebreak = make_bloom_mask(
9970 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9971 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009972
9973 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974}
9975
9976/* Finalize the Unicode implementation */
9977
Christian Heimesa156e092008-02-16 07:38:31 +00009978int
9979PyUnicode_ClearFreeList(void)
9980{
9981 int freelist_size = numfree;
9982 PyUnicodeObject *u;
9983
9984 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 PyUnicodeObject *v = u;
9986 u = *(PyUnicodeObject **)u;
9987 if (v->str)
9988 PyObject_DEL(v->str);
9989 Py_XDECREF(v->defenc);
9990 PyObject_Del(v);
9991 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009992 }
9993 free_list = NULL;
9994 assert(numfree == 0);
9995 return freelist_size;
9996}
9997
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998void
Thomas Wouters78890102000-07-22 19:25:51 +00009999_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010001 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010003 Py_XDECREF(unicode_empty);
10004 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010005
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010006 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010007 if (unicode_latin1[i]) {
10008 Py_DECREF(unicode_latin1[i]);
10009 unicode_latin1[i] = NULL;
10010 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010011 }
Christian Heimesa156e092008-02-16 07:38:31 +000010012 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010014
Walter Dörwald16807132007-05-25 13:52:07 +000010015void
10016PyUnicode_InternInPlace(PyObject **p)
10017{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010018 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10019 PyObject *t;
10020 if (s == NULL || !PyUnicode_Check(s))
10021 Py_FatalError(
10022 "PyUnicode_InternInPlace: unicode strings only please!");
10023 /* If it's a subclass, we don't really know what putting
10024 it in the interned dict might do. */
10025 if (!PyUnicode_CheckExact(s))
10026 return;
10027 if (PyUnicode_CHECK_INTERNED(s))
10028 return;
10029 if (interned == NULL) {
10030 interned = PyDict_New();
10031 if (interned == NULL) {
10032 PyErr_Clear(); /* Don't leave an exception */
10033 return;
10034 }
10035 }
10036 /* It might be that the GetItem call fails even
10037 though the key is present in the dictionary,
10038 namely when this happens during a stack overflow. */
10039 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010040 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010041 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010042
Benjamin Peterson29060642009-01-31 22:14:21 +000010043 if (t) {
10044 Py_INCREF(t);
10045 Py_DECREF(*p);
10046 *p = t;
10047 return;
10048 }
Walter Dörwald16807132007-05-25 13:52:07 +000010049
Benjamin Peterson14339b62009-01-31 16:36:08 +000010050 PyThreadState_GET()->recursion_critical = 1;
10051 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10052 PyErr_Clear();
10053 PyThreadState_GET()->recursion_critical = 0;
10054 return;
10055 }
10056 PyThreadState_GET()->recursion_critical = 0;
10057 /* The two references in interned are not counted by refcnt.
10058 The deallocator will take care of this */
10059 Py_REFCNT(s) -= 2;
10060 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010061}
10062
10063void
10064PyUnicode_InternImmortal(PyObject **p)
10065{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010066 PyUnicode_InternInPlace(p);
10067 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10068 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10069 Py_INCREF(*p);
10070 }
Walter Dörwald16807132007-05-25 13:52:07 +000010071}
10072
10073PyObject *
10074PyUnicode_InternFromString(const char *cp)
10075{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010076 PyObject *s = PyUnicode_FromString(cp);
10077 if (s == NULL)
10078 return NULL;
10079 PyUnicode_InternInPlace(&s);
10080 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010081}
10082
10083void _Py_ReleaseInternedUnicodeStrings(void)
10084{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010085 PyObject *keys;
10086 PyUnicodeObject *s;
10087 Py_ssize_t i, n;
10088 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010089
Benjamin Peterson14339b62009-01-31 16:36:08 +000010090 if (interned == NULL || !PyDict_Check(interned))
10091 return;
10092 keys = PyDict_Keys(interned);
10093 if (keys == NULL || !PyList_Check(keys)) {
10094 PyErr_Clear();
10095 return;
10096 }
Walter Dörwald16807132007-05-25 13:52:07 +000010097
Benjamin Peterson14339b62009-01-31 16:36:08 +000010098 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10099 detector, interned unicode strings are not forcibly deallocated;
10100 rather, we give them their stolen references back, and then clear
10101 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010102
Benjamin Peterson14339b62009-01-31 16:36:08 +000010103 n = PyList_GET_SIZE(keys);
10104 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010105 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010106 for (i = 0; i < n; i++) {
10107 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10108 switch (s->state) {
10109 case SSTATE_NOT_INTERNED:
10110 /* XXX Shouldn't happen */
10111 break;
10112 case SSTATE_INTERNED_IMMORTAL:
10113 Py_REFCNT(s) += 1;
10114 immortal_size += s->length;
10115 break;
10116 case SSTATE_INTERNED_MORTAL:
10117 Py_REFCNT(s) += 2;
10118 mortal_size += s->length;
10119 break;
10120 default:
10121 Py_FatalError("Inconsistent interned string state.");
10122 }
10123 s->state = SSTATE_NOT_INTERNED;
10124 }
10125 fprintf(stderr, "total size of all interned strings: "
10126 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10127 "mortal/immortal\n", mortal_size, immortal_size);
10128 Py_DECREF(keys);
10129 PyDict_Clear(interned);
10130 Py_DECREF(interned);
10131 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010132}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010133
10134
10135/********************* Unicode Iterator **************************/
10136
10137typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010138 PyObject_HEAD
10139 Py_ssize_t it_index;
10140 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010141} unicodeiterobject;
10142
10143static void
10144unicodeiter_dealloc(unicodeiterobject *it)
10145{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010146 _PyObject_GC_UNTRACK(it);
10147 Py_XDECREF(it->it_seq);
10148 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010149}
10150
10151static int
10152unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10153{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010154 Py_VISIT(it->it_seq);
10155 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010156}
10157
10158static PyObject *
10159unicodeiter_next(unicodeiterobject *it)
10160{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010161 PyUnicodeObject *seq;
10162 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010163
Benjamin Peterson14339b62009-01-31 16:36:08 +000010164 assert(it != NULL);
10165 seq = it->it_seq;
10166 if (seq == NULL)
10167 return NULL;
10168 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010169
Benjamin Peterson14339b62009-01-31 16:36:08 +000010170 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10171 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010172 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010173 if (item != NULL)
10174 ++it->it_index;
10175 return item;
10176 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010177
Benjamin Peterson14339b62009-01-31 16:36:08 +000010178 Py_DECREF(seq);
10179 it->it_seq = NULL;
10180 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010181}
10182
10183static PyObject *
10184unicodeiter_len(unicodeiterobject *it)
10185{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010186 Py_ssize_t len = 0;
10187 if (it->it_seq)
10188 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10189 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010190}
10191
10192PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10193
10194static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010195 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010196 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010197 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010198};
10199
10200PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10202 "str_iterator", /* tp_name */
10203 sizeof(unicodeiterobject), /* tp_basicsize */
10204 0, /* tp_itemsize */
10205 /* methods */
10206 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10207 0, /* tp_print */
10208 0, /* tp_getattr */
10209 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010210 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010211 0, /* tp_repr */
10212 0, /* tp_as_number */
10213 0, /* tp_as_sequence */
10214 0, /* tp_as_mapping */
10215 0, /* tp_hash */
10216 0, /* tp_call */
10217 0, /* tp_str */
10218 PyObject_GenericGetAttr, /* tp_getattro */
10219 0, /* tp_setattro */
10220 0, /* tp_as_buffer */
10221 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10222 0, /* tp_doc */
10223 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10224 0, /* tp_clear */
10225 0, /* tp_richcompare */
10226 0, /* tp_weaklistoffset */
10227 PyObject_SelfIter, /* tp_iter */
10228 (iternextfunc)unicodeiter_next, /* tp_iternext */
10229 unicodeiter_methods, /* tp_methods */
10230 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010231};
10232
10233static PyObject *
10234unicode_iter(PyObject *seq)
10235{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010236 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010237
Benjamin Peterson14339b62009-01-31 16:36:08 +000010238 if (!PyUnicode_Check(seq)) {
10239 PyErr_BadInternalCall();
10240 return NULL;
10241 }
10242 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10243 if (it == NULL)
10244 return NULL;
10245 it->it_index = 0;
10246 Py_INCREF(seq);
10247 it->it_seq = (PyUnicodeObject *)seq;
10248 _PyObject_GC_TRACK(it);
10249 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010250}
10251
Martin v. Löwis5b222132007-06-10 09:51:05 +000010252size_t
10253Py_UNICODE_strlen(const Py_UNICODE *u)
10254{
10255 int res = 0;
10256 while(*u++)
10257 res++;
10258 return res;
10259}
10260
10261Py_UNICODE*
10262Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10263{
10264 Py_UNICODE *u = s1;
10265 while ((*u++ = *s2++));
10266 return s1;
10267}
10268
10269Py_UNICODE*
10270Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10271{
10272 Py_UNICODE *u = s1;
10273 while ((*u++ = *s2++))
10274 if (n-- == 0)
10275 break;
10276 return s1;
10277}
10278
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010279Py_UNICODE*
10280Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10281{
10282 Py_UNICODE *u1 = s1;
10283 u1 += Py_UNICODE_strlen(u1);
10284 Py_UNICODE_strcpy(u1, s2);
10285 return s1;
10286}
10287
Martin v. Löwis5b222132007-06-10 09:51:05 +000010288int
10289Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10290{
10291 while (*s1 && *s2 && *s1 == *s2)
10292 s1++, s2++;
10293 if (*s1 && *s2)
10294 return (*s1 < *s2) ? -1 : +1;
10295 if (*s1)
10296 return 1;
10297 if (*s2)
10298 return -1;
10299 return 0;
10300}
10301
Victor Stinneref8d95c2010-08-16 22:03:11 +000010302int
10303Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10304{
10305 register Py_UNICODE u1, u2;
10306 for (; n != 0; n--) {
10307 u1 = *s1;
10308 u2 = *s2;
10309 if (u1 != u2)
10310 return (u1 < u2) ? -1 : +1;
10311 if (u1 == '\0')
10312 return 0;
10313 s1++;
10314 s2++;
10315 }
10316 return 0;
10317}
10318
Martin v. Löwis5b222132007-06-10 09:51:05 +000010319Py_UNICODE*
10320Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10321{
10322 const Py_UNICODE *p;
10323 for (p = s; *p; p++)
10324 if (*p == c)
10325 return (Py_UNICODE*)p;
10326 return NULL;
10327}
10328
Victor Stinner331ea922010-08-10 16:37:20 +000010329Py_UNICODE*
10330Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10331{
10332 const Py_UNICODE *p;
10333 p = s + Py_UNICODE_strlen(s);
10334 while (p != s) {
10335 p--;
10336 if (*p == c)
10337 return (Py_UNICODE*)p;
10338 }
10339 return NULL;
10340}
10341
Victor Stinner71133ff2010-09-01 23:43:53 +000010342Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010343PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010344{
10345 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10346 Py_UNICODE *copy;
10347 Py_ssize_t size;
10348
10349 /* Ensure we won't overflow the size. */
10350 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10351 PyErr_NoMemory();
10352 return NULL;
10353 }
10354 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10355 size *= sizeof(Py_UNICODE);
10356 copy = PyMem_Malloc(size);
10357 if (copy == NULL) {
10358 PyErr_NoMemory();
10359 return NULL;
10360 }
10361 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10362 return copy;
10363}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010364
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
10367
10368static PyMethodDef _string_methods[] = {
10369 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10370 METH_O, PyDoc_STR("split the argument as a field name")},
10371 {"formatter_parser", (PyCFunction) formatter_parser,
10372 METH_O, PyDoc_STR("parse the argument as a format string")},
10373 {NULL, NULL}
10374};
10375
10376static struct PyModuleDef _string_module = {
10377 PyModuleDef_HEAD_INIT,
10378 "_string",
10379 PyDoc_STR("string helper module"),
10380 0,
10381 _string_methods,
10382 NULL,
10383 NULL,
10384 NULL,
10385 NULL
10386};
10387
10388PyMODINIT_FUNC
10389PyInit__string(void)
10390{
10391 return PyModule_Create(&_string_module);
10392}
10393
10394
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010395#ifdef __cplusplus
10396}
10397#endif