blob: 751da30e42353155c20bed3a3588603c607ba546 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Guido van Rossumdaa251c2007-10-25 23:47:33 +000044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes2202f872008-02-06 14:31:34 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes2202f872008-02-06 14:31:34 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
103static PyObject *interned;
104
Guido van Rossumd57fd912000-03-10 22:53:23 +0000105/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000106static PyUnicodeObject *free_list;
107static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000108
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113 shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
/* Fast detection of the most frequent whitespace characters.
   The table is indexed by code point (0x00 - 0x7F); a non-zero entry
   marks a whitespace character. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
146
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147static PyObject *unicode_encode_call_errorhandler(const char *errors,
148 PyObject **errorHandler,const char *encoding, const char *reason,
149 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
150 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
151
Victor Stinner31be90b2010-04-22 19:38:16 +0000152static void raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
157
/* Fast detection of line break characters in the ASCII range.
   The table is indexed by code point (0x00 - 0x7F); a non-zero entry
   marks a line break.  The table is read-only lookup data, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init() below. */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
228Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
229{
230 /* calculate simple bloom-style bitmask for a given unicode string */
231
Antoine Pitrouf068f942010-01-13 14:19:12 +0000232 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000233 Py_ssize_t i;
234
235 mask = 0;
236 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000237 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000238
239 return mask;
240}
241
242Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
243{
244 Py_ssize_t i;
245
246 for (i = 0; i < setlen; i++)
247 if (set[i] == chr)
248 return 1;
249
250 return 0;
251}
252
/* Membership test: cheap bloom pre-check first, exact scan via
   unicode_member() only if the bloom bit is set.  The whole expansion is
   parenthesized so the macro composes correctly with !, || and ?:.
   Note that chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
255
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256/* --- Unicode Object ----------------------------------------------------- */
257
258static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000259int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000260 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261{
262 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000263
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000264 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000266 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000268 /* Resizing shared object (unicode_empty or single character
269 objects) in-place is not allowed. Use PyUnicode_Resize()
270 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000271
Benjamin Peterson14339b62009-01-31 16:36:08 +0000272 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000273 (unicode->length == 1 &&
274 unicode->str[0] < 256U &&
275 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000277 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 return -1;
279 }
280
Thomas Wouters477c8d52006-05-27 19:21:47 +0000281 /* We allocate one more byte to make sure the string is Ux0000 terminated.
282 The overallocation is also used by fastsearch, which assumes that it's
283 safe to look at str[length] (without making any assumptions about what
284 it contains). */
285
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000287 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 PyErr_NoMemory();
292 return -1;
293 }
294 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296
Benjamin Peterson29060642009-01-31 22:14:21 +0000297 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000299 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000300 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000301 }
302 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000303
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304 return 0;
305}
306
307/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000308 Ux0000 terminated; some code (e.g. new_identifier)
309 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310
311 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000312 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000313
314*/
315
316static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318{
319 register PyUnicodeObject *unicode;
320
Thomas Wouters477c8d52006-05-27 19:21:47 +0000321 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
Guido van Rossumd57fd912000-03-10 22:53:23 +0000332 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000341 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000345 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000346 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000351 }
352 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000353 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000355 if (unicode == NULL)
356 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 }
360
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000361 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000362 PyErr_NoMemory();
363 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000364 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000365 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000372 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000376 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000377 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000379
Benjamin Peterson29060642009-01-31 22:14:21 +0000380 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000381 /* XXX UNREF/NEWREF interface should be more symmetrical */
382 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000384 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386}
387
388static
Guido van Rossum9475a232001-10-05 20:51:39 +0000389void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000390{
Walter Dörwald16807132007-05-25 13:52:07 +0000391 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000392 case SSTATE_NOT_INTERNED:
393 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000394
Benjamin Peterson29060642009-01-31 22:14:21 +0000395 case SSTATE_INTERNED_MORTAL:
396 /* revive dead object temporarily for DelItem */
397 Py_REFCNT(unicode) = 3;
398 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
399 Py_FatalError(
400 "deletion of interned string failed");
401 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000402
Benjamin Peterson29060642009-01-31 22:14:21 +0000403 case SSTATE_INTERNED_IMMORTAL:
404 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000405
Benjamin Peterson29060642009-01-31 22:14:21 +0000406 default:
407 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000408 }
409
Guido van Rossum604ddf82001-12-06 20:03:56 +0000410 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000412 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
414 PyObject_DEL(unicode->str);
415 unicode->str = NULL;
416 unicode->length = 0;
417 }
418 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000419 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000420 }
421 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000422 *(PyUnicodeObject **)unicode = free_list;
423 free_list = unicode;
424 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000425 }
426 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000427 PyObject_DEL(unicode->str);
428 Py_XDECREF(unicode->defenc);
429 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430 }
431}
432
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000433static
434int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435{
436 register PyUnicodeObject *v;
437
438 /* Argument checks */
439 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000440 PyErr_BadInternalCall();
441 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000442 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000443 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000444 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000445 PyErr_BadInternalCall();
446 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000447 }
448
449 /* Resizing unicode_empty and single character objects is not
450 possible since these are being shared. We simply return a fresh
451 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000452 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000453 (v == unicode_empty || v->length == 1)) {
454 PyUnicodeObject *w = _PyUnicode_New(length);
455 if (w == NULL)
456 return -1;
457 Py_UNICODE_COPY(w->str, v->str,
458 length < v->length ? length : v->length);
459 Py_DECREF(*unicode);
460 *unicode = w;
461 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000462 }
463
464 /* Note that we don't have to modify *unicode for unshared Unicode
465 objects, since we can modify them in-place. */
466 return unicode_resize(v, length);
467}
468
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000469int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
470{
471 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
472}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000473
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000475 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476{
477 PyUnicodeObject *unicode;
478
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000479 /* If the Unicode data is known at construction time, we can apply
480 some optimizations which share commonly used objects. */
481 if (u != NULL) {
482
Benjamin Peterson29060642009-01-31 22:14:21 +0000483 /* Optimization for empty strings */
484 if (size == 0 && unicode_empty != NULL) {
485 Py_INCREF(unicode_empty);
486 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000487 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000488
489 /* Single character Unicode objects in the Latin-1 range are
490 shared when using this constructor */
491 if (size == 1 && *u < 256) {
492 unicode = unicode_latin1[*u];
493 if (!unicode) {
494 unicode = _PyUnicode_New(1);
495 if (!unicode)
496 return NULL;
497 unicode->str[0] = *u;
498 unicode_latin1[*u] = unicode;
499 }
500 Py_INCREF(unicode);
501 return (PyObject *)unicode;
502 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000503 }
Tim Petersced69f82003-09-16 20:30:58 +0000504
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 unicode = _PyUnicode_New(size);
506 if (!unicode)
507 return NULL;
508
509 /* Copy the Unicode data into the new object */
510 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000511 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512
513 return (PyObject *)unicode;
514}
515
Walter Dörwaldd2034312007-05-18 16:29:38 +0000516PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000517{
518 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000519
Benjamin Peterson14339b62009-01-31 16:36:08 +0000520 if (size < 0) {
521 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000522 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000523 return NULL;
524 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000525
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000526 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000527 some optimizations which share commonly used objects.
528 Also, this means the input must be UTF-8, so fall back to the
529 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 if (u != NULL) {
531
Benjamin Peterson29060642009-01-31 22:14:21 +0000532 /* Optimization for empty strings */
533 if (size == 0 && unicode_empty != NULL) {
534 Py_INCREF(unicode_empty);
535 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000536 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000537
538 /* Single characters are shared when using this constructor.
539 Restrict to ASCII, since the input must be UTF-8. */
540 if (size == 1 && Py_CHARMASK(*u) < 128) {
541 unicode = unicode_latin1[Py_CHARMASK(*u)];
542 if (!unicode) {
543 unicode = _PyUnicode_New(1);
544 if (!unicode)
545 return NULL;
546 unicode->str[0] = Py_CHARMASK(*u);
547 unicode_latin1[Py_CHARMASK(*u)] = unicode;
548 }
549 Py_INCREF(unicode);
550 return (PyObject *)unicode;
551 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000552
553 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000554 }
555
Walter Dörwald55507312007-05-18 13:12:10 +0000556 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000557 if (!unicode)
558 return NULL;
559
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000560 return (PyObject *)unicode;
561}
562
Walter Dörwaldd2034312007-05-18 16:29:38 +0000563PyObject *PyUnicode_FromString(const char *u)
564{
565 size_t size = strlen(u);
566 if (size > PY_SSIZE_T_MAX) {
567 PyErr_SetString(PyExc_OverflowError, "input too long");
568 return NULL;
569 }
570
571 return PyUnicode_FromStringAndSize(u, size);
572}
573
Guido van Rossumd57fd912000-03-10 22:53:23 +0000574#ifdef HAVE_WCHAR_H
575
Mark Dickinson081dfee2009-03-18 14:47:41 +0000576#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
577# define CONVERT_WCHAR_TO_SURROGATES
578#endif
579
580#ifdef CONVERT_WCHAR_TO_SURROGATES
581
582/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
583 to convert from UTF32 to UTF16. */
584
585PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
586 Py_ssize_t size)
587{
588 PyUnicodeObject *unicode;
589 register Py_ssize_t i;
590 Py_ssize_t alloc;
591 const wchar_t *orig_w;
592
593 if (w == NULL) {
594 if (size == 0)
595 return PyUnicode_FromStringAndSize(NULL, 0);
596 PyErr_BadInternalCall();
597 return NULL;
598 }
599
600 if (size == -1) {
601 size = wcslen(w);
602 }
603
604 alloc = size;
605 orig_w = w;
606 for (i = size; i > 0; i--) {
607 if (*w > 0xFFFF)
608 alloc++;
609 w++;
610 }
611 w = orig_w;
612 unicode = _PyUnicode_New(alloc);
613 if (!unicode)
614 return NULL;
615
616 /* Copy the wchar_t data into the new object */
617 {
618 register Py_UNICODE *u;
619 u = PyUnicode_AS_UNICODE(unicode);
620 for (i = size; i > 0; i--) {
621 if (*w > 0xFFFF) {
622 wchar_t ordinal = *w++;
623 ordinal -= 0x10000;
624 *u++ = 0xD800 | (ordinal >> 10);
625 *u++ = 0xDC00 | (ordinal & 0x3FF);
626 }
627 else
628 *u++ = *w++;
629 }
630 }
631 return (PyObject *)unicode;
632}
633
634#else
635
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000637 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000638{
639 PyUnicodeObject *unicode;
640
641 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000642 if (size == 0)
643 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000644 PyErr_BadInternalCall();
645 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646 }
647
Martin v. Löwis790465f2008-04-05 20:41:37 +0000648 if (size == -1) {
649 size = wcslen(w);
650 }
651
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652 unicode = _PyUnicode_New(size);
653 if (!unicode)
654 return NULL;
655
656 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000657#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000659#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000660 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000661 register Py_UNICODE *u;
662 register Py_ssize_t i;
663 u = PyUnicode_AS_UNICODE(unicode);
664 for (i = size; i > 0; i--)
665 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 }
667#endif
668
669 return (PyObject *)unicode;
670}
671
Mark Dickinson081dfee2009-03-18 14:47:41 +0000672#endif /* CONVERT_WCHAR_TO_SURROGATES */
673
674#undef CONVERT_WCHAR_TO_SURROGATES
675
Walter Dörwald346737f2007-05-31 10:44:43 +0000676static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000677makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
678 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000679{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000680 *fmt++ = '%';
681 if (width) {
682 if (zeropad)
683 *fmt++ = '0';
684 fmt += sprintf(fmt, "%d", width);
685 }
686 if (precision)
687 fmt += sprintf(fmt, ".%d", precision);
688 if (longflag)
689 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000690 else if (longlongflag) {
691 /* longlongflag should only ever be nonzero on machines with
692 HAVE_LONG_LONG defined */
693#ifdef HAVE_LONG_LONG
694 char *f = PY_FORMAT_LONG_LONG;
695 while (*f)
696 *fmt++ = *f++;
697#else
698 /* we shouldn't ever get here */
699 assert(0);
700 *fmt++ = 'l';
701#endif
702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000703 else if (size_tflag) {
704 char *f = PY_FORMAT_SIZE_T;
705 while (*f)
706 *fmt++ = *f++;
707 }
708 *fmt++ = c;
709 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000710}
711
Walter Dörwaldd2034312007-05-18 16:29:38 +0000712#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
713
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000714/* size of fixed-size buffer for formatting single arguments */
715#define ITEM_BUFFER_LEN 21
716/* maximum number of characters required for output of %ld. 21 characters
717 allows for 64-bit integers (in decimal) and an optional sign. */
718#define MAX_LONG_CHARS 21
719/* maximum number of characters required for output of %lld.
720 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
721 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
722#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
723
Walter Dörwaldd2034312007-05-18 16:29:38 +0000724PyObject *
725PyUnicode_FromFormatV(const char *format, va_list vargs)
726{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000727 va_list count;
728 Py_ssize_t callcount = 0;
729 PyObject **callresults = NULL;
730 PyObject **callresult = NULL;
731 Py_ssize_t n = 0;
732 int width = 0;
733 int precision = 0;
734 int zeropad;
735 const char* f;
736 Py_UNICODE *s;
737 PyObject *string;
738 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000739 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000740 /* use abuffer instead of buffer, if we need more space
741 * (which can happen if there's a format specifier with width). */
742 char *abuffer = NULL;
743 char *realbuffer;
744 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000745 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000746 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000747
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000748 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000749 /* step 1: count the number of %S/%R/%A/%s format specifications
750 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
751 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
752 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000753 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000754 if (*f == '%') {
755 if (*(f+1)=='%')
756 continue;
757 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
758 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000759 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000761 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000762 ;
763 if (*f == 's')
764 ++callcount;
765 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000766 else if (128 <= (unsigned char)*f) {
767 PyErr_Format(PyExc_ValueError,
768 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000769 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000771 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000772 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000773 }
774 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000775 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000776 if (callcount) {
777 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
778 if (!callresults) {
779 PyErr_NoMemory();
780 return NULL;
781 }
782 callresult = callresults;
783 }
784 /* step 3: figure out how large a buffer we need */
785 for (f = format; *f; f++) {
786 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000787#ifdef HAVE_LONG_LONG
788 int longlongflag = 0;
789#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000790 const char* p = f;
791 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000792 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000794 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000796
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
798 * they don't affect the amount of space we reserve.
799 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000800 if (*f == 'l') {
801 if (f[1] == 'd' || f[1] == 'u') {
802 ++f;
803 }
804#ifdef HAVE_LONG_LONG
805 else if (f[1] == 'l' &&
806 (f[2] == 'd' || f[2] == 'u')) {
807 longlongflag = 1;
808 f += 2;
809 }
810#endif
811 }
812 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000813 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000814 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000815
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 switch (*f) {
817 case 'c':
818 (void)va_arg(count, int);
819 /* fall through... */
820 case '%':
821 n++;
822 break;
823 case 'd': case 'u': case 'i': case 'x':
824 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000825#ifdef HAVE_LONG_LONG
826 if (longlongflag) {
827 if (width < MAX_LONG_LONG_CHARS)
828 width = MAX_LONG_LONG_CHARS;
829 }
830 else
831#endif
832 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
833 including sign. Decimal takes the most space. This
834 isn't enough for octal. If a width is specified we
835 need more (which we allocate later). */
836 if (width < MAX_LONG_CHARS)
837 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000838 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000839 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (abuffersize < width)
841 abuffersize = width;
842 break;
843 case 's':
844 {
845 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000846 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000847 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
848 if (!str)
849 goto fail;
850 n += PyUnicode_GET_SIZE(str);
851 /* Remember the str and switch to the next slot */
852 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000853 break;
854 }
855 case 'U':
856 {
857 PyObject *obj = va_arg(count, PyObject *);
858 assert(obj && PyUnicode_Check(obj));
859 n += PyUnicode_GET_SIZE(obj);
860 break;
861 }
862 case 'V':
863 {
864 PyObject *obj = va_arg(count, PyObject *);
865 const char *str = va_arg(count, const char *);
866 assert(obj || str);
867 assert(!obj || PyUnicode_Check(obj));
868 if (obj)
869 n += PyUnicode_GET_SIZE(obj);
870 else
871 n += strlen(str);
872 break;
873 }
874 case 'S':
875 {
876 PyObject *obj = va_arg(count, PyObject *);
877 PyObject *str;
878 assert(obj);
879 str = PyObject_Str(obj);
880 if (!str)
881 goto fail;
882 n += PyUnicode_GET_SIZE(str);
883 /* Remember the str and switch to the next slot */
884 *callresult++ = str;
885 break;
886 }
887 case 'R':
888 {
889 PyObject *obj = va_arg(count, PyObject *);
890 PyObject *repr;
891 assert(obj);
892 repr = PyObject_Repr(obj);
893 if (!repr)
894 goto fail;
895 n += PyUnicode_GET_SIZE(repr);
896 /* Remember the repr and switch to the next slot */
897 *callresult++ = repr;
898 break;
899 }
900 case 'A':
901 {
902 PyObject *obj = va_arg(count, PyObject *);
903 PyObject *ascii;
904 assert(obj);
905 ascii = PyObject_ASCII(obj);
906 if (!ascii)
907 goto fail;
908 n += PyUnicode_GET_SIZE(ascii);
909 /* Remember the repr and switch to the next slot */
910 *callresult++ = ascii;
911 break;
912 }
913 case 'p':
914 (void) va_arg(count, int);
915 /* maximum 64-bit pointer representation:
916 * 0xffffffffffffffff
917 * so 19 characters is enough.
918 * XXX I count 18 -- what's the extra for?
919 */
920 n += 19;
921 break;
922 default:
923 /* if we stumble upon an unknown
924 formatting code, copy the rest of
925 the format string to the output
926 string. (we cannot just skip the
927 code, since there's no way to know
928 what's in the argument list) */
929 n += strlen(p);
930 goto expand;
931 }
932 } else
933 n++;
934 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000935 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000936 if (abuffersize > ITEM_BUFFER_LEN) {
937 /* add 1 for sprintf's trailing null byte */
938 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000939 if (!abuffer) {
940 PyErr_NoMemory();
941 goto fail;
942 }
943 realbuffer = abuffer;
944 }
945 else
946 realbuffer = buffer;
947 /* step 4: fill the buffer */
948 /* Since we've analyzed how much space we need for the worst case,
949 we don't have to resize the string.
950 There can be no errors beyond this point. */
951 string = PyUnicode_FromUnicode(NULL, n);
952 if (!string)
953 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000954
Benjamin Peterson14339b62009-01-31 16:36:08 +0000955 s = PyUnicode_AS_UNICODE(string);
956 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000957
Benjamin Peterson14339b62009-01-31 16:36:08 +0000958 for (f = format; *f; f++) {
959 if (*f == '%') {
960 const char* p = f++;
961 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000962 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000963 int size_tflag = 0;
964 zeropad = (*f == '0');
965 /* parse the width.precision part */
966 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000967 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000968 width = (width*10) + *f++ - '0';
969 precision = 0;
970 if (*f == '.') {
971 f++;
David Malcolm96960882010-11-05 17:23:41 +0000972 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000973 precision = (precision*10) + *f++ - '0';
974 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000975 /* Handle %ld, %lu, %lld and %llu. */
976 if (*f == 'l') {
977 if (f[1] == 'd' || f[1] == 'u') {
978 longflag = 1;
979 ++f;
980 }
981#ifdef HAVE_LONG_LONG
982 else if (f[1] == 'l' &&
983 (f[2] == 'd' || f[2] == 'u')) {
984 longlongflag = 1;
985 f += 2;
986 }
987#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000988 }
989 /* handle the size_t flag. */
990 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
991 size_tflag = 1;
992 ++f;
993 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000994
Benjamin Peterson14339b62009-01-31 16:36:08 +0000995 switch (*f) {
996 case 'c':
997 *s++ = va_arg(vargs, int);
998 break;
999 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001000 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1001 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (longflag)
1003 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001004#ifdef HAVE_LONG_LONG
1005 else if (longlongflag)
1006 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1007#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001008 else if (size_tflag)
1009 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1010 else
1011 sprintf(realbuffer, fmt, va_arg(vargs, int));
1012 appendstring(realbuffer);
1013 break;
1014 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001015 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1016 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001017 if (longflag)
1018 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001019#ifdef HAVE_LONG_LONG
1020 else if (longlongflag)
1021 sprintf(realbuffer, fmt, va_arg(vargs,
1022 unsigned PY_LONG_LONG));
1023#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001024 else if (size_tflag)
1025 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1026 else
1027 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1028 appendstring(realbuffer);
1029 break;
1030 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001031 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001032 sprintf(realbuffer, fmt, va_arg(vargs, int));
1033 appendstring(realbuffer);
1034 break;
1035 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001036 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001037 sprintf(realbuffer, fmt, va_arg(vargs, int));
1038 appendstring(realbuffer);
1039 break;
1040 case 's':
1041 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001042 /* unused, since we already have the result */
1043 (void) va_arg(vargs, char *);
1044 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1045 PyUnicode_GET_SIZE(*callresult));
1046 s += PyUnicode_GET_SIZE(*callresult);
1047 /* We're done with the unicode()/repr() => forget it */
1048 Py_DECREF(*callresult);
1049 /* switch to next unicode()/repr() result */
1050 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 break;
1052 }
1053 case 'U':
1054 {
1055 PyObject *obj = va_arg(vargs, PyObject *);
1056 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1057 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1058 s += size;
1059 break;
1060 }
1061 case 'V':
1062 {
1063 PyObject *obj = va_arg(vargs, PyObject *);
1064 const char *str = va_arg(vargs, const char *);
1065 if (obj) {
1066 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1067 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1068 s += size;
1069 } else {
1070 appendstring(str);
1071 }
1072 break;
1073 }
1074 case 'S':
1075 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001076 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001077 {
1078 Py_UNICODE *ucopy;
1079 Py_ssize_t usize;
1080 Py_ssize_t upos;
1081 /* unused, since we already have the result */
1082 (void) va_arg(vargs, PyObject *);
1083 ucopy = PyUnicode_AS_UNICODE(*callresult);
1084 usize = PyUnicode_GET_SIZE(*callresult);
1085 for (upos = 0; upos<usize;)
1086 *s++ = ucopy[upos++];
1087 /* We're done with the unicode()/repr() => forget it */
1088 Py_DECREF(*callresult);
1089 /* switch to next unicode()/repr() result */
1090 ++callresult;
1091 break;
1092 }
1093 case 'p':
1094 sprintf(buffer, "%p", va_arg(vargs, void*));
1095 /* %p is ill-defined: ensure leading 0x. */
1096 if (buffer[1] == 'X')
1097 buffer[1] = 'x';
1098 else if (buffer[1] != 'x') {
1099 memmove(buffer+2, buffer, strlen(buffer)+1);
1100 buffer[0] = '0';
1101 buffer[1] = 'x';
1102 }
1103 appendstring(buffer);
1104 break;
1105 case '%':
1106 *s++ = '%';
1107 break;
1108 default:
1109 appendstring(p);
1110 goto end;
1111 }
Victor Stinner1205f272010-09-11 00:54:47 +00001112 }
Victor Stinner1205f272010-09-11 00:54:47 +00001113 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001114 *s++ = *f;
1115 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001116
Benjamin Peterson29060642009-01-31 22:14:21 +00001117 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 if (callresults)
1119 PyObject_Free(callresults);
1120 if (abuffer)
1121 PyObject_Free(abuffer);
1122 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1123 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001124 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 if (callresults) {
1126 PyObject **callresult2 = callresults;
1127 while (callresult2 < callresult) {
1128 Py_DECREF(*callresult2);
1129 ++callresult2;
1130 }
1131 PyObject_Free(callresults);
1132 }
1133 if (abuffer)
1134 PyObject_Free(abuffer);
1135 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001136}
1137
1138#undef appendstring
1139
1140PyObject *
1141PyUnicode_FromFormat(const char *format, ...)
1142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001143 PyObject* ret;
1144 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001145
1146#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001147 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001148#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001150#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001151 ret = PyUnicode_FromFormatV(format, vargs);
1152 va_end(vargs);
1153 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001154}
1155
Victor Stinner5593d8a2010-10-02 11:11:27 +00001156/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1157 convert a Unicode object to a wide character string.
1158
1159 - If w is NULL: return the number of wide characters (including the nul
1160 character) required to convert the unicode object. Ignore size argument.
1161
1162 - Otherwise: return the number of wide characters (excluding the nul
1163 character) written into w. Write at most size wide characters (including
1164 the nul character). */
1165static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001166unicode_aswidechar(PyUnicodeObject *unicode,
1167 wchar_t *w,
1168 Py_ssize_t size)
1169{
1170#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001171 Py_ssize_t res;
1172 if (w != NULL) {
1173 res = PyUnicode_GET_SIZE(unicode);
1174 if (size > res)
1175 size = res + 1;
1176 else
1177 res = size;
1178 memcpy(w, unicode->str, size * sizeof(wchar_t));
1179 return res;
1180 }
1181 else
1182 return PyUnicode_GET_SIZE(unicode) + 1;
1183#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1184 register const Py_UNICODE *u;
1185 const Py_UNICODE *uend;
1186 const wchar_t *worig, *wend;
1187 Py_ssize_t nchar;
1188
Victor Stinner137c34c2010-09-29 10:25:54 +00001189 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001190 uend = u + PyUnicode_GET_SIZE(unicode);
1191 if (w != NULL) {
1192 worig = w;
1193 wend = w + size;
1194 while (u != uend && w != wend) {
1195 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1196 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1197 {
1198 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1199 u += 2;
1200 }
1201 else {
1202 *w = *u;
1203 u++;
1204 }
1205 w++;
1206 }
1207 if (w != wend)
1208 *w = L'\0';
1209 return w - worig;
1210 }
1211 else {
1212 nchar = 1; /* nul character at the end */
1213 while (u != uend) {
1214 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216 u += 2;
1217 else
1218 u++;
1219 nchar++;
1220 }
1221 }
1222 return nchar;
1223#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1224 register Py_UNICODE *u, *uend, ordinal;
1225 register Py_ssize_t i;
1226 wchar_t *worig, *wend;
1227 Py_ssize_t nchar;
1228
1229 u = PyUnicode_AS_UNICODE(unicode);
1230 uend = u + PyUnicode_GET_SIZE(u);
1231 if (w != NULL) {
1232 worig = w;
1233 wend = w + size;
1234 while (u != uend && w != wend) {
1235 ordinal = *u;
1236 if (ordinal > 0xffff) {
1237 ordinal -= 0x10000;
1238 *w++ = 0xD800 | (ordinal >> 10);
1239 *w++ = 0xDC00 | (ordinal & 0x3FF);
1240 }
1241 else
1242 *w++ = ordinal;
1243 u++;
1244 }
1245 if (w != wend)
1246 *w = 0;
1247 return w - worig;
1248 }
1249 else {
1250 nchar = 1; /* nul character */
1251 while (u != uend) {
1252 if (*u > 0xffff)
1253 nchar += 2;
1254 else
1255 nchar++;
1256 u++;
1257 }
1258 return nchar;
1259 }
1260#else
1261# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001262#endif
1263}
1264
1265Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001266PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001267 wchar_t *w,
1268 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269{
1270 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001271 PyErr_BadInternalCall();
1272 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001274 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275}
1276
Victor Stinner137c34c2010-09-29 10:25:54 +00001277wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001278PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001279 Py_ssize_t *size)
1280{
1281 wchar_t* buffer;
1282 Py_ssize_t buflen;
1283
1284 if (unicode == NULL) {
1285 PyErr_BadInternalCall();
1286 return NULL;
1287 }
1288
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001289 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001290 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001291 PyErr_NoMemory();
1292 return NULL;
1293 }
1294
Victor Stinner137c34c2010-09-29 10:25:54 +00001295 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1296 if (buffer == NULL) {
1297 PyErr_NoMemory();
1298 return NULL;
1299 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001300 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001301 if (size != NULL)
1302 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001303 return buffer;
1304}
1305
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306#endif
1307
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001308PyObject *PyUnicode_FromOrdinal(int ordinal)
1309{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001310 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001311
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001312 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001313 PyErr_SetString(PyExc_ValueError,
1314 "chr() arg not in range(0x110000)");
1315 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001316 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001317
1318#ifndef Py_UNICODE_WIDE
1319 if (ordinal > 0xffff) {
1320 ordinal -= 0x10000;
1321 s[0] = 0xD800 | (ordinal >> 10);
1322 s[1] = 0xDC00 | (ordinal & 0x3FF);
1323 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001324 }
1325#endif
1326
Hye-Shik Chang40574832004-04-06 07:24:51 +00001327 s[0] = (Py_UNICODE)ordinal;
1328 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001329}
1330
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331PyObject *PyUnicode_FromObject(register PyObject *obj)
1332{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001333 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001334 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001335 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001336 Py_INCREF(obj);
1337 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001338 }
1339 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001340 /* For a Unicode subtype that's not a Unicode object,
1341 return a true Unicode object with the same data. */
1342 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1343 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001344 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001345 PyErr_Format(PyExc_TypeError,
1346 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001347 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001348 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001349}
1350
1351PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001352 const char *encoding,
1353 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001354{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001355 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001356 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001357
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 PyErr_BadInternalCall();
1360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001362
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001363 /* Decoding bytes objects is the most common case and should be fast */
1364 if (PyBytes_Check(obj)) {
1365 if (PyBytes_GET_SIZE(obj) == 0) {
1366 Py_INCREF(unicode_empty);
1367 v = (PyObject *) unicode_empty;
1368 }
1369 else {
1370 v = PyUnicode_Decode(
1371 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1372 encoding, errors);
1373 }
1374 return v;
1375 }
1376
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001377 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001378 PyErr_SetString(PyExc_TypeError,
1379 "decoding str is not supported");
1380 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001381 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001382
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001383 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1384 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1385 PyErr_Format(PyExc_TypeError,
1386 "coercing to str: need bytes, bytearray "
1387 "or buffer-like object, %.80s found",
1388 Py_TYPE(obj)->tp_name);
1389 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001390 }
Tim Petersced69f82003-09-16 20:30:58 +00001391
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001392 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001393 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001394 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395 }
Tim Petersced69f82003-09-16 20:30:58 +00001396 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001397 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001398
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001399 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001400 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401}
1402
/* Copy `encoding` into `lower`, converting upper case letters to lower case
   and replacing '_' with '-', so that e.g. "UTF_8" becomes "utf-8".

   `lower` has room for `lower_len` bytes including the terminating nul.
   Return 1 on success.  Return 0 when the encoding name is longer than
   lower_len-1 characters; `lower` is then left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last usable slot, reserved for the terminating nul */
    char *const last = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == last)
            return 0;

        ch = *src;
        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1435
1436PyObject *PyUnicode_Decode(const char *s,
1437 Py_ssize_t size,
1438 const char *encoding,
1439 const char *errors)
1440{
1441 PyObject *buffer = NULL, *unicode;
1442 Py_buffer info;
1443 char lower[11]; /* Enough for any encoding shortcut */
1444
1445 if (encoding == NULL)
1446 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001447
1448 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001449 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1450 if (strcmp(lower, "utf-8") == 0)
1451 return PyUnicode_DecodeUTF8(s, size, errors);
1452 else if ((strcmp(lower, "latin-1") == 0) ||
1453 (strcmp(lower, "iso-8859-1") == 0))
1454 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001455#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001456 else if (strcmp(lower, "mbcs") == 0)
1457 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001458#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001459 else if (strcmp(lower, "ascii") == 0)
1460 return PyUnicode_DecodeASCII(s, size, errors);
1461 else if (strcmp(lower, "utf-16") == 0)
1462 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1463 else if (strcmp(lower, "utf-32") == 0)
1464 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466
1467 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001468 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001469 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001470 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001471 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 if (buffer == NULL)
1473 goto onError;
1474 unicode = PyCodec_Decode(buffer, encoding, errors);
1475 if (unicode == NULL)
1476 goto onError;
1477 if (!PyUnicode_Check(unicode)) {
1478 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001479 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001480 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 Py_DECREF(unicode);
1482 goto onError;
1483 }
1484 Py_DECREF(buffer);
1485 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001486
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488 Py_XDECREF(buffer);
1489 return NULL;
1490}
1491
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001492PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1493 const char *encoding,
1494 const char *errors)
1495{
1496 PyObject *v;
1497
1498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 goto onError;
1501 }
1502
1503 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001505
1506 /* Decode via the codec registry */
1507 v = PyCodec_Decode(unicode, encoding, errors);
1508 if (v == NULL)
1509 goto onError;
1510 return v;
1511
Benjamin Peterson29060642009-01-31 22:14:21 +00001512 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001513 return NULL;
1514}
1515
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001516PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1517 const char *encoding,
1518 const char *errors)
1519{
1520 PyObject *v;
1521
1522 if (!PyUnicode_Check(unicode)) {
1523 PyErr_BadArgument();
1524 goto onError;
1525 }
1526
1527 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001528 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001529
1530 /* Decode via the codec registry */
1531 v = PyCodec_Decode(unicode, encoding, errors);
1532 if (v == NULL)
1533 goto onError;
1534 if (!PyUnicode_Check(v)) {
1535 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001536 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001537 Py_TYPE(v)->tp_name);
1538 Py_DECREF(v);
1539 goto onError;
1540 }
1541 return v;
1542
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001544 return NULL;
1545}
1546
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001548 Py_ssize_t size,
1549 const char *encoding,
1550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001551{
1552 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001553
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554 unicode = PyUnicode_FromUnicode(s, size);
1555 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001556 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001557 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1558 Py_DECREF(unicode);
1559 return v;
1560}
1561
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001562PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1563 const char *encoding,
1564 const char *errors)
1565{
1566 PyObject *v;
1567
1568 if (!PyUnicode_Check(unicode)) {
1569 PyErr_BadArgument();
1570 goto onError;
1571 }
1572
1573 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001575
1576 /* Encode via the codec registry */
1577 v = PyCodec_Encode(unicode, encoding, errors);
1578 if (v == NULL)
1579 goto onError;
1580 return v;
1581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001583 return NULL;
1584}
1585
Victor Stinnerad158722010-10-27 00:25:46 +00001586PyObject *
1587PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001588{
Victor Stinner313a1202010-06-11 23:56:51 +00001589#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001590 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1591 PyUnicode_GET_SIZE(unicode),
1592 NULL);
1593#elif defined(__APPLE__)
1594 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1595 PyUnicode_GET_SIZE(unicode),
1596 "surrogateescape");
1597#else
1598 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001599 return PyUnicode_AsEncodedString(unicode,
1600 Py_FileSystemDefaultEncoding,
1601 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001602 }
1603 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001604 /* locale encoding with surrogateescape */
1605 wchar_t *wchar;
1606 char *bytes;
1607 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001608 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001609
1610 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1611 if (wchar == NULL)
1612 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001613 bytes = _Py_wchar2char(wchar, &error_pos);
1614 if (bytes == NULL) {
1615 if (error_pos != (size_t)-1) {
1616 char *errmsg = strerror(errno);
1617 PyObject *exc = NULL;
1618 if (errmsg == NULL)
1619 errmsg = "Py_wchar2char() failed";
1620 raise_encode_exception(&exc,
1621 "filesystemencoding",
1622 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1623 error_pos, error_pos+1,
1624 errmsg);
1625 Py_XDECREF(exc);
1626 }
1627 else
1628 PyErr_NoMemory();
1629 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001630 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001631 }
1632 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001633
1634 bytes_obj = PyBytes_FromString(bytes);
1635 PyMem_Free(bytes);
1636 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001637 }
Victor Stinnerad158722010-10-27 00:25:46 +00001638#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001639}
1640
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1642 const char *encoding,
1643 const char *errors)
1644{
1645 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001646 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001647
Guido van Rossumd57fd912000-03-10 22:53:23 +00001648 if (!PyUnicode_Check(unicode)) {
1649 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651 }
Fred Drakee4315f52000-05-09 19:53:39 +00001652
Tim Petersced69f82003-09-16 20:30:58 +00001653 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001654 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001655
1656 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001657 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1658 if (strcmp(lower, "utf-8") == 0)
1659 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1660 PyUnicode_GET_SIZE(unicode),
1661 errors);
1662 else if ((strcmp(lower, "latin-1") == 0) ||
1663 (strcmp(lower, "iso-8859-1") == 0))
1664 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1665 PyUnicode_GET_SIZE(unicode),
1666 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001667#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001668 else if (strcmp(lower, "mbcs") == 0)
1669 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1670 PyUnicode_GET_SIZE(unicode),
1671 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001672#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001673 else if (strcmp(lower, "ascii") == 0)
1674 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1675 PyUnicode_GET_SIZE(unicode),
1676 errors);
1677 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001678 /* During bootstrap, we may need to find the encodings
1679 package, to load the file system encoding, and require the
1680 file system encoding in order to load the encodings
1681 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001682
Victor Stinner59e62db2010-05-15 13:14:32 +00001683 Break out of this dependency by assuming that the path to
1684 the encodings module is ASCII-only. XXX could try wcstombs
1685 instead, if the file system encoding is the locale's
1686 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001687 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001688 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1689 !PyThreadState_GET()->interp->codecs_initialized)
1690 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1691 PyUnicode_GET_SIZE(unicode),
1692 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693
1694 /* Encode via the codec registry */
1695 v = PyCodec_Encode(unicode, encoding, errors);
1696 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001697 return NULL;
1698
1699 /* The normal path */
1700 if (PyBytes_Check(v))
1701 return v;
1702
1703 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001704 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001705 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001706 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001707
1708 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1709 "encoder %s returned bytearray instead of bytes",
1710 encoding);
1711 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001712 Py_DECREF(v);
1713 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001714 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001715
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001716 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1717 Py_DECREF(v);
1718 return b;
1719 }
1720
1721 PyErr_Format(PyExc_TypeError,
1722 "encoder did not return a bytes object (type=%.400s)",
1723 Py_TYPE(v)->tp_name);
1724 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001725 return NULL;
1726}
1727
1728PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1729 const char *encoding,
1730 const char *errors)
1731{
1732 PyObject *v;
1733
1734 if (!PyUnicode_Check(unicode)) {
1735 PyErr_BadArgument();
1736 goto onError;
1737 }
1738
1739 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001740 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001741
1742 /* Encode via the codec registry */
1743 v = PyCodec_Encode(unicode, encoding, errors);
1744 if (v == NULL)
1745 goto onError;
1746 if (!PyUnicode_Check(v)) {
1747 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001748 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001749 Py_TYPE(v)->tp_name);
1750 Py_DECREF(v);
1751 goto onError;
1752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001754
Benjamin Peterson29060642009-01-31 22:14:21 +00001755 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 return NULL;
1757}
1758
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001759PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001760 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001761{
1762 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001763 if (v)
1764 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001765 if (errors != NULL)
1766 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001767 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001768 PyUnicode_GET_SIZE(unicode),
1769 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001770 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001771 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001772 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001773 return v;
1774}
1775
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001776PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001777PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001778 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001779 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1780}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001781
Christian Heimes5894ba72007-11-04 11:43:14 +00001782PyObject*
1783PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1784{
Victor Stinnerad158722010-10-27 00:25:46 +00001785#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1786 return PyUnicode_DecodeMBCS(s, size, NULL);
1787#elif defined(__APPLE__)
1788 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1789#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001790 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1791 can be undefined. If it is case, decode using UTF-8. The following assumes
1792 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1793 bootstrapping process where the codecs aren't ready yet.
1794 */
1795 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001796 return PyUnicode_Decode(s, size,
1797 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001798 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001799 }
1800 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001801 /* locale encoding with surrogateescape */
1802 wchar_t *wchar;
1803 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001804 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001805
1806 if (s[size] != '\0' || size != strlen(s)) {
1807 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1808 return NULL;
1809 }
1810
Victor Stinner168e1172010-10-16 23:16:16 +00001811 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001812 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001813 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001814
Victor Stinner168e1172010-10-16 23:16:16 +00001815 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001816 PyMem_Free(wchar);
1817 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001818 }
Victor Stinnerad158722010-10-27 00:25:46 +00001819#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001820}
1821
Martin v. Löwis011e8422009-05-05 04:43:17 +00001822
1823int
1824PyUnicode_FSConverter(PyObject* arg, void* addr)
1825{
1826 PyObject *output = NULL;
1827 Py_ssize_t size;
1828 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001829 if (arg == NULL) {
1830 Py_DECREF(*(PyObject**)addr);
1831 return 1;
1832 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001833 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001834 output = arg;
1835 Py_INCREF(output);
1836 }
1837 else {
1838 arg = PyUnicode_FromObject(arg);
1839 if (!arg)
1840 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001841 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001842 Py_DECREF(arg);
1843 if (!output)
1844 return 0;
1845 if (!PyBytes_Check(output)) {
1846 Py_DECREF(output);
1847 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1848 return 0;
1849 }
1850 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001851 size = PyBytes_GET_SIZE(output);
1852 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001853 if (size != strlen(data)) {
1854 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1855 Py_DECREF(output);
1856 return 0;
1857 }
1858 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001859 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001860}
1861
1862
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001863int
1864PyUnicode_FSDecoder(PyObject* arg, void* addr)
1865{
1866 PyObject *output = NULL;
1867 Py_ssize_t size;
1868 void *data;
1869 if (arg == NULL) {
1870 Py_DECREF(*(PyObject**)addr);
1871 return 1;
1872 }
1873 if (PyUnicode_Check(arg)) {
1874 output = arg;
1875 Py_INCREF(output);
1876 }
1877 else {
1878 arg = PyBytes_FromObject(arg);
1879 if (!arg)
1880 return 0;
1881 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1882 PyBytes_GET_SIZE(arg));
1883 Py_DECREF(arg);
1884 if (!output)
1885 return 0;
1886 if (!PyUnicode_Check(output)) {
1887 Py_DECREF(output);
1888 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1889 return 0;
1890 }
1891 }
1892 size = PyUnicode_GET_SIZE(output);
1893 data = PyUnicode_AS_UNICODE(output);
1894 if (size != Py_UNICODE_strlen(data)) {
1895 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1896 Py_DECREF(output);
1897 return 0;
1898 }
1899 *(PyObject**)addr = output;
1900 return Py_CLEANUP_SUPPORTED;
1901}
1902
1903
Martin v. Löwis5b222132007-06-10 09:51:05 +00001904char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001905_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001906{
Christian Heimesf3863112007-11-22 07:46:41 +00001907 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001908 if (!PyUnicode_Check(unicode)) {
1909 PyErr_BadArgument();
1910 return NULL;
1911 }
Christian Heimesf3863112007-11-22 07:46:41 +00001912 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1913 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001914 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001915 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001916 *psize = PyBytes_GET_SIZE(bytes);
1917 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001918}
1919
1920char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001921_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001922{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001923 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001924}
1925
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1927{
1928 if (!PyUnicode_Check(unicode)) {
1929 PyErr_BadArgument();
1930 goto onError;
1931 }
1932 return PyUnicode_AS_UNICODE(unicode);
1933
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001935 return NULL;
1936}
1937
Martin v. Löwis18e16552006-02-15 17:27:45 +00001938Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939{
1940 if (!PyUnicode_Check(unicode)) {
1941 PyErr_BadArgument();
1942 goto onError;
1943 }
1944 return PyUnicode_GET_SIZE(unicode);
1945
Benjamin Peterson29060642009-01-31 22:14:21 +00001946 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 return -1;
1948}
1949
/* Return the name of the default encoding.  The value is a fixed string,
   "utf-8"; the caller must not modify or free it. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1954
Victor Stinner554f3f02010-06-16 23:33:54 +00001955/* create or adjust a UnicodeDecodeError */
1956static void
1957make_decode_exception(PyObject **exceptionObject,
1958 const char *encoding,
1959 const char *input, Py_ssize_t length,
1960 Py_ssize_t startpos, Py_ssize_t endpos,
1961 const char *reason)
1962{
1963 if (*exceptionObject == NULL) {
1964 *exceptionObject = PyUnicodeDecodeError_Create(
1965 encoding, input, length, startpos, endpos, reason);
1966 }
1967 else {
1968 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1969 goto onError;
1970 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1971 goto onError;
1972 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1973 goto onError;
1974 }
1975 return;
1976
1977onError:
1978 Py_DECREF(*exceptionObject);
1979 *exceptionObject = NULL;
1980}
1981
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001982/* error handling callback helper:
1983 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001984 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001985 and adjust various state variables.
1986 return 0 on success, -1 on error
1987*/
1988
1989static
1990int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001991 const char *encoding, const char *reason,
1992 const char **input, const char **inend, Py_ssize_t *startinpos,
1993 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1994 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001995{
Benjamin Peterson142957c2008-07-04 19:55:29 +00001996 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001997
1998 PyObject *restuple = NULL;
1999 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002000 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002001 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002002 Py_ssize_t requiredsize;
2003 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002004 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002005 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002006 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002007 int res = -1;
2008
2009 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002010 *errorHandler = PyCodec_LookupError(errors);
2011 if (*errorHandler == NULL)
2012 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013 }
2014
Victor Stinner554f3f02010-06-16 23:33:54 +00002015 make_decode_exception(exceptionObject,
2016 encoding,
2017 *input, *inend - *input,
2018 *startinpos, *endinpos,
2019 reason);
2020 if (*exceptionObject == NULL)
2021 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002022
2023 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2024 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002025 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002027 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002028 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 }
2030 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002032
2033 /* Copy back the bytes variables, which might have been modified by the
2034 callback */
2035 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2036 if (!inputobj)
2037 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002038 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002039 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002040 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002041 *input = PyBytes_AS_STRING(inputobj);
2042 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002043 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002044 /* we can DECREF safely, as the exception has another reference,
2045 so the object won't go away. */
2046 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002049 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002050 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002051 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2052 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002053 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054
2055 /* need more space? (at least enough for what we
2056 have+the replacement+the rest of the string (starting
2057 at the new input position), so we won't have to check space
2058 when there are no errors in the rest of the string) */
2059 repptr = PyUnicode_AS_UNICODE(repunicode);
2060 repsize = PyUnicode_GET_SIZE(repunicode);
2061 requiredsize = *outpos + repsize + insize-newpos;
2062 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002063 if (requiredsize<2*outsize)
2064 requiredsize = 2*outsize;
2065 if (_PyUnicode_Resize(output, requiredsize) < 0)
2066 goto onError;
2067 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002068 }
2069 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002070 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071 Py_UNICODE_COPY(*outptr, repptr, repsize);
2072 *outptr += repsize;
2073 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 /* we made it! */
2076 res = 0;
2077
Benjamin Peterson29060642009-01-31 22:14:21 +00002078 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 Py_XDECREF(restuple);
2080 return res;
2081}
2082
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002083/* --- UTF-7 Codec -------------------------------------------------------- */
2084
Antoine Pitrou244651a2009-05-04 18:56:13 +00002085/* See RFC2152 for details. We encode conservatively and decode liberally. */
2086
/* Three simple macros defining base-64, plus the direct-decoding test.
   Each macro may evaluate its argument more than once, so only pass
   expressions without side effects. */

/* Is c a base-64 character? */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (anything not matched by the first four tests maps to 63) */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */
#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
2117
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is never written to,
 * so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be free of side effects (it is evaluated several times).  The
 * range test comes first so the table is only indexed for 1..127.
 * directO and directWS are parenthesized so that expression arguments
 * (e.g. `a || b`) are evaluated as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002163
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002164PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002165 Py_ssize_t size,
2166 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002167{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002168 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2169}
2170
Antoine Pitrou244651a2009-05-04 18:56:13 +00002171/* The decoder. The only state we preserve is our read position,
2172 * i.e. how many characters we have consumed. So if we end in the
2173 * middle of a shift sequence we have to back off the read position
2174 * and the output to the beginning of the sequence, otherwise we lose
2175 * all the shift state (seen bits, number of bits seen, high
2176 * surrogate). */
2177
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002178PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002179 Py_ssize_t size,
2180 const char *errors,
2181 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002182{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002184 Py_ssize_t startinpos;
2185 Py_ssize_t endinpos;
2186 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002187 const char *e;
2188 PyUnicodeObject *unicode;
2189 Py_UNICODE *p;
2190 const char *errmsg = "";
2191 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002192 Py_UNICODE *shiftOutStart;
2193 unsigned int base64bits = 0;
2194 unsigned long base64buffer = 0;
2195 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002196 PyObject *errorHandler = NULL;
2197 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002198
2199 unicode = _PyUnicode_New(size);
2200 if (!unicode)
2201 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002202 if (size == 0) {
2203 if (consumed)
2204 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002205 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002206 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002207
2208 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002209 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002210 e = s + size;
2211
2212 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002213 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002214 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002215 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002216
Antoine Pitrou244651a2009-05-04 18:56:13 +00002217 if (inShift) { /* in a base-64 section */
2218 if (IS_BASE64(ch)) { /* consume a base-64 character */
2219 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2220 base64bits += 6;
2221 s++;
2222 if (base64bits >= 16) {
2223 /* we have enough bits for a UTF-16 value */
2224 Py_UNICODE outCh = (Py_UNICODE)
2225 (base64buffer >> (base64bits-16));
2226 base64bits -= 16;
2227 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2228 if (surrogate) {
2229 /* expecting a second surrogate */
2230 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2231#ifdef Py_UNICODE_WIDE
2232 *p++ = (((surrogate & 0x3FF)<<10)
2233 | (outCh & 0x3FF)) + 0x10000;
2234#else
2235 *p++ = surrogate;
2236 *p++ = outCh;
2237#endif
2238 surrogate = 0;
2239 }
2240 else {
2241 surrogate = 0;
2242 errmsg = "second surrogate missing";
2243 goto utf7Error;
2244 }
2245 }
2246 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2247 /* first surrogate */
2248 surrogate = outCh;
2249 }
2250 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2251 errmsg = "unexpected second surrogate";
2252 goto utf7Error;
2253 }
2254 else {
2255 *p++ = outCh;
2256 }
2257 }
2258 }
2259 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002260 inShift = 0;
2261 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002262 if (surrogate) {
2263 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002264 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002265 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002266 if (base64bits > 0) { /* left-over bits */
2267 if (base64bits >= 6) {
2268 /* We've seen at least one base-64 character */
2269 errmsg = "partial character in shift sequence";
2270 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002271 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002272 else {
2273 /* Some bits remain; they should be zero */
2274 if (base64buffer != 0) {
2275 errmsg = "non-zero padding bits in shift sequence";
2276 goto utf7Error;
2277 }
2278 }
2279 }
2280 if (ch != '-') {
2281 /* '-' is absorbed; other terminating
2282 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002283 *p++ = ch;
2284 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002285 }
2286 }
2287 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002288 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002289 s++; /* consume '+' */
2290 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002291 s++;
2292 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 }
2294 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002295 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002296 shiftOutStart = p;
2297 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002298 }
2299 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002300 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002301 *p++ = ch;
2302 s++;
2303 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002304 else {
2305 startinpos = s-starts;
2306 s++;
2307 errmsg = "unexpected special character";
2308 goto utf7Error;
2309 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002310 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002311utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 outpos = p-PyUnicode_AS_UNICODE(unicode);
2313 endinpos = s-starts;
2314 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002315 errors, &errorHandler,
2316 "utf7", errmsg,
2317 &starts, &e, &startinpos, &endinpos, &exc, &s,
2318 &unicode, &outpos, &p))
2319 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002320 }
2321
Antoine Pitrou244651a2009-05-04 18:56:13 +00002322 /* end of string */
2323
2324 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2325 /* if we're in an inconsistent state, that's an error */
2326 if (surrogate ||
2327 (base64bits >= 6) ||
2328 (base64bits > 0 && base64buffer != 0)) {
2329 outpos = p-PyUnicode_AS_UNICODE(unicode);
2330 endinpos = size;
2331 if (unicode_decode_call_errorhandler(
2332 errors, &errorHandler,
2333 "utf7", "unterminated shift sequence",
2334 &starts, &e, &startinpos, &endinpos, &exc, &s,
2335 &unicode, &outpos, &p))
2336 goto onError;
2337 if (s < e)
2338 goto restart;
2339 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002340 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002341
2342 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002343 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002344 if (inShift) {
2345 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002346 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002347 }
2348 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002349 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002350 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002351 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002352
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002353 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002354 goto onError;
2355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002358 return (PyObject *)unicode;
2359
Benjamin Peterson29060642009-01-31 22:14:21 +00002360 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002361 Py_XDECREF(errorHandler);
2362 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002363 Py_DECREF(unicode);
2364 return NULL;
2365}
2366
2367
2368PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002369 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002370 int base64SetO,
2371 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002372 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002373{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002374 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002375 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002376 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002377 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002378 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002379 unsigned int base64bits = 0;
2380 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002381 char * out;
2382 char * start;
2383
2384 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002385 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002386
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002387 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002388 return PyErr_NoMemory();
2389
Antoine Pitrou244651a2009-05-04 18:56:13 +00002390 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002391 if (v == NULL)
2392 return NULL;
2393
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002394 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002395 for (;i < size; ++i) {
2396 Py_UNICODE ch = s[i];
2397
Antoine Pitrou244651a2009-05-04 18:56:13 +00002398 if (inShift) {
2399 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2400 /* shifting out */
2401 if (base64bits) { /* output remaining bits */
2402 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2403 base64buffer = 0;
2404 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002405 }
2406 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002407 /* Characters not in the BASE64 set implicitly unshift the sequence
2408 so no '-' is required, except if the character is itself a '-' */
2409 if (IS_BASE64(ch) || ch == '-') {
2410 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002411 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002412 *out++ = (char) ch;
2413 }
2414 else {
2415 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002416 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002418 else { /* not in a shift sequence */
2419 if (ch == '+') {
2420 *out++ = '+';
2421 *out++ = '-';
2422 }
2423 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2424 *out++ = (char) ch;
2425 }
2426 else {
2427 *out++ = '+';
2428 inShift = 1;
2429 goto encode_char;
2430 }
2431 }
2432 continue;
2433encode_char:
2434#ifdef Py_UNICODE_WIDE
2435 if (ch >= 0x10000) {
2436 /* code first surrogate */
2437 base64bits += 16;
2438 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2439 while (base64bits >= 6) {
2440 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2441 base64bits -= 6;
2442 }
2443 /* prepare second surrogate */
2444 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2445 }
2446#endif
2447 base64bits += 16;
2448 base64buffer = (base64buffer << 16) | ch;
2449 while (base64bits >= 6) {
2450 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2451 base64bits -= 6;
2452 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002453 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002454 if (base64bits)
2455 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2456 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002457 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002458 if (_PyBytes_Resize(&v, out - start) < 0)
2459 return NULL;
2460 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002461}
2462
Antoine Pitrou244651a2009-05-04 18:56:13 +00002463#undef IS_BASE64
2464#undef FROM_BASE64
2465#undef TO_BASE64
2466#undef DECODE_DIRECT
2467#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002468
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469/* --- UTF-8 Codec -------------------------------------------------------- */
2470
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static const
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2492
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 Py_ssize_t size,
2495 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496{
Walter Dörwald69652032004-09-07 20:24:22 +00002497 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2498}
2499
Antoine Pitrouab868312009-01-10 15:40:25 +00002500/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2501#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2502
2503/* Mask to quickly check whether a C 'long' contains a
2504 non-ASCII, UTF8-encoded char. */
2505#if (SIZEOF_LONG == 8)
2506# define ASCII_CHAR_MASK 0x8080808080808080L
2507#elif (SIZEOF_LONG == 4)
2508# define ASCII_CHAR_MASK 0x80808080L
2509#else
2510# error C 'long' size should be either 4 or 8!
2511#endif
2512
Walter Dörwald69652032004-09-07 20:24:22 +00002513PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002514 Py_ssize_t size,
2515 const char *errors,
2516 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002517{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002518 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002520 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002521 Py_ssize_t startinpos;
2522 Py_ssize_t endinpos;
2523 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002524 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 PyUnicodeObject *unicode;
2526 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002527 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002528 PyObject *errorHandler = NULL;
2529 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530
2531 /* Note: size will always be longer than the resulting Unicode
2532 character count */
2533 unicode = _PyUnicode_New(size);
2534 if (!unicode)
2535 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002536 if (size == 0) {
2537 if (consumed)
2538 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002539 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541
2542 /* Unpack UTF-8 encoded data */
2543 p = unicode->str;
2544 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002545 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546
2547 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002548 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549
2550 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002551 /* Fast path for runs of ASCII characters. Given that common UTF-8
2552 input will consist of an overwhelming majority of ASCII
2553 characters, we try to optimize for this case by checking
2554 as many characters as a C 'long' can contain.
2555 First, check if we can do an aligned read, as most CPUs have
2556 a penalty for unaligned reads.
2557 */
2558 if (!((size_t) s & LONG_PTR_MASK)) {
2559 /* Help register allocation */
2560 register const char *_s = s;
2561 register Py_UNICODE *_p = p;
2562 while (_s < aligned_end) {
2563 /* Read a whole long at a time (either 4 or 8 bytes),
2564 and do a fast unrolled copy if it only contains ASCII
2565 characters. */
2566 unsigned long data = *(unsigned long *) _s;
2567 if (data & ASCII_CHAR_MASK)
2568 break;
2569 _p[0] = (unsigned char) _s[0];
2570 _p[1] = (unsigned char) _s[1];
2571 _p[2] = (unsigned char) _s[2];
2572 _p[3] = (unsigned char) _s[3];
2573#if (SIZEOF_LONG == 8)
2574 _p[4] = (unsigned char) _s[4];
2575 _p[5] = (unsigned char) _s[5];
2576 _p[6] = (unsigned char) _s[6];
2577 _p[7] = (unsigned char) _s[7];
2578#endif
2579 _s += SIZEOF_LONG;
2580 _p += SIZEOF_LONG;
2581 }
2582 s = _s;
2583 p = _p;
2584 if (s == e)
2585 break;
2586 ch = (unsigned char)*s;
2587 }
2588 }
2589
2590 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002591 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 s++;
2593 continue;
2594 }
2595
2596 n = utf8_code_length[ch];
2597
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002598 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002599 if (consumed)
2600 break;
2601 else {
2602 errmsg = "unexpected end of data";
2603 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002604 endinpos = startinpos+1;
2605 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2606 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 goto utf8Error;
2608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 switch (n) {
2612
2613 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002614 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002615 startinpos = s-starts;
2616 endinpos = startinpos+1;
2617 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618
2619 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002620 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002621 startinpos = s-starts;
2622 endinpos = startinpos+1;
2623 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624
2625 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002626 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002627 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002628 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002629 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002630 goto utf8Error;
2631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002633 assert ((ch > 0x007F) && (ch <= 0x07FF));
2634 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 break;
2636
2637 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002638 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2639 will result in surrogates in range d800-dfff. Surrogates are
2640 not valid UTF-8 so they are rejected.
2641 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2642 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002643 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002644 (s[2] & 0xc0) != 0x80 ||
2645 ((unsigned char)s[0] == 0xE0 &&
2646 (unsigned char)s[1] < 0xA0) ||
2647 ((unsigned char)s[0] == 0xED &&
2648 (unsigned char)s[1] > 0x9F)) {
2649 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002650 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002651 endinpos = startinpos + 1;
2652
2653 /* if s[1] first two bits are 1 and 0, then the invalid
2654 continuation byte is s[2], so increment endinpos by 1,
2655 if not, s[1] is invalid and endinpos doesn't need to
2656 be incremented. */
2657 if ((s[1] & 0xC0) == 0x80)
2658 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002659 goto utf8Error;
2660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002662 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2663 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002664 break;
2665
2666 case 4:
2667 if ((s[1] & 0xc0) != 0x80 ||
2668 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002669 (s[3] & 0xc0) != 0x80 ||
2670 ((unsigned char)s[0] == 0xF0 &&
2671 (unsigned char)s[1] < 0x90) ||
2672 ((unsigned char)s[0] == 0xF4 &&
2673 (unsigned char)s[1] > 0x8F)) {
2674 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002675 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002676 endinpos = startinpos + 1;
2677 if ((s[1] & 0xC0) == 0x80) {
2678 endinpos++;
2679 if ((s[2] & 0xC0) == 0x80)
2680 endinpos++;
2681 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 goto utf8Error;
2683 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002684 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002685 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2686 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2687
Fredrik Lundh8f455852001-06-27 18:59:43 +00002688#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002690#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002691 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002692
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002693 /* translate from 10000..10FFFF to 0..FFFF */
2694 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002695
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002696 /* high surrogate = top 10 bits added to D800 */
2697 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002698
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002699 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002700 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002701#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 }
2704 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002705 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002706
Benjamin Peterson29060642009-01-31 22:14:21 +00002707 utf8Error:
2708 outpos = p-PyUnicode_AS_UNICODE(unicode);
2709 if (unicode_decode_call_errorhandler(
2710 errors, &errorHandler,
2711 "utf8", errmsg,
2712 &starts, &e, &startinpos, &endinpos, &exc, &s,
2713 &unicode, &outpos, &p))
2714 goto onError;
2715 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 }
Walter Dörwald69652032004-09-07 20:24:22 +00002717 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002718 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719
2720 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002721 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722 goto onError;
2723
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002724 Py_XDECREF(errorHandler);
2725 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 return (PyObject *)unicode;
2727
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 Py_DECREF(unicode);
2732 return NULL;
2733}
2734
Antoine Pitrouab868312009-01-10 15:40:25 +00002735#undef ASCII_CHAR_MASK
2736
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a buffer obtained from PyMem_Malloc()
   and terminates it with L'\0'.  A byte that cannot be decoded is stored as
   0xDC00 + byte and decoding resumes at the following byte.

   Returns the buffer, or NULL if the size computation would overflow
   (PyErr_NoMemory() is called) or if the allocation fails (no exception is
   set on that path).
   NOTE(review): the buffer presumably has to be released by the caller with
   PyMem_Free() -- confirm against the call sites. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* Compare lengths rather than forming s + n, which could point
           more than one element past the end of the input buffer. */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the output buffer holds wchar_t, so cast to that type */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* Every jump here happens before ch is reassigned, so ch still
           holds the offending lead byte. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002851
Tim Peters602f7402002-04-27 18:03:26 +00002852/* Allocation strategy: if the string is short, convert into a stack buffer
2853 and allocate exactly as much space needed at the end. Else allocate the
2854 maximum possible needed (4 result bytes per Unicode character), and return
2855 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002856*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002857PyObject *
2858PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 Py_ssize_t size,
2860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861{
Tim Peters602f7402002-04-27 18:03:26 +00002862#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002863
Guido van Rossum98297ee2007-11-06 21:34:58 +00002864 Py_ssize_t i; /* index into s of next input byte */
2865 PyObject *result; /* result string object */
2866 char *p; /* next free byte in output buffer */
2867 Py_ssize_t nallocated; /* number of result bytes allocated */
2868 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002869 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002870 PyObject *errorHandler = NULL;
2871 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002872
Tim Peters602f7402002-04-27 18:03:26 +00002873 assert(s != NULL);
2874 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875
Tim Peters602f7402002-04-27 18:03:26 +00002876 if (size <= MAX_SHORT_UNICHARS) {
2877 /* Write into the stack buffer; nallocated can't overflow.
2878 * At the end, we'll allocate exactly as much heap space as it
2879 * turns out we need.
2880 */
2881 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002882 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002883 p = stackbuf;
2884 }
2885 else {
2886 /* Overallocate on the heap, and give the excess back at the end. */
2887 nallocated = size * 4;
2888 if (nallocated / 4 != size) /* overflow! */
2889 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002890 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002891 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002892 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002893 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002894 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002895
Tim Peters602f7402002-04-27 18:03:26 +00002896 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002897 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002898
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002899 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002900 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002902
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002904 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002905 *p++ = (char)(0xc0 | (ch >> 6));
2906 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002907 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002908#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002909 /* Special case: check for high and low surrogate */
2910 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2911 Py_UCS4 ch2 = s[i];
2912 /* Combine the two surrogates to form a UCS4 value */
2913 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2914 i++;
2915
2916 /* Encode UCS4 Unicode ordinals */
2917 *p++ = (char)(0xf0 | (ch >> 18));
2918 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002919 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2920 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002921 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002922#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002923 Py_ssize_t newpos;
2924 PyObject *rep;
2925 Py_ssize_t repsize, k;
2926 rep = unicode_encode_call_errorhandler
2927 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2928 s, size, &exc, i-1, i, &newpos);
2929 if (!rep)
2930 goto error;
2931
2932 if (PyBytes_Check(rep))
2933 repsize = PyBytes_GET_SIZE(rep);
2934 else
2935 repsize = PyUnicode_GET_SIZE(rep);
2936
2937 if (repsize > 4) {
2938 Py_ssize_t offset;
2939
2940 if (result == NULL)
2941 offset = p - stackbuf;
2942 else
2943 offset = p - PyBytes_AS_STRING(result);
2944
2945 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2946 /* integer overflow */
2947 PyErr_NoMemory();
2948 goto error;
2949 }
2950 nallocated += repsize - 4;
2951 if (result != NULL) {
2952 if (_PyBytes_Resize(&result, nallocated) < 0)
2953 goto error;
2954 } else {
2955 result = PyBytes_FromStringAndSize(NULL, nallocated);
2956 if (result == NULL)
2957 goto error;
2958 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2959 }
2960 p = PyBytes_AS_STRING(result) + offset;
2961 }
2962
2963 if (PyBytes_Check(rep)) {
2964 char *prep = PyBytes_AS_STRING(rep);
2965 for(k = repsize; k > 0; k--)
2966 *p++ = *prep++;
2967 } else /* rep is unicode */ {
2968 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2969 Py_UNICODE c;
2970
2971 for(k=0; k<repsize; k++) {
2972 c = prep[k];
2973 if (0x80 <= c) {
2974 raise_encode_exception(&exc, "utf-8", s, size,
2975 i-1, i, "surrogates not allowed");
2976 goto error;
2977 }
2978 *p++ = (char)prep[k];
2979 }
2980 }
2981 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002982#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002983 }
Victor Stinner445a6232010-04-22 20:01:57 +00002984#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002985 } else if (ch < 0x10000) {
2986 *p++ = (char)(0xe0 | (ch >> 12));
2987 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2988 *p++ = (char)(0x80 | (ch & 0x3f));
2989 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002990 /* Encode UCS4 Unicode ordinals */
2991 *p++ = (char)(0xf0 | (ch >> 18));
2992 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2993 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2994 *p++ = (char)(0x80 | (ch & 0x3f));
2995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002997
Guido van Rossum98297ee2007-11-06 21:34:58 +00002998 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00002999 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003000 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003001 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003002 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003003 }
3004 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003005 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003006 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003007 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003008 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003009 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003010 Py_XDECREF(errorHandler);
3011 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003012 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003013 error:
3014 Py_XDECREF(errorHandler);
3015 Py_XDECREF(exc);
3016 Py_XDECREF(result);
3017 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003018
Tim Peters602f7402002-04-27 18:03:26 +00003019#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020}
3021
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3023{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 if (!PyUnicode_Check(unicode)) {
3025 PyErr_BadArgument();
3026 return NULL;
3027 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003028 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 PyUnicode_GET_SIZE(unicode),
3030 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031}
3032
Walter Dörwald41980ca2007-08-16 21:55:45 +00003033/* --- UTF-32 Codec ------------------------------------------------------- */
3034
3035PyObject *
3036PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 Py_ssize_t size,
3038 const char *errors,
3039 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003040{
3041 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3042}
3043
3044PyObject *
3045PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 Py_ssize_t size,
3047 const char *errors,
3048 int *byteorder,
3049 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003050{
3051 const char *starts = s;
3052 Py_ssize_t startinpos;
3053 Py_ssize_t endinpos;
3054 Py_ssize_t outpos;
3055 PyUnicodeObject *unicode;
3056 Py_UNICODE *p;
3057#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003058 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003059 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003060#else
3061 const int pairs = 0;
3062#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003063 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003064 int bo = 0; /* assume native ordering by default */
3065 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003066 /* Offsets from q for retrieving bytes in the right order. */
3067#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3068 int iorder[] = {0, 1, 2, 3};
3069#else
3070 int iorder[] = {3, 2, 1, 0};
3071#endif
3072 PyObject *errorHandler = NULL;
3073 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003074
Walter Dörwald41980ca2007-08-16 21:55:45 +00003075 q = (unsigned char *)s;
3076 e = q + size;
3077
3078 if (byteorder)
3079 bo = *byteorder;
3080
3081 /* Check for BOM marks (U+FEFF) in the input and adjust current
3082 byte order setting accordingly. In native mode, the leading BOM
3083 mark is skipped, in all other modes, it is copied to the output
3084 stream as-is (giving a ZWNBSP character). */
3085 if (bo == 0) {
3086 if (size >= 4) {
3087 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003089#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003090 if (bom == 0x0000FEFF) {
3091 q += 4;
3092 bo = -1;
3093 }
3094 else if (bom == 0xFFFE0000) {
3095 q += 4;
3096 bo = 1;
3097 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003098#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003099 if (bom == 0x0000FEFF) {
3100 q += 4;
3101 bo = 1;
3102 }
3103 else if (bom == 0xFFFE0000) {
3104 q += 4;
3105 bo = -1;
3106 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003107#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003108 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003109 }
3110
3111 if (bo == -1) {
3112 /* force LE */
3113 iorder[0] = 0;
3114 iorder[1] = 1;
3115 iorder[2] = 2;
3116 iorder[3] = 3;
3117 }
3118 else if (bo == 1) {
3119 /* force BE */
3120 iorder[0] = 3;
3121 iorder[1] = 2;
3122 iorder[2] = 1;
3123 iorder[3] = 0;
3124 }
3125
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003126 /* On narrow builds we split characters outside the BMP into two
3127 codepoints => count how much extra space we need. */
3128#ifndef Py_UNICODE_WIDE
3129 for (qq = q; qq < e; qq += 4)
3130 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3131 pairs++;
3132#endif
3133
3134 /* This might be one to much, because of a BOM */
3135 unicode = _PyUnicode_New((size+3)/4+pairs);
3136 if (!unicode)
3137 return NULL;
3138 if (size == 0)
3139 return (PyObject *)unicode;
3140
3141 /* Unpack UTF-32 encoded data */
3142 p = unicode->str;
3143
Walter Dörwald41980ca2007-08-16 21:55:45 +00003144 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 Py_UCS4 ch;
3146 /* remaining bytes at the end? (size should be divisible by 4) */
3147 if (e-q<4) {
3148 if (consumed)
3149 break;
3150 errmsg = "truncated data";
3151 startinpos = ((const char *)q)-starts;
3152 endinpos = ((const char *)e)-starts;
3153 goto utf32Error;
3154 /* The remaining input chars are ignored if the callback
3155 chooses to skip the input */
3156 }
3157 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3158 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003159
Benjamin Peterson29060642009-01-31 22:14:21 +00003160 if (ch >= 0x110000)
3161 {
3162 errmsg = "codepoint not in range(0x110000)";
3163 startinpos = ((const char *)q)-starts;
3164 endinpos = startinpos+4;
3165 goto utf32Error;
3166 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003167#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 if (ch >= 0x10000)
3169 {
3170 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3171 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3172 }
3173 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003174#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003175 *p++ = ch;
3176 q += 4;
3177 continue;
3178 utf32Error:
3179 outpos = p-PyUnicode_AS_UNICODE(unicode);
3180 if (unicode_decode_call_errorhandler(
3181 errors, &errorHandler,
3182 "utf32", errmsg,
3183 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3184 &unicode, &outpos, &p))
3185 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003186 }
3187
3188 if (byteorder)
3189 *byteorder = bo;
3190
3191 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003192 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003193
3194 /* Adjust length */
3195 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3196 goto onError;
3197
3198 Py_XDECREF(errorHandler);
3199 Py_XDECREF(exc);
3200 return (PyObject *)unicode;
3201
Benjamin Peterson29060642009-01-31 22:14:21 +00003202 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003203 Py_DECREF(unicode);
3204 Py_XDECREF(errorHandler);
3205 Py_XDECREF(exc);
3206 return NULL;
3207}
3208
3209PyObject *
3210PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003211 Py_ssize_t size,
3212 const char *errors,
3213 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003214{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003215 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003216 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003217 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003218#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003219 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003220#else
3221 const int pairs = 0;
3222#endif
3223 /* Offsets from p for storing byte pairs in the right order. */
3224#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3225 int iorder[] = {0, 1, 2, 3};
3226#else
3227 int iorder[] = {3, 2, 1, 0};
3228#endif
3229
Benjamin Peterson29060642009-01-31 22:14:21 +00003230#define STORECHAR(CH) \
3231 do { \
3232 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3233 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3234 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3235 p[iorder[0]] = (CH) & 0xff; \
3236 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003237 } while(0)
3238
3239 /* In narrow builds we can output surrogate pairs as one codepoint,
3240 so we need less space. */
3241#ifndef Py_UNICODE_WIDE
3242 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3244 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3245 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003246#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003247 nsize = (size - pairs + (byteorder == 0));
3248 bytesize = nsize * 4;
3249 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003251 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003252 if (v == NULL)
3253 return NULL;
3254
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003255 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003256 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003258 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003259 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003260
3261 if (byteorder == -1) {
3262 /* force LE */
3263 iorder[0] = 0;
3264 iorder[1] = 1;
3265 iorder[2] = 2;
3266 iorder[3] = 3;
3267 }
3268 else if (byteorder == 1) {
3269 /* force BE */
3270 iorder[0] = 3;
3271 iorder[1] = 2;
3272 iorder[2] = 1;
3273 iorder[3] = 0;
3274 }
3275
3276 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003277 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003278#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003279 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3280 Py_UCS4 ch2 = *s;
3281 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3282 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3283 s++;
3284 size--;
3285 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003286 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003287#endif
3288 STORECHAR(ch);
3289 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003290
3291 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003292 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003293#undef STORECHAR
3294}
3295
3296PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3297{
3298 if (!PyUnicode_Check(unicode)) {
3299 PyErr_BadArgument();
3300 return NULL;
3301 }
3302 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 PyUnicode_GET_SIZE(unicode),
3304 NULL,
3305 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003306}
3307
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308/* --- UTF-16 Codec ------------------------------------------------------- */
3309
Tim Peters772747b2001-08-09 22:21:55 +00003310PyObject *
3311PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 Py_ssize_t size,
3313 const char *errors,
3314 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003315{
Walter Dörwald69652032004-09-07 20:24:22 +00003316 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3317}
3318
Antoine Pitrouab868312009-01-10 15:40:25 +00003319/* Two masks for fast checking of whether a C 'long' may contain
3320 UTF16-encoded surrogate characters. This is an efficient heuristic,
3321 assuming that non-surrogate characters with a code point >= 0x8000 are
3322 rare in most input.
3323 FAST_CHAR_MASK is used when the input is in native byte ordering,
3324 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003325*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003326#if (SIZEOF_LONG == 8)
3327# define FAST_CHAR_MASK 0x8000800080008000L
3328# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3329#elif (SIZEOF_LONG == 4)
3330# define FAST_CHAR_MASK 0x80008000L
3331# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3332#else
3333# error C 'long' size should be either 4 or 8!
3334#endif
3335
Walter Dörwald69652032004-09-07 20:24:22 +00003336PyObject *
3337PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 Py_ssize_t size,
3339 const char *errors,
3340 int *byteorder,
3341 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003342{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003344 Py_ssize_t startinpos;
3345 Py_ssize_t endinpos;
3346 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 PyUnicodeObject *unicode;
3348 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003349 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003350 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003351 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003352 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003353 /* Offsets from q for retrieving byte pairs in the right order. */
3354#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3355 int ihi = 1, ilo = 0;
3356#else
3357 int ihi = 0, ilo = 1;
3358#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 PyObject *errorHandler = NULL;
3360 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361
3362 /* Note: size will always be longer than the resulting Unicode
3363 character count */
3364 unicode = _PyUnicode_New(size);
3365 if (!unicode)
3366 return NULL;
3367 if (size == 0)
3368 return (PyObject *)unicode;
3369
3370 /* Unpack UTF-16 encoded data */
3371 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003372 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003373 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374
3375 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003376 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003378 /* Check for BOM marks (U+FEFF) in the input and adjust current
3379 byte order setting accordingly. In native mode, the leading BOM
3380 mark is skipped, in all other modes, it is copied to the output
3381 stream as-is (giving a ZWNBSP character). */
3382 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003383 if (size >= 2) {
3384 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003385#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003386 if (bom == 0xFEFF) {
3387 q += 2;
3388 bo = -1;
3389 }
3390 else if (bom == 0xFFFE) {
3391 q += 2;
3392 bo = 1;
3393 }
Tim Petersced69f82003-09-16 20:30:58 +00003394#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 if (bom == 0xFEFF) {
3396 q += 2;
3397 bo = 1;
3398 }
3399 else if (bom == 0xFFFE) {
3400 q += 2;
3401 bo = -1;
3402 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003403#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003404 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003406
Tim Peters772747b2001-08-09 22:21:55 +00003407 if (bo == -1) {
3408 /* force LE */
3409 ihi = 1;
3410 ilo = 0;
3411 }
3412 else if (bo == 1) {
3413 /* force BE */
3414 ihi = 0;
3415 ilo = 1;
3416 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003417#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3418 native_ordering = ilo < ihi;
3419#else
3420 native_ordering = ilo > ihi;
3421#endif
Tim Peters772747b2001-08-09 22:21:55 +00003422
Antoine Pitrouab868312009-01-10 15:40:25 +00003423 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003424 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003426 /* First check for possible aligned read of a C 'long'. Unaligned
3427 reads are more expensive, better to defer to another iteration. */
3428 if (!((size_t) q & LONG_PTR_MASK)) {
3429 /* Fast path for runs of non-surrogate chars. */
3430 register const unsigned char *_q = q;
3431 Py_UNICODE *_p = p;
3432 if (native_ordering) {
3433 /* Native ordering is simple: as long as the input cannot
3434 possibly contain a surrogate char, do an unrolled copy
3435 of several 16-bit code points to the target object.
3436 The non-surrogate check is done on several input bytes
3437 at a time (as many as a C 'long' can contain). */
3438 while (_q < aligned_end) {
3439 unsigned long data = * (unsigned long *) _q;
3440 if (data & FAST_CHAR_MASK)
3441 break;
3442 _p[0] = ((unsigned short *) _q)[0];
3443 _p[1] = ((unsigned short *) _q)[1];
3444#if (SIZEOF_LONG == 8)
3445 _p[2] = ((unsigned short *) _q)[2];
3446 _p[3] = ((unsigned short *) _q)[3];
3447#endif
3448 _q += SIZEOF_LONG;
3449 _p += SIZEOF_LONG / 2;
3450 }
3451 }
3452 else {
3453 /* Byteswapped ordering is similar, but we must decompose
3454 the copy bytewise, and take care of zero'ing out the
3455 upper bytes if the target object is in 32-bit units
3456 (that is, in UCS-4 builds). */
3457 while (_q < aligned_end) {
3458 unsigned long data = * (unsigned long *) _q;
3459 if (data & SWAPPED_FAST_CHAR_MASK)
3460 break;
3461 /* Zero upper bytes in UCS-4 builds */
3462#if (Py_UNICODE_SIZE > 2)
3463 _p[0] = 0;
3464 _p[1] = 0;
3465#if (SIZEOF_LONG == 8)
3466 _p[2] = 0;
3467 _p[3] = 0;
3468#endif
3469#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003470 /* Issue #4916; UCS-4 builds on big endian machines must
3471 fill the two last bytes of each 4-byte unit. */
3472#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3473# define OFF 2
3474#else
3475# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003476#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003477 ((unsigned char *) _p)[OFF + 1] = _q[0];
3478 ((unsigned char *) _p)[OFF + 0] = _q[1];
3479 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3480 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3481#if (SIZEOF_LONG == 8)
3482 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3483 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3484 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3485 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3486#endif
3487#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003488 _q += SIZEOF_LONG;
3489 _p += SIZEOF_LONG / 2;
3490 }
3491 }
3492 p = _p;
3493 q = _q;
3494 if (q >= e)
3495 break;
3496 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003497 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498
Benjamin Peterson14339b62009-01-31 16:36:08 +00003499 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003500
3501 if (ch < 0xD800 || ch > 0xDFFF) {
3502 *p++ = ch;
3503 continue;
3504 }
3505
3506 /* UTF-16 code pair: */
3507 if (q > e) {
3508 errmsg = "unexpected end of data";
3509 startinpos = (((const char *)q) - 2) - starts;
3510 endinpos = ((const char *)e) + 1 - starts;
3511 goto utf16Error;
3512 }
3513 if (0xD800 <= ch && ch <= 0xDBFF) {
3514 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3515 q += 2;
3516 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003517#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003518 *p++ = ch;
3519 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003520#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003522#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003523 continue;
3524 }
3525 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003526 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003527 startinpos = (((const char *)q)-4)-starts;
3528 endinpos = startinpos+2;
3529 goto utf16Error;
3530 }
3531
Benjamin Peterson14339b62009-01-31 16:36:08 +00003532 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 errmsg = "illegal encoding";
3534 startinpos = (((const char *)q)-2)-starts;
3535 endinpos = startinpos+2;
3536 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003537
Benjamin Peterson29060642009-01-31 22:14:21 +00003538 utf16Error:
3539 outpos = p - PyUnicode_AS_UNICODE(unicode);
3540 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003541 errors,
3542 &errorHandler,
3543 "utf16", errmsg,
3544 &starts,
3545 (const char **)&e,
3546 &startinpos,
3547 &endinpos,
3548 &exc,
3549 (const char **)&q,
3550 &unicode,
3551 &outpos,
3552 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003555 /* remaining byte at the end? (size should be even) */
3556 if (e == q) {
3557 if (!consumed) {
3558 errmsg = "truncated data";
3559 startinpos = ((const char *)q) - starts;
3560 endinpos = ((const char *)e) + 1 - starts;
3561 outpos = p - PyUnicode_AS_UNICODE(unicode);
3562 if (unicode_decode_call_errorhandler(
3563 errors,
3564 &errorHandler,
3565 "utf16", errmsg,
3566 &starts,
3567 (const char **)&e,
3568 &startinpos,
3569 &endinpos,
3570 &exc,
3571 (const char **)&q,
3572 &unicode,
3573 &outpos,
3574 &p))
3575 goto onError;
3576 /* The remaining input chars are ignored if the callback
3577 chooses to skip the input */
3578 }
3579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580
3581 if (byteorder)
3582 *byteorder = bo;
3583
Walter Dörwald69652032004-09-07 20:24:22 +00003584 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003585 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003586
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003588 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 goto onError;
3590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 Py_XDECREF(errorHandler);
3592 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 return (PyObject *)unicode;
3594
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 Py_XDECREF(errorHandler);
3598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 return NULL;
3600}
3601
Antoine Pitrouab868312009-01-10 15:40:25 +00003602#undef FAST_CHAR_MASK
3603#undef SWAPPED_FAST_CHAR_MASK
3604
Tim Peters772747b2001-08-09 22:21:55 +00003605PyObject *
3606PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003607 Py_ssize_t size,
3608 const char *errors,
3609 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003611 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003612 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003613 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003614#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003615 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003616#else
3617 const int pairs = 0;
3618#endif
Tim Peters772747b2001-08-09 22:21:55 +00003619 /* Offsets from p for storing byte pairs in the right order. */
3620#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3621 int ihi = 1, ilo = 0;
3622#else
3623 int ihi = 0, ilo = 1;
3624#endif
3625
Benjamin Peterson29060642009-01-31 22:14:21 +00003626#define STORECHAR(CH) \
3627 do { \
3628 p[ihi] = ((CH) >> 8) & 0xff; \
3629 p[ilo] = (CH) & 0xff; \
3630 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003631 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003633#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003634 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 if (s[i] >= 0x10000)
3636 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003637#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003638 /* 2 * (size + pairs + (byteorder == 0)) */
3639 if (size > PY_SSIZE_T_MAX ||
3640 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003642 nsize = size + pairs + (byteorder == 0);
3643 bytesize = nsize * 2;
3644 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003646 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 if (v == NULL)
3648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003650 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003652 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003653 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003654 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003655
3656 if (byteorder == -1) {
3657 /* force LE */
3658 ihi = 1;
3659 ilo = 0;
3660 }
3661 else if (byteorder == 1) {
3662 /* force BE */
3663 ihi = 0;
3664 ilo = 1;
3665 }
3666
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003667 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 Py_UNICODE ch = *s++;
3669 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003670#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 if (ch >= 0x10000) {
3672 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3673 ch = 0xD800 | ((ch-0x10000) >> 10);
3674 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003675#endif
Tim Peters772747b2001-08-09 22:21:55 +00003676 STORECHAR(ch);
3677 if (ch2)
3678 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003679 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003680
3681 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003682 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003683#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684}
3685
3686PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3687{
3688 if (!PyUnicode_Check(unicode)) {
3689 PyErr_BadArgument();
3690 return NULL;
3691 }
3692 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 PyUnicode_GET_SIZE(unicode),
3694 NULL,
3695 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696}
3697
3698/* --- Unicode Escape Codec ----------------------------------------------- */
3699
Fredrik Lundh06d12682001-01-24 07:59:11 +00003700static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003701
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 Py_ssize_t size,
3704 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003707 Py_ssize_t startinpos;
3708 Py_ssize_t endinpos;
3709 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003714 char* message;
3715 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 PyObject *errorHandler = NULL;
3717 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003718
Guido van Rossumd57fd912000-03-10 22:53:23 +00003719 /* Escaped strings will always be longer than the resulting
3720 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 length after conversion to the true value.
3722 (but if the error callback returns a long replacement string
3723 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 v = _PyUnicode_New(size);
3725 if (v == NULL)
3726 goto onError;
3727 if (size == 0)
3728 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003729
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003732
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 while (s < end) {
3734 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003735 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003736 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737
3738 /* Non-escape characters are interpreted as Unicode ordinals */
3739 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003740 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 continue;
3742 }
3743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003744 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 /* \ - Escapes */
3746 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003747 c = *s++;
3748 if (s > end)
3749 c = '\0'; /* Invalid after \ */
3750 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751
Benjamin Peterson29060642009-01-31 22:14:21 +00003752 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 case '\n': break;
3754 case '\\': *p++ = '\\'; break;
3755 case '\'': *p++ = '\''; break;
3756 case '\"': *p++ = '\"'; break;
3757 case 'b': *p++ = '\b'; break;
3758 case 'f': *p++ = '\014'; break; /* FF */
3759 case 't': *p++ = '\t'; break;
3760 case 'n': *p++ = '\n'; break;
3761 case 'r': *p++ = '\r'; break;
3762 case 'v': *p++ = '\013'; break; /* VT */
3763 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3764
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 case '0': case '1': case '2': case '3':
3767 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003768 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003769 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003770 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003771 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003772 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003774 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 break;
3776
Benjamin Peterson29060642009-01-31 22:14:21 +00003777 /* hex escapes */
3778 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003780 digits = 2;
3781 message = "truncated \\xXX escape";
3782 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003786 digits = 4;
3787 message = "truncated \\uXXXX escape";
3788 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789
Benjamin Peterson29060642009-01-31 22:14:21 +00003790 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003791 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003792 digits = 8;
3793 message = "truncated \\UXXXXXXXX escape";
3794 hexescape:
3795 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 outpos = p-PyUnicode_AS_UNICODE(v);
3797 if (s+digits>end) {
3798 endinpos = size;
3799 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003800 errors, &errorHandler,
3801 "unicodeescape", "end of string in escape sequence",
3802 &starts, &end, &startinpos, &endinpos, &exc, &s,
3803 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 goto onError;
3805 goto nextByte;
3806 }
3807 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003808 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003809 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 endinpos = (s+i+1)-starts;
3811 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003812 errors, &errorHandler,
3813 "unicodeescape", message,
3814 &starts, &end, &startinpos, &endinpos, &exc, &s,
3815 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003816 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003818 }
3819 chr = (chr<<4) & ~0xF;
3820 if (c >= '0' && c <= '9')
3821 chr += c - '0';
3822 else if (c >= 'a' && c <= 'f')
3823 chr += 10 + c - 'a';
3824 else
3825 chr += 10 + c - 'A';
3826 }
3827 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003828 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829 /* _decoding_error will have already written into the
3830 target buffer. */
3831 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003832 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003833 /* when we get here, chr is a 32-bit unicode character */
3834 if (chr <= 0xffff)
3835 /* UCS-2 character */
3836 *p++ = (Py_UNICODE) chr;
3837 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003838 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003839 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003840#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003841 *p++ = chr;
3842#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003843 chr -= 0x10000L;
3844 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003845 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003846#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003847 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848 endinpos = s-starts;
3849 outpos = p-PyUnicode_AS_UNICODE(v);
3850 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 errors, &errorHandler,
3852 "unicodeescape", "illegal Unicode character",
3853 &starts, &end, &startinpos, &endinpos, &exc, &s,
3854 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003855 goto onError;
3856 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003857 break;
3858
Benjamin Peterson29060642009-01-31 22:14:21 +00003859 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003860 case 'N':
3861 message = "malformed \\N character escape";
3862 if (ucnhash_CAPI == NULL) {
3863 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003864 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003865 if (ucnhash_CAPI == NULL)
3866 goto ucnhashError;
3867 }
3868 if (*s == '{') {
3869 const char *start = s+1;
3870 /* look for the closing brace */
3871 while (*s != '}' && s < end)
3872 s++;
3873 if (s > start && s < end && *s == '}') {
3874 /* found a name. look it up in the unicode database */
3875 message = "unknown Unicode character name";
3876 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003877 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003878 goto store;
3879 }
3880 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 endinpos = s-starts;
3882 outpos = p-PyUnicode_AS_UNICODE(v);
3883 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003884 errors, &errorHandler,
3885 "unicodeescape", message,
3886 &starts, &end, &startinpos, &endinpos, &exc, &s,
3887 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003888 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003889 break;
3890
3891 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003892 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003893 message = "\\ at end of string";
3894 s--;
3895 endinpos = s-starts;
3896 outpos = p-PyUnicode_AS_UNICODE(v);
3897 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003898 errors, &errorHandler,
3899 "unicodeescape", message,
3900 &starts, &end, &startinpos, &endinpos, &exc, &s,
3901 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003902 goto onError;
3903 }
3904 else {
3905 *p++ = '\\';
3906 *p++ = (unsigned char)s[-1];
3907 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003908 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003910 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003913 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003915 Py_XDECREF(errorHandler);
3916 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003918
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003920 PyErr_SetString(
3921 PyExc_UnicodeError,
3922 "\\N escapes not supported (can't load unicodedata module)"
3923 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003924 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 Py_XDECREF(errorHandler);
3926 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003927 return NULL;
3928
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 Py_XDECREF(errorHandler);
3932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 return NULL;
3934}
3935
/* Return a Unicode-Escape encoded bytes version of the Unicode data.

   No enclosing quotes are added: the encoder below takes no "quotes"
   argument.

*/
3942
Thomas Wouters477c8d52006-05-27 19:21:47 +00003943Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003944 Py_ssize_t size,
3945 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003946{
3947 /* like wcschr, but doesn't stop at NULL characters */
3948
3949 while (size-- > 0) {
3950 if (*s == ch)
3951 return s;
3952 s++;
3953 }
3954
3955 return NULL;
3956}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003957
Walter Dörwald79e913e2007-05-12 11:08:06 +00003958static const char *hexdigits = "0123456789abcdef";
3959
3960PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003961 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003963 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003966#ifdef Py_UNICODE_WIDE
3967 const Py_ssize_t expandsize = 10;
3968#else
3969 const Py_ssize_t expandsize = 6;
3970#endif
3971
Thomas Wouters89f507f2006-12-13 04:49:30 +00003972 /* XXX(nnorwitz): rather than over-allocating, it would be
3973 better to choose a different scheme. Perhaps scan the
3974 first N-chars of the string and allocate based on that size.
3975 */
3976 /* Initial allocation is based on the longest-possible unichr
3977 escape.
3978
3979 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3980 unichr, so in this case it's the longest unichr escape. In
3981 narrow (UTF-16) builds this is five chars per source unichr
3982 since there are two unichrs in the surrogate pair, so in narrow
3983 (UTF-16) builds it's not the longest unichr escape.
3984
3985 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3986 so in the narrow (UTF-16) build case it's the longest unichr
3987 escape.
3988 */
3989
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003990 if (size == 0)
3991 return PyBytes_FromStringAndSize(NULL, 0);
3992
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003993 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003995
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003996 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 2
3998 + expandsize*size
3999 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 if (repr == NULL)
4001 return NULL;
4002
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004003 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 while (size-- > 0) {
4006 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004007
Walter Dörwald79e913e2007-05-12 11:08:06 +00004008 /* Escape backslashes */
4009 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 *p++ = '\\';
4011 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004012 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004013 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004014
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004015#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004016 /* Map 21-bit characters to '\U00xxxxxx' */
4017 else if (ch >= 0x10000) {
4018 *p++ = '\\';
4019 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004020 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4021 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4022 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4023 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4024 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4025 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4026 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4027 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004029 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004030#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4032 else if (ch >= 0xD800 && ch < 0xDC00) {
4033 Py_UNICODE ch2;
4034 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004035
Benjamin Peterson29060642009-01-31 22:14:21 +00004036 ch2 = *s++;
4037 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004038 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4040 *p++ = '\\';
4041 *p++ = 'U';
4042 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4043 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4044 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4045 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4046 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4047 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4048 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4049 *p++ = hexdigits[ucs & 0x0000000F];
4050 continue;
4051 }
4052 /* Fall through: isolated surrogates are copied as-is */
4053 s--;
4054 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004055 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004056#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004057
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004059 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 *p++ = '\\';
4061 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004062 *p++ = hexdigits[(ch >> 12) & 0x000F];
4063 *p++ = hexdigits[(ch >> 8) & 0x000F];
4064 *p++ = hexdigits[(ch >> 4) & 0x000F];
4065 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004067
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004068 /* Map special whitespace to '\t', \n', '\r' */
4069 else if (ch == '\t') {
4070 *p++ = '\\';
4071 *p++ = 't';
4072 }
4073 else if (ch == '\n') {
4074 *p++ = '\\';
4075 *p++ = 'n';
4076 }
4077 else if (ch == '\r') {
4078 *p++ = '\\';
4079 *p++ = 'r';
4080 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004081
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004082 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004083 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004085 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004086 *p++ = hexdigits[(ch >> 4) & 0x000F];
4087 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004088 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004089
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 /* Copy everything else as-is */
4091 else
4092 *p++ = (char) ch;
4093 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004095 assert(p - PyBytes_AS_STRING(repr) > 0);
4096 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4097 return NULL;
4098 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099}
4100
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004101PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004103 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 if (!PyUnicode_Check(unicode)) {
4105 PyErr_BadArgument();
4106 return NULL;
4107 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004108 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4109 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004110 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111}
4112
4113/* --- Raw Unicode Escape Codec ------------------------------------------- */
4114
4115PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 Py_ssize_t size,
4117 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004120 Py_ssize_t startinpos;
4121 Py_ssize_t endinpos;
4122 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 const char *end;
4126 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 PyObject *errorHandler = NULL;
4128 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 /* Escaped strings will always be longer than the resulting
4131 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 length after conversion to the true value. (But decoding error
4133 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 v = _PyUnicode_New(size);
4135 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 end = s + size;
4141 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 unsigned char c;
4143 Py_UCS4 x;
4144 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004145 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 /* Non-escape characters are interpreted as Unicode ordinals */
4148 if (*s != '\\') {
4149 *p++ = (unsigned char)*s++;
4150 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004151 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 startinpos = s-starts;
4153
4154 /* \u-escapes are only interpreted iff the number of leading
4155 backslashes if odd */
4156 bs = s;
4157 for (;s < end;) {
4158 if (*s != '\\')
4159 break;
4160 *p++ = (unsigned char)*s++;
4161 }
4162 if (((s - bs) & 1) == 0 ||
4163 s >= end ||
4164 (*s != 'u' && *s != 'U')) {
4165 continue;
4166 }
4167 p--;
4168 count = *s=='u' ? 4 : 8;
4169 s++;
4170
4171 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4172 outpos = p-PyUnicode_AS_UNICODE(v);
4173 for (x = 0, i = 0; i < count; ++i, ++s) {
4174 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004175 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 endinpos = s-starts;
4177 if (unicode_decode_call_errorhandler(
4178 errors, &errorHandler,
4179 "rawunicodeescape", "truncated \\uXXXX",
4180 &starts, &end, &startinpos, &endinpos, &exc, &s,
4181 &v, &outpos, &p))
4182 goto onError;
4183 goto nextByte;
4184 }
4185 x = (x<<4) & ~0xF;
4186 if (c >= '0' && c <= '9')
4187 x += c - '0';
4188 else if (c >= 'a' && c <= 'f')
4189 x += 10 + c - 'a';
4190 else
4191 x += 10 + c - 'A';
4192 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004193 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 /* UCS-2 character */
4195 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004196 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 /* UCS-4 character. Either store directly, or as
4198 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004199#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004201#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 x -= 0x10000L;
4203 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4204 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004205#endif
4206 } else {
4207 endinpos = s-starts;
4208 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004209 if (unicode_decode_call_errorhandler(
4210 errors, &errorHandler,
4211 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 &starts, &end, &startinpos, &endinpos, &exc, &s,
4213 &v, &outpos, &p))
4214 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004215 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 nextByte:
4217 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004219 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 Py_XDECREF(errorHandler);
4222 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004224
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227 Py_XDECREF(errorHandler);
4228 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 return NULL;
4230}
4231
4232PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004235 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 char *p;
4237 char *q;
4238
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004239#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004240 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004241#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004242 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004243#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004244
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004245 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004247
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004248 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 if (repr == NULL)
4250 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004251 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004252 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004254 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 while (size-- > 0) {
4256 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004257#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 /* Map 32-bit characters to '\Uxxxxxxxx' */
4259 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004260 *p++ = '\\';
4261 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004262 *p++ = hexdigits[(ch >> 28) & 0xf];
4263 *p++ = hexdigits[(ch >> 24) & 0xf];
4264 *p++ = hexdigits[(ch >> 20) & 0xf];
4265 *p++ = hexdigits[(ch >> 16) & 0xf];
4266 *p++ = hexdigits[(ch >> 12) & 0xf];
4267 *p++ = hexdigits[(ch >> 8) & 0xf];
4268 *p++ = hexdigits[(ch >> 4) & 0xf];
4269 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004270 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004271 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004272#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004273 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4274 if (ch >= 0xD800 && ch < 0xDC00) {
4275 Py_UNICODE ch2;
4276 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004277
Benjamin Peterson29060642009-01-31 22:14:21 +00004278 ch2 = *s++;
4279 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004280 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4282 *p++ = '\\';
4283 *p++ = 'U';
4284 *p++ = hexdigits[(ucs >> 28) & 0xf];
4285 *p++ = hexdigits[(ucs >> 24) & 0xf];
4286 *p++ = hexdigits[(ucs >> 20) & 0xf];
4287 *p++ = hexdigits[(ucs >> 16) & 0xf];
4288 *p++ = hexdigits[(ucs >> 12) & 0xf];
4289 *p++ = hexdigits[(ucs >> 8) & 0xf];
4290 *p++ = hexdigits[(ucs >> 4) & 0xf];
4291 *p++ = hexdigits[ucs & 0xf];
4292 continue;
4293 }
4294 /* Fall through: isolated surrogates are copied as-is */
4295 s--;
4296 size++;
4297 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004298#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 /* Map 16-bit characters to '\uxxxx' */
4300 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 *p++ = '\\';
4302 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004303 *p++ = hexdigits[(ch >> 12) & 0xf];
4304 *p++ = hexdigits[(ch >> 8) & 0xf];
4305 *p++ = hexdigits[(ch >> 4) & 0xf];
4306 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 /* Copy everything else as-is */
4309 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310 *p++ = (char) ch;
4311 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004312 size = p - q;
4313
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004314 assert(size > 0);
4315 if (_PyBytes_Resize(&repr, size) < 0)
4316 return NULL;
4317 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318}
4319
4320PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4321{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004322 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004324 PyErr_BadArgument();
4325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004327 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4328 PyUnicode_GET_SIZE(unicode));
4329
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004330 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331}
4332
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004333/* --- Unicode Internal Codec ------------------------------------------- */
4334
4335PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 Py_ssize_t size,
4337 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004338{
4339 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004340 Py_ssize_t startinpos;
4341 Py_ssize_t endinpos;
4342 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004343 PyUnicodeObject *v;
4344 Py_UNICODE *p;
4345 const char *end;
4346 const char *reason;
4347 PyObject *errorHandler = NULL;
4348 PyObject *exc = NULL;
4349
Neal Norwitzd43069c2006-01-08 01:12:10 +00004350#ifdef Py_UNICODE_WIDE
4351 Py_UNICODE unimax = PyUnicode_GetMax();
4352#endif
4353
Thomas Wouters89f507f2006-12-13 04:49:30 +00004354 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004355 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4356 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004357 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004358 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004359 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004360 p = PyUnicode_AS_UNICODE(v);
4361 end = s + size;
4362
4363 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004364 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004365 /* We have to sanity check the raw data, otherwise doom looms for
4366 some malformed UCS-4 data. */
4367 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004368#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004369 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004370#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004371 end-s < Py_UNICODE_SIZE
4372 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004374 startinpos = s - starts;
4375 if (end-s < Py_UNICODE_SIZE) {
4376 endinpos = end-starts;
4377 reason = "truncated input";
4378 }
4379 else {
4380 endinpos = s - starts + Py_UNICODE_SIZE;
4381 reason = "illegal code point (> 0x10FFFF)";
4382 }
4383 outpos = p - PyUnicode_AS_UNICODE(v);
4384 if (unicode_decode_call_errorhandler(
4385 errors, &errorHandler,
4386 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004387 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004388 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004389 goto onError;
4390 }
4391 }
4392 else {
4393 p++;
4394 s += Py_UNICODE_SIZE;
4395 }
4396 }
4397
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004398 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004399 goto onError;
4400 Py_XDECREF(errorHandler);
4401 Py_XDECREF(exc);
4402 return (PyObject *)v;
4403
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004405 Py_XDECREF(v);
4406 Py_XDECREF(errorHandler);
4407 Py_XDECREF(exc);
4408 return NULL;
4409}
4410
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411/* --- Latin-1 Codec ------------------------------------------------------ */
4412
4413PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 Py_ssize_t size,
4415 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416{
4417 PyUnicodeObject *v;
4418 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004419 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004420
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004422 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 Py_UNICODE r = *(unsigned char*)s;
4424 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004425 }
4426
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 v = _PyUnicode_New(size);
4428 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004433 e = s + size;
4434 /* Unrolling the copy makes it much faster by reducing the looping
4435 overhead. This is similar to what many memcpy() implementations do. */
4436 unrolled_end = e - 4;
4437 while (s < unrolled_end) {
4438 p[0] = (unsigned char) s[0];
4439 p[1] = (unsigned char) s[1];
4440 p[2] = (unsigned char) s[2];
4441 p[3] = (unsigned char) s[3];
4442 s += 4;
4443 p += 4;
4444 }
4445 while (s < e)
4446 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004448
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 Py_XDECREF(v);
4451 return NULL;
4452}
4453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454/* create or adjust a UnicodeEncodeError */
4455static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 const char *encoding,
4457 const Py_UNICODE *unicode, Py_ssize_t size,
4458 Py_ssize_t startpos, Py_ssize_t endpos,
4459 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 *exceptionObject = PyUnicodeEncodeError_Create(
4463 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464 }
4465 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4467 goto onError;
4468 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4469 goto onError;
4470 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4471 goto onError;
4472 return;
4473 onError:
4474 Py_DECREF(*exceptionObject);
4475 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
4477}
4478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479/* raises a UnicodeEncodeError */
4480static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 const char *encoding,
4482 const Py_UNICODE *unicode, Py_ssize_t size,
4483 Py_ssize_t startpos, Py_ssize_t endpos,
4484 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485{
4486 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490}
4491
4492/* error handling callback helper:
4493 build arguments, call the callback and check the arguments,
4494 put the result into newpos and return the replacement string, which
4495 has to be freed by the caller */
4496static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 PyObject **errorHandler,
4498 const char *encoding, const char *reason,
4499 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4500 Py_ssize_t startpos, Py_ssize_t endpos,
4501 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004502{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004503 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504
4505 PyObject *restuple;
4506 PyObject *resunicode;
4507
4508 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512 }
4513
4514 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518
4519 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004524 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 Py_DECREF(restuple);
4526 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004528 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 &resunicode, newpos)) {
4530 Py_DECREF(restuple);
4531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004533 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4534 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4535 Py_DECREF(restuple);
4536 return NULL;
4537 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004540 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4542 Py_DECREF(restuple);
4543 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 Py_INCREF(resunicode);
4546 Py_DECREF(restuple);
4547 return resunicode;
4548}
4549
4550static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 Py_ssize_t size,
4552 const char *errors,
4553 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554{
4555 /* output object */
4556 PyObject *res;
4557 /* pointers to the beginning and end+1 of input */
4558 const Py_UNICODE *startp = p;
4559 const Py_UNICODE *endp = p + size;
4560 /* pointer to the beginning of the unencodable characters */
4561 /* const Py_UNICODE *badp = NULL; */
4562 /* pointer into the output */
4563 char *str;
4564 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004565 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004566 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4567 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 PyObject *errorHandler = NULL;
4569 PyObject *exc = NULL;
4570 /* the following variable is used for caching string comparisons
4571 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4572 int known_errorHandler = -1;
4573
4574 /* allocate enough for a simple encoding without
4575 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004576 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004577 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004578 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004580 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004581 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 ressize = size;
4583
4584 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 /* can we encode this? */
4588 if (c<limit) {
4589 /* no overflow check, because we know that the space is enough */
4590 *str++ = (char)c;
4591 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004592 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 else {
4594 Py_ssize_t unicodepos = p-startp;
4595 Py_ssize_t requiredsize;
4596 PyObject *repunicode;
4597 Py_ssize_t repsize;
4598 Py_ssize_t newpos;
4599 Py_ssize_t respos;
4600 Py_UNICODE *uni2;
4601 /* startpos for collecting unencodable chars */
4602 const Py_UNICODE *collstart = p;
4603 const Py_UNICODE *collend = p;
4604 /* find all unecodable characters */
4605 while ((collend < endp) && ((*collend)>=limit))
4606 ++collend;
4607 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4608 if (known_errorHandler==-1) {
4609 if ((errors==NULL) || (!strcmp(errors, "strict")))
4610 known_errorHandler = 1;
4611 else if (!strcmp(errors, "replace"))
4612 known_errorHandler = 2;
4613 else if (!strcmp(errors, "ignore"))
4614 known_errorHandler = 3;
4615 else if (!strcmp(errors, "xmlcharrefreplace"))
4616 known_errorHandler = 4;
4617 else
4618 known_errorHandler = 0;
4619 }
4620 switch (known_errorHandler) {
4621 case 1: /* strict */
4622 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4623 goto onError;
4624 case 2: /* replace */
4625 while (collstart++<collend)
4626 *str++ = '?'; /* fall through */
4627 case 3: /* ignore */
4628 p = collend;
4629 break;
4630 case 4: /* xmlcharrefreplace */
4631 respos = str - PyBytes_AS_STRING(res);
4632 /* determine replacement size (temporarily (mis)uses p) */
4633 for (p = collstart, repsize = 0; p < collend; ++p) {
4634 if (*p<10)
4635 repsize += 2+1+1;
4636 else if (*p<100)
4637 repsize += 2+2+1;
4638 else if (*p<1000)
4639 repsize += 2+3+1;
4640 else if (*p<10000)
4641 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004642#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 else
4644 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004645#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 else if (*p<100000)
4647 repsize += 2+5+1;
4648 else if (*p<1000000)
4649 repsize += 2+6+1;
4650 else
4651 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004652#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004653 }
4654 requiredsize = respos+repsize+(endp-collend);
4655 if (requiredsize > ressize) {
4656 if (requiredsize<2*ressize)
4657 requiredsize = 2*ressize;
4658 if (_PyBytes_Resize(&res, requiredsize))
4659 goto onError;
4660 str = PyBytes_AS_STRING(res) + respos;
4661 ressize = requiredsize;
4662 }
4663 /* generate replacement (temporarily (mis)uses p) */
4664 for (p = collstart; p < collend; ++p) {
4665 str += sprintf(str, "&#%d;", (int)*p);
4666 }
4667 p = collend;
4668 break;
4669 default:
4670 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4671 encoding, reason, startp, size, &exc,
4672 collstart-startp, collend-startp, &newpos);
4673 if (repunicode == NULL)
4674 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004675 if (PyBytes_Check(repunicode)) {
4676 /* Directly copy bytes result to output. */
4677 repsize = PyBytes_Size(repunicode);
4678 if (repsize > 1) {
4679 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004680 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004681 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4682 Py_DECREF(repunicode);
4683 goto onError;
4684 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004685 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004686 ressize += repsize-1;
4687 }
4688 memcpy(str, PyBytes_AsString(repunicode), repsize);
4689 str += repsize;
4690 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004691 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004692 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004693 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 /* need more space? (at least enough for what we
4695 have+the replacement+the rest of the string, so
4696 we won't have to check space for encodable characters) */
4697 respos = str - PyBytes_AS_STRING(res);
4698 repsize = PyUnicode_GET_SIZE(repunicode);
4699 requiredsize = respos+repsize+(endp-collend);
4700 if (requiredsize > ressize) {
4701 if (requiredsize<2*ressize)
4702 requiredsize = 2*ressize;
4703 if (_PyBytes_Resize(&res, requiredsize)) {
4704 Py_DECREF(repunicode);
4705 goto onError;
4706 }
4707 str = PyBytes_AS_STRING(res) + respos;
4708 ressize = requiredsize;
4709 }
4710 /* check if there is anything unencodable in the replacement
4711 and copy it to the output */
4712 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4713 c = *uni2;
4714 if (c >= limit) {
4715 raise_encode_exception(&exc, encoding, startp, size,
4716 unicodepos, unicodepos+1, reason);
4717 Py_DECREF(repunicode);
4718 goto onError;
4719 }
4720 *str = (char)c;
4721 }
4722 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004723 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004725 }
4726 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004727 /* Resize if we allocated to much */
4728 size = str - PyBytes_AS_STRING(res);
4729 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004730 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004731 if (_PyBytes_Resize(&res, size) < 0)
4732 goto onError;
4733 }
4734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004735 Py_XDECREF(errorHandler);
4736 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004737 return res;
4738
4739 onError:
4740 Py_XDECREF(res);
4741 Py_XDECREF(errorHandler);
4742 Py_XDECREF(exc);
4743 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744}
4745
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004747 Py_ssize_t size,
4748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751}
4752
4753PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4754{
4755 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 PyErr_BadArgument();
4757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 }
4759 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004760 PyUnicode_GET_SIZE(unicode),
4761 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762}
4763
4764/* --- 7-bit ASCII Codec -------------------------------------------------- */
4765
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 Py_ssize_t size,
4768 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004770 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771 PyUnicodeObject *v;
4772 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004773 Py_ssize_t startinpos;
4774 Py_ssize_t endinpos;
4775 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 const char *e;
4777 PyObject *errorHandler = NULL;
4778 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004779
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004781 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 Py_UNICODE r = *(unsigned char*)s;
4783 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004784 }
Tim Petersced69f82003-09-16 20:30:58 +00004785
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 v = _PyUnicode_New(size);
4787 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 e = s + size;
4793 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 register unsigned char c = (unsigned char)*s;
4795 if (c < 128) {
4796 *p++ = c;
4797 ++s;
4798 }
4799 else {
4800 startinpos = s-starts;
4801 endinpos = startinpos + 1;
4802 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4803 if (unicode_decode_call_errorhandler(
4804 errors, &errorHandler,
4805 "ascii", "ordinal not in range(128)",
4806 &starts, &e, &startinpos, &endinpos, &exc, &s,
4807 &v, &outpos, &p))
4808 goto onError;
4809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004811 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004812 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4813 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 Py_XDECREF(errorHandler);
4815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004817
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 Py_XDECREF(errorHandler);
4821 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 return NULL;
4823}
4824
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004826 Py_ssize_t size,
4827 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830}
4831
4832PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4833{
4834 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 PyErr_BadArgument();
4836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 }
4838 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 PyUnicode_GET_SIZE(unicode),
4840 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841}
4842
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004843#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004844
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004845/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004846
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004847#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004848#define NEED_RETRY
4849#endif
4850
4851/* XXX This code is limited to "true" double-byte encodings, as
4852 a) it assumes an incomplete character consists of a single byte, and
4853 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004855
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.

   The byte itself must satisfy IsDBCSLeadByte().  It is then accepted if
   CharPrev() could not step back from it, if the preceding character does
   not start with a lead byte, or if the preceding character is two bytes
   long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4866
4867/*
4868 * Decode MBCS string into unicode object. If 'final' is set, converts
4869 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4870 */
4871static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 const char *s, /* MBCS string */
4873 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004874 int final,
4875 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004876{
4877 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004878 Py_ssize_t n;
4879 DWORD usize;
4880 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004881
4882 assert(size >= 0);
4883
Victor Stinner554f3f02010-06-16 23:33:54 +00004884 /* check and handle 'errors' arg */
4885 if (errors==NULL || strcmp(errors, "strict")==0)
4886 flags = MB_ERR_INVALID_CHARS;
4887 else if (strcmp(errors, "ignore")==0)
4888 flags = 0;
4889 else {
4890 PyErr_Format(PyExc_ValueError,
4891 "mbcs encoding does not support errors='%s'",
4892 errors);
4893 return -1;
4894 }
4895
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004896 /* Skip trailing lead-byte unless 'final' is set */
4897 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004899
4900 /* First get the size of the result */
4901 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004902 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4903 if (usize==0)
4904 goto mbcs_decode_error;
4905 } else
4906 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004907
4908 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004909 /* Create unicode object */
4910 *v = _PyUnicode_New(usize);
4911 if (*v == NULL)
4912 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004913 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004914 }
4915 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 /* Extend unicode object */
4917 n = PyUnicode_GET_SIZE(*v);
4918 if (_PyUnicode_Resize(v, n + usize) < 0)
4919 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004920 }
4921
4922 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004923 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004925 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4926 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004928 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004929 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004930
4931mbcs_decode_error:
4932 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4933 we raise a UnicodeDecodeError - else it is a 'generic'
4934 windows error
4935 */
4936 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4937 /* Ideally, we should get reason from FormatMessage - this
4938 is the Windows 2000 English version of the message
4939 */
4940 PyObject *exc = NULL;
4941 const char *reason = "No mapping for the Unicode character exists "
4942 "in the target multi-byte code page.";
4943 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4944 if (exc != NULL) {
4945 PyCodec_StrictErrors(exc);
4946 Py_DECREF(exc);
4947 }
4948 } else {
4949 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4950 }
4951 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004952}
4953
4954PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 Py_ssize_t size,
4956 const char *errors,
4957 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004958{
4959 PyUnicodeObject *v = NULL;
4960 int done;
4961
4962 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004964
4965#ifdef NEED_RETRY
4966 retry:
4967 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004968 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004969 else
4970#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004971 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004972
4973 if (done < 0) {
4974 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004976 }
4977
4978 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004980
4981#ifdef NEED_RETRY
4982 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 s += done;
4984 size -= done;
4985 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004986 }
4987#endif
4988
4989 return (PyObject *)v;
4990}
4991
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004992PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 Py_ssize_t size,
4994 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004995{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004996 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4997}
4998
4999/*
5000 * Convert unicode into string object (MBCS).
5001 * Returns 0 if succeed, -1 otherwise.
5002 */
5003static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005005 int size, /* size of unicode */
5006 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005007{
Victor Stinner554f3f02010-06-16 23:33:54 +00005008 BOOL usedDefaultChar = FALSE;
5009 BOOL *pusedDefaultChar;
5010 int mbcssize;
5011 Py_ssize_t n;
5012 PyObject *exc = NULL;
5013 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005014
5015 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005016
Victor Stinner554f3f02010-06-16 23:33:54 +00005017 /* check and handle 'errors' arg */
5018 if (errors==NULL || strcmp(errors, "strict")==0) {
5019 flags = WC_NO_BEST_FIT_CHARS;
5020 pusedDefaultChar = &usedDefaultChar;
5021 } else if (strcmp(errors, "replace")==0) {
5022 flags = 0;
5023 pusedDefaultChar = NULL;
5024 } else {
5025 PyErr_Format(PyExc_ValueError,
5026 "mbcs encoding does not support errors='%s'",
5027 errors);
5028 return -1;
5029 }
5030
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005031 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005032 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005033 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5034 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 if (mbcssize == 0) {
5036 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5037 return -1;
5038 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005039 /* If we used a default char, then we failed! */
5040 if (pusedDefaultChar && *pusedDefaultChar)
5041 goto mbcs_encode_error;
5042 } else {
5043 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005044 }
5045
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005046 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 /* Create string object */
5048 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5049 if (*repr == NULL)
5050 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005051 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005052 }
5053 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 /* Extend string object */
5055 n = PyBytes_Size(*repr);
5056 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5057 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005058 }
5059
5060 /* Do the conversion */
5061 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005063 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5064 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5066 return -1;
5067 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005068 if (pusedDefaultChar && *pusedDefaultChar)
5069 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005070 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005071 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005072
5073mbcs_encode_error:
5074 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5075 Py_XDECREF(exc);
5076 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005077}
5078
5079PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 Py_ssize_t size,
5081 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005082{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005083 PyObject *repr = NULL;
5084 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005085
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005086#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005088 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005089 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005090 else
5091#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005092 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005093
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005094 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 Py_XDECREF(repr);
5096 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005097 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005098
5099#ifdef NEED_RETRY
5100 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 p += INT_MAX;
5102 size -= INT_MAX;
5103 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005104 }
5105#endif
5106
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005107 return repr;
5108}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005109
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005110PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5111{
5112 if (!PyUnicode_Check(unicode)) {
5113 PyErr_BadArgument();
5114 return NULL;
5115 }
5116 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 PyUnicode_GET_SIZE(unicode),
5118 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005119}
5120
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005121#undef NEED_RETRY
5122
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005123#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005124
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125/* --- Character Mapping Codec -------------------------------------------- */
5126
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 Py_ssize_t size,
5129 PyObject *mapping,
5130 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005133 Py_ssize_t startinpos;
5134 Py_ssize_t endinpos;
5135 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005136 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 PyUnicodeObject *v;
5138 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005139 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 PyObject *errorHandler = NULL;
5141 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005142 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 /* Default to Latin-1 */
5146 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148
5149 v = _PyUnicode_New(size);
5150 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005155 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005156 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 mapstring = PyUnicode_AS_UNICODE(mapping);
5158 maplen = PyUnicode_GET_SIZE(mapping);
5159 while (s < e) {
5160 unsigned char ch = *s;
5161 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 if (ch < maplen)
5164 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 if (x == 0xfffe) {
5167 /* undefined mapping */
5168 outpos = p-PyUnicode_AS_UNICODE(v);
5169 startinpos = s-starts;
5170 endinpos = startinpos+1;
5171 if (unicode_decode_call_errorhandler(
5172 errors, &errorHandler,
5173 "charmap", "character maps to <undefined>",
5174 &starts, &e, &startinpos, &endinpos, &exc, &s,
5175 &v, &outpos, &p)) {
5176 goto onError;
5177 }
5178 continue;
5179 }
5180 *p++ = x;
5181 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005182 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005183 }
5184 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 while (s < e) {
5186 unsigned char ch = *s;
5187 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005188
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5190 w = PyLong_FromLong((long)ch);
5191 if (w == NULL)
5192 goto onError;
5193 x = PyObject_GetItem(mapping, w);
5194 Py_DECREF(w);
5195 if (x == NULL) {
5196 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5197 /* No mapping found means: mapping is undefined. */
5198 PyErr_Clear();
5199 x = Py_None;
5200 Py_INCREF(x);
5201 } else
5202 goto onError;
5203 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005204
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 /* Apply mapping */
5206 if (PyLong_Check(x)) {
5207 long value = PyLong_AS_LONG(x);
5208 if (value < 0 || value > 65535) {
5209 PyErr_SetString(PyExc_TypeError,
5210 "character mapping must be in range(65536)");
5211 Py_DECREF(x);
5212 goto onError;
5213 }
5214 *p++ = (Py_UNICODE)value;
5215 }
5216 else if (x == Py_None) {
5217 /* undefined mapping */
5218 outpos = p-PyUnicode_AS_UNICODE(v);
5219 startinpos = s-starts;
5220 endinpos = startinpos+1;
5221 if (unicode_decode_call_errorhandler(
5222 errors, &errorHandler,
5223 "charmap", "character maps to <undefined>",
5224 &starts, &e, &startinpos, &endinpos, &exc, &s,
5225 &v, &outpos, &p)) {
5226 Py_DECREF(x);
5227 goto onError;
5228 }
5229 Py_DECREF(x);
5230 continue;
5231 }
5232 else if (PyUnicode_Check(x)) {
5233 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005234
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 if (targetsize == 1)
5236 /* 1-1 mapping */
5237 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 else if (targetsize > 1) {
5240 /* 1-n mapping */
5241 if (targetsize > extrachars) {
5242 /* resize first */
5243 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5244 Py_ssize_t needed = (targetsize - extrachars) + \
5245 (targetsize << 2);
5246 extrachars += needed;
5247 /* XXX overflow detection missing */
5248 if (_PyUnicode_Resize(&v,
5249 PyUnicode_GET_SIZE(v) + needed) < 0) {
5250 Py_DECREF(x);
5251 goto onError;
5252 }
5253 p = PyUnicode_AS_UNICODE(v) + oldpos;
5254 }
5255 Py_UNICODE_COPY(p,
5256 PyUnicode_AS_UNICODE(x),
5257 targetsize);
5258 p += targetsize;
5259 extrachars -= targetsize;
5260 }
5261 /* 1-0 mapping: skip the character */
5262 }
5263 else {
5264 /* wrong return value */
5265 PyErr_SetString(PyExc_TypeError,
5266 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005267 Py_DECREF(x);
5268 goto onError;
5269 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 Py_DECREF(x);
5271 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 }
5274 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005277 Py_XDECREF(errorHandler);
5278 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005280
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005282 Py_XDECREF(errorHandler);
5283 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 Py_XDECREF(v);
5285 return NULL;
5286}
5287
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005288/* Charmap encoding: the lookup table */
5289
5290struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 PyObject_HEAD
5292 unsigned char level1[32];
5293 int count2, count3;
5294 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005295};
5296
5297static PyObject*
5298encoding_map_size(PyObject *obj, PyObject* args)
5299{
5300 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005301 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005303}
5304
5305static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005306 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 PyDoc_STR("Return the size (in bytes) of this object") },
5308 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005309};
5310
5311static void
5312encoding_map_dealloc(PyObject* o)
5313{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005314 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005315}
5316
5317static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005318 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 "EncodingMap", /*tp_name*/
5320 sizeof(struct encoding_map), /*tp_basicsize*/
5321 0, /*tp_itemsize*/
5322 /* methods */
5323 encoding_map_dealloc, /*tp_dealloc*/
5324 0, /*tp_print*/
5325 0, /*tp_getattr*/
5326 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005327 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 0, /*tp_repr*/
5329 0, /*tp_as_number*/
5330 0, /*tp_as_sequence*/
5331 0, /*tp_as_mapping*/
5332 0, /*tp_hash*/
5333 0, /*tp_call*/
5334 0, /*tp_str*/
5335 0, /*tp_getattro*/
5336 0, /*tp_setattro*/
5337 0, /*tp_as_buffer*/
5338 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5339 0, /*tp_doc*/
5340 0, /*tp_traverse*/
5341 0, /*tp_clear*/
5342 0, /*tp_richcompare*/
5343 0, /*tp_weaklistoffset*/
5344 0, /*tp_iter*/
5345 0, /*tp_iternext*/
5346 encoding_map_methods, /*tp_methods*/
5347 0, /*tp_members*/
5348 0, /*tp_getset*/
5349 0, /*tp_base*/
5350 0, /*tp_dict*/
5351 0, /*tp_descr_get*/
5352 0, /*tp_descr_set*/
5353 0, /*tp_dictoffset*/
5354 0, /*tp_init*/
5355 0, /*tp_alloc*/
5356 0, /*tp_new*/
5357 0, /*tp_free*/
5358 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005359};
5360
5361PyObject*
5362PyUnicode_BuildEncodingMap(PyObject* string)
5363{
5364 Py_UNICODE *decode;
5365 PyObject *result;
5366 struct encoding_map *mresult;
5367 int i;
5368 int need_dict = 0;
5369 unsigned char level1[32];
5370 unsigned char level2[512];
5371 unsigned char *mlevel1, *mlevel2, *mlevel3;
5372 int count2 = 0, count3 = 0;
5373
5374 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5375 PyErr_BadArgument();
5376 return NULL;
5377 }
5378 decode = PyUnicode_AS_UNICODE(string);
5379 memset(level1, 0xFF, sizeof level1);
5380 memset(level2, 0xFF, sizeof level2);
5381
5382 /* If there isn't a one-to-one mapping of NULL to \0,
5383 or if there are non-BMP characters, we need to use
5384 a mapping dictionary. */
5385 if (decode[0] != 0)
5386 need_dict = 1;
5387 for (i = 1; i < 256; i++) {
5388 int l1, l2;
5389 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005390#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005391 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005392#endif
5393 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005394 need_dict = 1;
5395 break;
5396 }
5397 if (decode[i] == 0xFFFE)
5398 /* unmapped character */
5399 continue;
5400 l1 = decode[i] >> 11;
5401 l2 = decode[i] >> 7;
5402 if (level1[l1] == 0xFF)
5403 level1[l1] = count2++;
5404 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005405 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005406 }
5407
5408 if (count2 >= 0xFF || count3 >= 0xFF)
5409 need_dict = 1;
5410
5411 if (need_dict) {
5412 PyObject *result = PyDict_New();
5413 PyObject *key, *value;
5414 if (!result)
5415 return NULL;
5416 for (i = 0; i < 256; i++) {
5417 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005418 key = PyLong_FromLong(decode[i]);
5419 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005420 if (!key || !value)
5421 goto failed1;
5422 if (PyDict_SetItem(result, key, value) == -1)
5423 goto failed1;
5424 Py_DECREF(key);
5425 Py_DECREF(value);
5426 }
5427 return result;
5428 failed1:
5429 Py_XDECREF(key);
5430 Py_XDECREF(value);
5431 Py_DECREF(result);
5432 return NULL;
5433 }
5434
5435 /* Create a three-level trie */
5436 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5437 16*count2 + 128*count3 - 1);
5438 if (!result)
5439 return PyErr_NoMemory();
5440 PyObject_Init(result, &EncodingMapType);
5441 mresult = (struct encoding_map*)result;
5442 mresult->count2 = count2;
5443 mresult->count3 = count3;
5444 mlevel1 = mresult->level1;
5445 mlevel2 = mresult->level23;
5446 mlevel3 = mresult->level23 + 16*count2;
5447 memcpy(mlevel1, level1, 32);
5448 memset(mlevel2, 0xFF, 16*count2);
5449 memset(mlevel3, 0, 128*count3);
5450 count3 = 0;
5451 for (i = 1; i < 256; i++) {
5452 int o1, o2, o3, i2, i3;
5453 if (decode[i] == 0xFFFE)
5454 /* unmapped character */
5455 continue;
5456 o1 = decode[i]>>11;
5457 o2 = (decode[i]>>7) & 0xF;
5458 i2 = 16*mlevel1[o1] + o2;
5459 if (mlevel2[i2] == 0xFF)
5460 mlevel2[i2] = count3++;
5461 o3 = decode[i] & 0x7F;
5462 i3 = 128*mlevel2[i2] + o3;
5463 mlevel3[i3] = i;
5464 }
5465 return result;
5466}
5467
5468static int
5469encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5470{
5471 struct encoding_map *map = (struct encoding_map*)mapping;
5472 int l1 = c>>11;
5473 int l2 = (c>>7) & 0xF;
5474 int l3 = c & 0x7F;
5475 int i;
5476
5477#ifdef Py_UNICODE_WIDE
5478 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005480 }
5481#endif
5482 if (c == 0)
5483 return 0;
5484 /* level 1*/
5485 i = map->level1[l1];
5486 if (i == 0xFF) {
5487 return -1;
5488 }
5489 /* level 2*/
5490 i = map->level23[16*i+l2];
5491 if (i == 0xFF) {
5492 return -1;
5493 }
5494 /* level 3 */
5495 i = map->level23[16*map->count2 + 128*i + l3];
5496 if (i == 0) {
5497 return -1;
5498 }
5499 return i;
5500}
5501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502/* Lookup the character ch in the mapping. If the character
5503 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005504 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506{
Christian Heimes217cfd12007-12-02 14:31:20 +00005507 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 PyObject *x;
5509
5510 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 x = PyObject_GetItem(mapping, w);
5513 Py_DECREF(w);
5514 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5516 /* No mapping found means: mapping is undefined. */
5517 PyErr_Clear();
5518 x = Py_None;
5519 Py_INCREF(x);
5520 return x;
5521 } else
5522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005524 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005526 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 long value = PyLong_AS_LONG(x);
5528 if (value < 0 || value > 255) {
5529 PyErr_SetString(PyExc_TypeError,
5530 "character mapping must be in range(256)");
5531 Py_DECREF(x);
5532 return NULL;
5533 }
5534 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005536 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 /* wrong return value */
5540 PyErr_Format(PyExc_TypeError,
5541 "character mapping must return integer, bytes or None, not %.400s",
5542 x->ob_type->tp_name);
5543 Py_DECREF(x);
5544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 }
5546}
5547
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005548static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005549charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005550{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005551 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5552 /* exponentially overallocate to minimize reallocations */
5553 if (requiredsize < 2*outsize)
5554 requiredsize = 2*outsize;
5555 if (_PyBytes_Resize(outobj, requiredsize))
5556 return -1;
5557 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005558}
5559
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character's encoding was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred (lookup or output resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005564 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565 space is available. Return a new reference to the object that
5566 was put in the output buffer, or Py_None, if the mapping was undefined
5567 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005568 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005570charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005573 PyObject *rep;
5574 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005575 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576
Christian Heimes90aa7642007-12-19 02:45:37 +00005577 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005578 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005580 if (res == -1)
5581 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 if (outsize<requiredsize)
5583 if (charmapencode_resize(outobj, outpos, requiredsize))
5584 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005585 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 outstart[(*outpos)++] = (char)res;
5587 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005588 }
5589
5590 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005593 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 Py_DECREF(rep);
5595 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005596 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 if (PyLong_Check(rep)) {
5598 Py_ssize_t requiredsize = *outpos+1;
5599 if (outsize<requiredsize)
5600 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5601 Py_DECREF(rep);
5602 return enc_EXCEPTION;
5603 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005604 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 else {
5608 const char *repchars = PyBytes_AS_STRING(rep);
5609 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5610 Py_ssize_t requiredsize = *outpos+repsize;
5611 if (outsize<requiredsize)
5612 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5613 Py_DECREF(rep);
5614 return enc_EXCEPTION;
5615 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005616 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 memcpy(outstart + *outpos, repchars, repsize);
5618 *outpos += repsize;
5619 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005620 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005621 Py_DECREF(rep);
5622 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623}
5624
5625/* handle an error in PyUnicode_EncodeCharmap
5626 Return 0 on success, -1 on error */
5627static
5628int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005629 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005631 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005632 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633{
5634 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 Py_ssize_t repsize;
5636 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 Py_UNICODE *uni2;
5638 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005639 Py_ssize_t collstartpos = *inpos;
5640 Py_ssize_t collendpos = *inpos+1;
5641 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642 char *encoding = "charmap";
5643 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005644 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 /* find all unencodable characters */
5647 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005648 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005649 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005650 int res = encoding_map_lookup(p[collendpos], mapping);
5651 if (res != -1)
5652 break;
5653 ++collendpos;
5654 continue;
5655 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005656
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 rep = charmapencode_lookup(p[collendpos], mapping);
5658 if (rep==NULL)
5659 return -1;
5660 else if (rep!=Py_None) {
5661 Py_DECREF(rep);
5662 break;
5663 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005664 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005666 }
5667 /* cache callback name lookup
5668 * (if not done yet, i.e. it's the first error) */
5669 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 if ((errors==NULL) || (!strcmp(errors, "strict")))
5671 *known_errorHandler = 1;
5672 else if (!strcmp(errors, "replace"))
5673 *known_errorHandler = 2;
5674 else if (!strcmp(errors, "ignore"))
5675 *known_errorHandler = 3;
5676 else if (!strcmp(errors, "xmlcharrefreplace"))
5677 *known_errorHandler = 4;
5678 else
5679 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 }
5681 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005682 case 1: /* strict */
5683 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5684 return -1;
5685 case 2: /* replace */
5686 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 x = charmapencode_output('?', mapping, res, respos);
5688 if (x==enc_EXCEPTION) {
5689 return -1;
5690 }
5691 else if (x==enc_FAILED) {
5692 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5693 return -1;
5694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005695 }
5696 /* fall through */
5697 case 3: /* ignore */
5698 *inpos = collendpos;
5699 break;
5700 case 4: /* xmlcharrefreplace */
5701 /* generate replacement (temporarily (mis)uses p) */
5702 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 char buffer[2+29+1+1];
5704 char *cp;
5705 sprintf(buffer, "&#%d;", (int)p[collpos]);
5706 for (cp = buffer; *cp; ++cp) {
5707 x = charmapencode_output(*cp, mapping, res, respos);
5708 if (x==enc_EXCEPTION)
5709 return -1;
5710 else if (x==enc_FAILED) {
5711 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5712 return -1;
5713 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005714 }
5715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005716 *inpos = collendpos;
5717 break;
5718 default:
5719 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 encoding, reason, p, size, exceptionObject,
5721 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005722 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005724 if (PyBytes_Check(repunicode)) {
5725 /* Directly copy bytes result to output. */
5726 Py_ssize_t outsize = PyBytes_Size(*res);
5727 Py_ssize_t requiredsize;
5728 repsize = PyBytes_Size(repunicode);
5729 requiredsize = *respos + repsize;
5730 if (requiredsize > outsize)
5731 /* Make room for all additional bytes. */
5732 if (charmapencode_resize(res, respos, requiredsize)) {
5733 Py_DECREF(repunicode);
5734 return -1;
5735 }
5736 memcpy(PyBytes_AsString(*res) + *respos,
5737 PyBytes_AsString(repunicode), repsize);
5738 *respos += repsize;
5739 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005740 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005741 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005743 /* generate replacement */
5744 repsize = PyUnicode_GET_SIZE(repunicode);
5745 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 x = charmapencode_output(*uni2, mapping, res, respos);
5747 if (x==enc_EXCEPTION) {
5748 return -1;
5749 }
5750 else if (x==enc_FAILED) {
5751 Py_DECREF(repunicode);
5752 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5753 return -1;
5754 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005755 }
5756 *inpos = newpos;
5757 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 }
5759 return 0;
5760}
5761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 Py_ssize_t size,
5764 PyObject *mapping,
5765 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 /* output object */
5768 PyObject *res = NULL;
5769 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005770 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005772 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 PyObject *errorHandler = NULL;
5774 PyObject *exc = NULL;
5775 /* the following variable is used for caching string comparisons
5776 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5777 * 3=ignore, 4=xmlcharrefreplace */
5778 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779
5780 /* Default to Latin-1 */
5781 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 /* allocate enough for a simple encoding without
5785 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005786 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787 if (res == NULL)
5788 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005789 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 /* try to encode it */
5794 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5795 if (x==enc_EXCEPTION) /* error */
5796 goto onError;
5797 if (x==enc_FAILED) { /* unencodable character */
5798 if (charmap_encoding_error(p, size, &inpos, mapping,
5799 &exc,
5800 &known_errorHandler, &errorHandler, errors,
5801 &res, &respos)) {
5802 goto onError;
5803 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005804 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 else
5806 /* done with this character => adjust input position */
5807 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005811 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005812 if (_PyBytes_Resize(&res, respos) < 0)
5813 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005814
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815 Py_XDECREF(exc);
5816 Py_XDECREF(errorHandler);
5817 return res;
5818
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 Py_XDECREF(res);
5821 Py_XDECREF(exc);
5822 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 return NULL;
5824}
5825
5826PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828{
5829 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 PyErr_BadArgument();
5831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 }
5833 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 PyUnicode_GET_SIZE(unicode),
5835 mapping,
5836 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837}
5838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839/* create or adjust a UnicodeTranslateError */
5840static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 const Py_UNICODE *unicode, Py_ssize_t size,
5842 Py_ssize_t startpos, Py_ssize_t endpos,
5843 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005846 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 }
5849 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5851 goto onError;
5852 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5853 goto onError;
5854 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5855 goto onError;
5856 return;
5857 onError:
5858 Py_DECREF(*exceptionObject);
5859 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 }
5861}
5862
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863/* raises a UnicodeTranslateError */
5864static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 const Py_UNICODE *unicode, Py_ssize_t size,
5866 Py_ssize_t startpos, Py_ssize_t endpos,
5867 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868{
5869 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005870 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873}
5874
5875/* error handling callback helper:
5876 build arguments, call the callback and check the arguments,
5877 put the result into newpos and return the replacement string, which
5878 has to be freed by the caller */
5879static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 PyObject **errorHandler,
5881 const char *reason,
5882 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5883 Py_ssize_t startpos, Py_ssize_t endpos,
5884 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005885{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005886 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005888 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 PyObject *restuple;
5890 PyObject *resunicode;
5891
5892 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896 }
5897
5898 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902
5903 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005908 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 Py_DECREF(restuple);
5910 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 }
5912 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 &resunicode, &i_newpos)) {
5914 Py_DECREF(restuple);
5915 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005917 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005919 else
5920 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005921 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5923 Py_DECREF(restuple);
5924 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005925 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 Py_INCREF(resunicode);
5927 Py_DECREF(restuple);
5928 return resunicode;
5929}
5930
5931/* Lookup the character ch in the mapping and put the result in result,
5932 which must be decrefed by the caller.
5933 Return 0 on success, -1 on error */
5934static
5935int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5936{
Christian Heimes217cfd12007-12-02 14:31:20 +00005937 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 PyObject *x;
5939
5940 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942 x = PyObject_GetItem(mapping, w);
5943 Py_DECREF(w);
5944 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5946 /* No mapping found means: use 1:1 mapping. */
5947 PyErr_Clear();
5948 *result = NULL;
5949 return 0;
5950 } else
5951 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005952 }
5953 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 *result = x;
5955 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005957 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 long value = PyLong_AS_LONG(x);
5959 long max = PyUnicode_GetMax();
5960 if (value < 0 || value > max) {
5961 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005962 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 Py_DECREF(x);
5964 return -1;
5965 }
5966 *result = x;
5967 return 0;
5968 }
5969 else if (PyUnicode_Check(x)) {
5970 *result = x;
5971 return 0;
5972 }
5973 else {
5974 /* wrong return value */
5975 PyErr_SetString(PyExc_TypeError,
5976 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005977 Py_DECREF(x);
5978 return -1;
5979 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980}
5981/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 if not reallocate and adjust various state variables.
5983 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984static
Walter Dörwald4894c302003-10-24 14:25:28 +00005985int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005988 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005989 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 /* remember old output position */
5991 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5992 /* exponentially overallocate to minimize reallocations */
5993 if (requiredsize < 2 * oldsize)
5994 requiredsize = 2 * oldsize;
5995 if (PyUnicode_Resize(outobj, requiredsize) < 0)
5996 return -1;
5997 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 }
5999 return 0;
6000}
6001/* lookup the character, put the result in the output string and adjust
6002 various state variables. Return a new reference to the object that
6003 was put in the output buffer in *result, or Py_None, if the mapping was
6004 undefined (in which case no character was written).
6005 The called must decref result.
6006 Return 0 on success, -1 on error. */
6007static
Walter Dörwald4894c302003-10-24 14:25:28 +00006008int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6010 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011{
Walter Dörwald4894c302003-10-24 14:25:28 +00006012 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 /* not found => default to 1:1 mapping */
6016 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017 }
6018 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006020 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 /* no overflow check, because we know that the space is enough */
6022 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 }
6024 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6026 if (repsize==1) {
6027 /* no overflow check, because we know that the space is enough */
6028 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6029 }
6030 else if (repsize!=0) {
6031 /* more than one character */
6032 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6033 (insize - (curinp-startinp)) +
6034 repsize - 1;
6035 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6036 return -1;
6037 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6038 *outp += repsize;
6039 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040 }
6041 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 return 0;
6044}
6045
6046PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 Py_ssize_t size,
6048 PyObject *mapping,
6049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051 /* output object */
6052 PyObject *res = NULL;
6053 /* pointers to the beginning and end+1 of input */
6054 const Py_UNICODE *startp = p;
6055 const Py_UNICODE *endp = p + size;
6056 /* pointer into the output */
6057 Py_UNICODE *str;
6058 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006059 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 char *reason = "character maps to <undefined>";
6061 PyObject *errorHandler = NULL;
6062 PyObject *exc = NULL;
6063 /* the following variable is used for caching string comparisons
6064 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6065 * 3=ignore, 4=xmlcharrefreplace */
6066 int known_errorHandler = -1;
6067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 PyErr_BadArgument();
6070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006072
6073 /* allocate enough for a simple 1:1 translation without
6074 replacements, if we need more, we'll resize */
6075 res = PyUnicode_FromUnicode(NULL, size);
6076 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 /* try to encode it */
6084 PyObject *x = NULL;
6085 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6086 Py_XDECREF(x);
6087 goto onError;
6088 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006089 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 if (x!=Py_None) /* it worked => adjust input pointer */
6091 ++p;
6092 else { /* untranslatable character */
6093 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6094 Py_ssize_t repsize;
6095 Py_ssize_t newpos;
6096 Py_UNICODE *uni2;
6097 /* startpos for collecting untranslatable chars */
6098 const Py_UNICODE *collstart = p;
6099 const Py_UNICODE *collend = p+1;
6100 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 /* find all untranslatable characters */
6103 while (collend < endp) {
6104 if (charmaptranslate_lookup(*collend, mapping, &x))
6105 goto onError;
6106 Py_XDECREF(x);
6107 if (x!=Py_None)
6108 break;
6109 ++collend;
6110 }
6111 /* cache callback name lookup
6112 * (if not done yet, i.e. it's the first error) */
6113 if (known_errorHandler==-1) {
6114 if ((errors==NULL) || (!strcmp(errors, "strict")))
6115 known_errorHandler = 1;
6116 else if (!strcmp(errors, "replace"))
6117 known_errorHandler = 2;
6118 else if (!strcmp(errors, "ignore"))
6119 known_errorHandler = 3;
6120 else if (!strcmp(errors, "xmlcharrefreplace"))
6121 known_errorHandler = 4;
6122 else
6123 known_errorHandler = 0;
6124 }
6125 switch (known_errorHandler) {
6126 case 1: /* strict */
6127 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006128 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 case 2: /* replace */
6130 /* No need to check for space, this is a 1:1 replacement */
6131 for (coll = collstart; coll<collend; ++coll)
6132 *str++ = '?';
6133 /* fall through */
6134 case 3: /* ignore */
6135 p = collend;
6136 break;
6137 case 4: /* xmlcharrefreplace */
6138 /* generate replacement (temporarily (mis)uses p) */
6139 for (p = collstart; p < collend; ++p) {
6140 char buffer[2+29+1+1];
6141 char *cp;
6142 sprintf(buffer, "&#%d;", (int)*p);
6143 if (charmaptranslate_makespace(&res, &str,
6144 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6145 goto onError;
6146 for (cp = buffer; *cp; ++cp)
6147 *str++ = *cp;
6148 }
6149 p = collend;
6150 break;
6151 default:
6152 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6153 reason, startp, size, &exc,
6154 collstart-startp, collend-startp, &newpos);
6155 if (repunicode == NULL)
6156 goto onError;
6157 /* generate replacement */
6158 repsize = PyUnicode_GET_SIZE(repunicode);
6159 if (charmaptranslate_makespace(&res, &str,
6160 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6161 Py_DECREF(repunicode);
6162 goto onError;
6163 }
6164 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6165 *str++ = *uni2;
6166 p = startp + newpos;
6167 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006169 }
6170 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 /* Resize if we allocated to much */
6172 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006173 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 if (PyUnicode_Resize(&res, respos) < 0)
6175 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 }
6177 Py_XDECREF(exc);
6178 Py_XDECREF(errorHandler);
6179 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006182 Py_XDECREF(res);
6183 Py_XDECREF(exc);
6184 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 return NULL;
6186}
6187
6188PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 PyObject *mapping,
6190 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191{
6192 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006193
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 str = PyUnicode_FromObject(str);
6195 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 PyUnicode_GET_SIZE(str),
6199 mapping,
6200 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 Py_DECREF(str);
6202 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 Py_XDECREF(str);
6206 return NULL;
6207}
Tim Petersced69f82003-09-16 20:30:58 +00006208
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006209PyObject *
6210PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6211 Py_ssize_t length)
6212{
6213 PyObject *result;
6214 Py_UNICODE *p; /* write pointer into result */
6215 Py_ssize_t i;
6216 /* Copy to a new string */
6217 result = (PyObject *)_PyUnicode_New(length);
6218 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6219 if (result == NULL)
6220 return result;
6221 p = PyUnicode_AS_UNICODE(result);
6222 /* Iterate over code points */
6223 for (i = 0; i < length; i++) {
6224 Py_UNICODE ch =s[i];
6225 if (ch > 127) {
6226 int decimal = Py_UNICODE_TODECIMAL(ch);
6227 if (decimal >= 0)
6228 p[i] = '0' + decimal;
6229 }
6230 }
6231 return result;
6232}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006233/* --- Decimal Encoder ---------------------------------------------------- */
6234
6235int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 Py_ssize_t length,
6237 char *output,
6238 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006239{
6240 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006241 PyObject *errorHandler = NULL;
6242 PyObject *exc = NULL;
6243 const char *encoding = "decimal";
6244 const char *reason = "invalid decimal Unicode string";
6245 /* the following variable is used for caching string comparisons
6246 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6247 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006248
6249 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 PyErr_BadArgument();
6251 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006252 }
6253
6254 p = s;
6255 end = s + length;
6256 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 register Py_UNICODE ch = *p;
6258 int decimal;
6259 PyObject *repunicode;
6260 Py_ssize_t repsize;
6261 Py_ssize_t newpos;
6262 Py_UNICODE *uni2;
6263 Py_UNICODE *collstart;
6264 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006265
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006267 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 ++p;
6269 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006270 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 decimal = Py_UNICODE_TODECIMAL(ch);
6272 if (decimal >= 0) {
6273 *output++ = '0' + decimal;
6274 ++p;
6275 continue;
6276 }
6277 if (0 < ch && ch < 256) {
6278 *output++ = (char)ch;
6279 ++p;
6280 continue;
6281 }
6282 /* All other characters are considered unencodable */
6283 collstart = p;
6284 collend = p+1;
6285 while (collend < end) {
6286 if ((0 < *collend && *collend < 256) ||
6287 !Py_UNICODE_ISSPACE(*collend) ||
6288 Py_UNICODE_TODECIMAL(*collend))
6289 break;
6290 }
6291 /* cache callback name lookup
6292 * (if not done yet, i.e. it's the first error) */
6293 if (known_errorHandler==-1) {
6294 if ((errors==NULL) || (!strcmp(errors, "strict")))
6295 known_errorHandler = 1;
6296 else if (!strcmp(errors, "replace"))
6297 known_errorHandler = 2;
6298 else if (!strcmp(errors, "ignore"))
6299 known_errorHandler = 3;
6300 else if (!strcmp(errors, "xmlcharrefreplace"))
6301 known_errorHandler = 4;
6302 else
6303 known_errorHandler = 0;
6304 }
6305 switch (known_errorHandler) {
6306 case 1: /* strict */
6307 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6308 goto onError;
6309 case 2: /* replace */
6310 for (p = collstart; p < collend; ++p)
6311 *output++ = '?';
6312 /* fall through */
6313 case 3: /* ignore */
6314 p = collend;
6315 break;
6316 case 4: /* xmlcharrefreplace */
6317 /* generate replacement (temporarily (mis)uses p) */
6318 for (p = collstart; p < collend; ++p)
6319 output += sprintf(output, "&#%d;", (int)*p);
6320 p = collend;
6321 break;
6322 default:
6323 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6324 encoding, reason, s, length, &exc,
6325 collstart-s, collend-s, &newpos);
6326 if (repunicode == NULL)
6327 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006328 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006329 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006330 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6331 Py_DECREF(repunicode);
6332 goto onError;
6333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 /* generate replacement */
6335 repsize = PyUnicode_GET_SIZE(repunicode);
6336 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6337 Py_UNICODE ch = *uni2;
6338 if (Py_UNICODE_ISSPACE(ch))
6339 *output++ = ' ';
6340 else {
6341 decimal = Py_UNICODE_TODECIMAL(ch);
6342 if (decimal >= 0)
6343 *output++ = '0' + decimal;
6344 else if (0 < ch && ch < 256)
6345 *output++ = (char)ch;
6346 else {
6347 Py_DECREF(repunicode);
6348 raise_encode_exception(&exc, encoding,
6349 s, length, collstart-s, collend-s, reason);
6350 goto onError;
6351 }
6352 }
6353 }
6354 p = s + newpos;
6355 Py_DECREF(repunicode);
6356 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006357 }
6358 /* 0-terminate the output string */
6359 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 Py_XDECREF(exc);
6361 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006362 return 0;
6363
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365 Py_XDECREF(exc);
6366 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006367 return -1;
6368}
6369
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370/* --- Helpers ------------------------------------------------------------ */
6371
Eric Smith8c663262007-08-25 02:26:07 +00006372#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006373#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006374
Thomas Wouters477c8d52006-05-27 19:21:47 +00006375#include "stringlib/count.h"
6376#include "stringlib/find.h"
6377#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006378#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006379
Eric Smith5807c412008-05-11 21:00:57 +00006380#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006381#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006382#include "stringlib/localeutil.h"
6383
/* Helper macro to fix up start/end slice values against a sequence of
   length len: negative values count from the end and are clamped at 0,
   and end is additionally capped at len.  start and end are modified in
   place and must be lvalues; every argument is evaluated more than once.
   The body is wrapped in do { } while (0) so that the expansion is a
   single statement and stays correct inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006398
Martin v. Löwis18e16552006-02-15 17:27:45 +00006399Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006400 PyObject *substr,
6401 Py_ssize_t start,
6402 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006404 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006405 PyUnicodeObject* str_obj;
6406 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006407
Thomas Wouters477c8d52006-05-27 19:21:47 +00006408 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6409 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006411 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6412 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 Py_DECREF(str_obj);
6414 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 }
Tim Petersced69f82003-09-16 20:30:58 +00006416
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006417 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006418 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006419 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6420 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006421 );
6422
6423 Py_DECREF(sub_obj);
6424 Py_DECREF(str_obj);
6425
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 return result;
6427}
6428
Martin v. Löwis18e16552006-02-15 17:27:45 +00006429Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006430 PyObject *sub,
6431 Py_ssize_t start,
6432 Py_ssize_t end,
6433 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006435 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006436
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006438 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006440 sub = PyUnicode_FromObject(sub);
6441 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 Py_DECREF(str);
6443 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
Tim Petersced69f82003-09-16 20:30:58 +00006445
Thomas Wouters477c8d52006-05-27 19:21:47 +00006446 if (direction > 0)
6447 result = stringlib_find_slice(
6448 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6449 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6450 start, end
6451 );
6452 else
6453 result = stringlib_rfind_slice(
6454 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6455 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6456 start, end
6457 );
6458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006460 Py_DECREF(sub);
6461
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 return result;
6463}
6464
Tim Petersced69f82003-09-16 20:30:58 +00006465static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 PyUnicodeObject *substring,
6468 Py_ssize_t start,
6469 Py_ssize_t end,
6470 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 if (substring->length == 0)
6473 return 1;
6474
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006475 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 end -= substring->length;
6477 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479
6480 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 if (Py_UNICODE_MATCH(self, end, substring))
6482 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 } else {
6484 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 }
6487
6488 return 0;
6489}
6490
Martin v. Löwis18e16552006-02-15 17:27:45 +00006491Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 PyObject *substr,
6493 Py_ssize_t start,
6494 Py_ssize_t end,
6495 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006497 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006498
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 str = PyUnicode_FromObject(str);
6500 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 substr = PyUnicode_FromObject(substr);
6503 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 Py_DECREF(str);
6505 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 }
Tim Petersced69f82003-09-16 20:30:58 +00006507
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 (PyUnicodeObject *)substr,
6510 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 Py_DECREF(str);
6512 Py_DECREF(substr);
6513 return result;
6514}
6515
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516/* Apply fixfct filter to the Unicode object self and return a
6517 reference to the modified object */
6518
Tim Petersced69f82003-09-16 20:30:58 +00006519static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522{
6523
6524 PyUnicodeObject *u;
6525
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006526 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006529
6530 Py_UNICODE_COPY(u->str, self->str, self->length);
6531
Tim Peters7a29bd52001-09-12 03:03:31 +00006532 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 /* fixfct should return TRUE if it modified the buffer. If
6534 FALSE, return a reference to the original buffer instead
6535 (to save space, not time) */
6536 Py_INCREF(self);
6537 Py_DECREF(u);
6538 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 }
6540 return (PyObject*) u;
6541}
6542
Tim Petersced69f82003-09-16 20:30:58 +00006543static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544int fixupper(PyUnicodeObject *self)
6545{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006546 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 Py_UNICODE *s = self->str;
6548 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006552
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 ch = Py_UNICODE_TOUPPER(*s);
6554 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 *s = ch;
6557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 s++;
6559 }
6560
6561 return status;
6562}
6563
Tim Petersced69f82003-09-16 20:30:58 +00006564static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565int fixlower(PyUnicodeObject *self)
6566{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006567 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 Py_UNICODE *s = self->str;
6569 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006570
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006573
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 ch = Py_UNICODE_TOLOWER(*s);
6575 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 *s = ch;
6578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 s++;
6580 }
6581
6582 return status;
6583}
6584
Tim Petersced69f82003-09-16 20:30:58 +00006585static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586int fixswapcase(PyUnicodeObject *self)
6587{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006588 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 Py_UNICODE *s = self->str;
6590 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006591
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 while (len-- > 0) {
6593 if (Py_UNICODE_ISUPPER(*s)) {
6594 *s = Py_UNICODE_TOLOWER(*s);
6595 status = 1;
6596 } else if (Py_UNICODE_ISLOWER(*s)) {
6597 *s = Py_UNICODE_TOUPPER(*s);
6598 status = 1;
6599 }
6600 s++;
6601 }
6602
6603 return status;
6604}
6605
Tim Petersced69f82003-09-16 20:30:58 +00006606static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607int fixcapitalize(PyUnicodeObject *self)
6608{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006609 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006610 Py_UNICODE *s = self->str;
6611 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006612
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006613 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006615 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 *s = Py_UNICODE_TOUPPER(*s);
6617 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006619 s++;
6620 while (--len > 0) {
6621 if (Py_UNICODE_ISUPPER(*s)) {
6622 *s = Py_UNICODE_TOLOWER(*s);
6623 status = 1;
6624 }
6625 s++;
6626 }
6627 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628}
6629
6630static
6631int fixtitle(PyUnicodeObject *self)
6632{
6633 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6634 register Py_UNICODE *e;
6635 int previous_is_cased;
6636
6637 /* Shortcut for single character strings */
6638 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6640 if (*p != ch) {
6641 *p = ch;
6642 return 1;
6643 }
6644 else
6645 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 }
Tim Petersced69f82003-09-16 20:30:58 +00006647
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 e = p + PyUnicode_GET_SIZE(self);
6649 previous_is_cased = 0;
6650 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006652
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 if (previous_is_cased)
6654 *p = Py_UNICODE_TOLOWER(ch);
6655 else
6656 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006657
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 if (Py_UNICODE_ISLOWER(ch) ||
6659 Py_UNICODE_ISUPPER(ch) ||
6660 Py_UNICODE_ISTITLE(ch))
6661 previous_is_cased = 1;
6662 else
6663 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
6665 return 1;
6666}
6667
Tim Peters8ce9f162004-08-27 01:49:32 +00006668PyObject *
6669PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670{
Skip Montanaro6543b452004-09-16 03:28:13 +00006671 const Py_UNICODE blank = ' ';
6672 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006673 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006674 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006675 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6676 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006677 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6678 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006679 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006680 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681
Tim Peters05eba1f2004-08-27 21:32:02 +00006682 fseq = PySequence_Fast(seq, "");
6683 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006684 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006685 }
6686
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006687 /* NOTE: the following code can't call back into Python code,
6688 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006689 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006690
Tim Peters05eba1f2004-08-27 21:32:02 +00006691 seqlen = PySequence_Fast_GET_SIZE(fseq);
6692 /* If empty sequence, return u"". */
6693 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006694 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6695 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006696 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006697 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006698 /* If singleton sequence with an exact Unicode, return that. */
6699 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 item = items[0];
6701 if (PyUnicode_CheckExact(item)) {
6702 Py_INCREF(item);
6703 res = (PyUnicodeObject *)item;
6704 goto Done;
6705 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006706 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006707 else {
6708 /* Set up sep and seplen */
6709 if (separator == NULL) {
6710 sep = &blank;
6711 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006712 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006713 else {
6714 if (!PyUnicode_Check(separator)) {
6715 PyErr_Format(PyExc_TypeError,
6716 "separator: expected str instance,"
6717 " %.80s found",
6718 Py_TYPE(separator)->tp_name);
6719 goto onError;
6720 }
6721 sep = PyUnicode_AS_UNICODE(separator);
6722 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006723 }
6724 }
6725
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006726 /* There are at least two things to join, or else we have a subclass
6727 * of str in the sequence.
6728 * Do a pre-pass to figure out the total amount of space we'll
6729 * need (sz), and see whether all argument are strings.
6730 */
6731 sz = 0;
6732 for (i = 0; i < seqlen; i++) {
6733 const Py_ssize_t old_sz = sz;
6734 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 if (!PyUnicode_Check(item)) {
6736 PyErr_Format(PyExc_TypeError,
6737 "sequence item %zd: expected str instance,"
6738 " %.80s found",
6739 i, Py_TYPE(item)->tp_name);
6740 goto onError;
6741 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006742 sz += PyUnicode_GET_SIZE(item);
6743 if (i != 0)
6744 sz += seplen;
6745 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6746 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006748 goto onError;
6749 }
6750 }
Tim Petersced69f82003-09-16 20:30:58 +00006751
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006752 res = _PyUnicode_New(sz);
6753 if (res == NULL)
6754 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006755
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006756 /* Catenate everything. */
6757 res_p = PyUnicode_AS_UNICODE(res);
6758 for (i = 0; i < seqlen; ++i) {
6759 Py_ssize_t itemlen;
6760 item = items[i];
6761 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 /* Copy item, and maybe the separator. */
6763 if (i) {
6764 Py_UNICODE_COPY(res_p, sep, seplen);
6765 res_p += seplen;
6766 }
6767 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6768 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006769 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006770
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006772 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 return (PyObject *)res;
6774
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006776 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006777 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 return NULL;
6779}
6780
Tim Petersced69f82003-09-16 20:30:58 +00006781static
6782PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 Py_ssize_t left,
6784 Py_ssize_t right,
6785 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786{
6787 PyUnicodeObject *u;
6788
6789 if (left < 0)
6790 left = 0;
6791 if (right < 0)
6792 right = 0;
6793
Tim Peters7a29bd52001-09-12 03:03:31 +00006794 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 Py_INCREF(self);
6796 return self;
6797 }
6798
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006799 if (left > PY_SSIZE_T_MAX - self->length ||
6800 right > PY_SSIZE_T_MAX - (left + self->length)) {
6801 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6802 return NULL;
6803 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 u = _PyUnicode_New(left + self->length + right);
6805 if (u) {
6806 if (left)
6807 Py_UNICODE_FILL(u->str, fill, left);
6808 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6809 if (right)
6810 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6811 }
6812
6813 return u;
6814}
6815
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006816PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819
6820 string = PyUnicode_FromObject(string);
6821 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006824 list = stringlib_splitlines(
6825 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6826 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827
6828 Py_DECREF(string);
6829 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830}
6831
Tim Petersced69f82003-09-16 20:30:58 +00006832static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 PyUnicodeObject *substring,
6835 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006838 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006841 return stringlib_split_whitespace(
6842 (PyObject*) self, self->str, self->length, maxcount
6843 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006845 return stringlib_split(
6846 (PyObject*) self, self->str, self->length,
6847 substring->str, substring->length,
6848 maxcount
6849 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850}
6851
Tim Petersced69f82003-09-16 20:30:58 +00006852static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006853PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 PyUnicodeObject *substring,
6855 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006856{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006857 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006858 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006859
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006860 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006861 return stringlib_rsplit_whitespace(
6862 (PyObject*) self, self->str, self->length, maxcount
6863 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006864
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006865 return stringlib_rsplit(
6866 (PyObject*) self, self->str, self->length,
6867 substring->str, substring->length,
6868 maxcount
6869 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006870}
6871
6872static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 PyUnicodeObject *str1,
6875 PyUnicodeObject *str2,
6876 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877{
6878 PyUnicodeObject *u;
6879
6880 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006882 else if (maxcount == 0 || self->length == 0)
6883 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884
Thomas Wouters477c8d52006-05-27 19:21:47 +00006885 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006886 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006887 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006888 if (str1->length == 0)
6889 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006890 if (str1->length == 1) {
6891 /* replace characters */
6892 Py_UNICODE u1, u2;
6893 if (!findchar(self->str, self->length, str1->str[0]))
6894 goto nothing;
6895 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6896 if (!u)
6897 return NULL;
6898 Py_UNICODE_COPY(u->str, self->str, self->length);
6899 u1 = str1->str[0];
6900 u2 = str2->str[0];
6901 for (i = 0; i < u->length; i++)
6902 if (u->str[i] == u1) {
6903 if (--maxcount < 0)
6904 break;
6905 u->str[i] = u2;
6906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006908 i = stringlib_find(
6909 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006911 if (i < 0)
6912 goto nothing;
6913 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6914 if (!u)
6915 return NULL;
6916 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006917
6918 /* change everything in-place, starting with this one */
6919 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6920 i += str1->length;
6921
6922 while ( --maxcount > 0) {
6923 i = stringlib_find(self->str+i, self->length-i,
6924 str1->str, str1->length,
6925 i);
6926 if (i == -1)
6927 break;
6928 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6929 i += str1->length;
6930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006933
6934 Py_ssize_t n, i, j, e;
6935 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 Py_UNICODE *p;
6937
6938 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006939 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6940 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006941 if (n == 0)
6942 goto nothing;
6943 /* new_size = self->length + n * (str2->length - str1->length)); */
6944 delta = (str2->length - str1->length);
6945 if (delta == 0) {
6946 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006948 product = n * (str2->length - str1->length);
6949 if ((product / (str2->length - str1->length)) != n) {
6950 PyErr_SetString(PyExc_OverflowError,
6951 "replace string is too long");
6952 return NULL;
6953 }
6954 new_size = self->length + product;
6955 if (new_size < 0) {
6956 PyErr_SetString(PyExc_OverflowError,
6957 "replace string is too long");
6958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 }
6960 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006961 u = _PyUnicode_New(new_size);
6962 if (!u)
6963 return NULL;
6964 i = 0;
6965 p = u->str;
6966 e = self->length - str1->length;
6967 if (str1->length > 0) {
6968 while (n-- > 0) {
6969 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006970 j = stringlib_find(self->str+i, self->length-i,
6971 str1->str, str1->length,
6972 i);
6973 if (j == -1)
6974 break;
6975 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006976 /* copy unchanged part [i:j] */
6977 Py_UNICODE_COPY(p, self->str+i, j-i);
6978 p += j - i;
6979 }
6980 /* copy substitution string */
6981 if (str2->length > 0) {
6982 Py_UNICODE_COPY(p, str2->str, str2->length);
6983 p += str2->length;
6984 }
6985 i = j + str1->length;
6986 }
6987 if (i < self->length)
6988 /* copy tail [i:] */
6989 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6990 } else {
6991 /* interleave */
6992 while (n > 0) {
6993 Py_UNICODE_COPY(p, str2->str, str2->length);
6994 p += str2->length;
6995 if (--n <= 0)
6996 break;
6997 *p++ = self->str[i++];
6998 }
6999 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007003
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007005 /* nothing to replace; return original string (when possible) */
7006 if (PyUnicode_CheckExact(self)) {
7007 Py_INCREF(self);
7008 return (PyObject *) self;
7009 }
7010 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011}
7012
7013/* --- Unicode Object Methods --------------------------------------------- */
7014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007015PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017\n\
7018Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007019characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020
7021static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007022unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 return fixup(self, fixtitle);
7025}
7026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007027PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029\n\
7030Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007031have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032
7033static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007034unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036 return fixup(self, fixcapitalize);
7037}
7038
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of str.capwords(): split on whitespace,
   capitalize each word in place in the list, then join with the default
   separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *previous = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)previous,
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(previous);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return joined;
}
#endif
7076
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007077/* Argument converter. Coerces to a single unicode character */
7078
7079static int
7080convert_uc(PyObject *obj, void *addr)
7081{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007082 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7083 PyObject *uniobj;
7084 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007085
Benjamin Peterson14339b62009-01-31 16:36:08 +00007086 uniobj = PyUnicode_FromObject(obj);
7087 if (uniobj == NULL) {
7088 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007090 return 0;
7091 }
7092 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7093 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007095 Py_DECREF(uniobj);
7096 return 0;
7097 }
7098 unistr = PyUnicode_AS_UNICODE(uniobj);
7099 *fillcharloc = unistr[0];
7100 Py_DECREF(uniobj);
7101 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007102}
7103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007104PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007107Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007108done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109
7110static PyObject *
7111unicode_center(PyUnicodeObject *self, PyObject *args)
7112{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007113 Py_ssize_t marg, left;
7114 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007115 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116
Thomas Woutersde017742006-02-16 19:34:37 +00007117 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118 return NULL;
7119
Tim Peters7a29bd52001-09-12 03:03:31 +00007120 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121 Py_INCREF(self);
7122 return (PyObject*) self;
7123 }
7124
7125 marg = width - self->length;
7126 left = marg / 2 + (marg & width & 1);
7127
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007128 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129}
7130
Marc-André Lemburge5034372000-08-08 08:04:29 +00007131#if 0
7132
7133/* This code should go into some future Unicode collation support
7134 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007135 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007136
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007137/* speedy UTF-16 code point order comparison */
7138/* gleaned from: */
7139/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7140
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007141static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007142{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007143 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007144 0, 0, 0, 0, 0, 0, 0, 0,
7145 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007146 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007147};
7148
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149static int
7150unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7151{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007152 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007153
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 Py_UNICODE *s1 = str1->str;
7155 Py_UNICODE *s2 = str2->str;
7156
7157 len1 = str1->length;
7158 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007159
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007161 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007162
7163 c1 = *s1++;
7164 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007165
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 if (c1 > (1<<11) * 26)
7167 c1 += utf16Fixup[c1>>11];
7168 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007169 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007170 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007171
7172 if (c1 != c2)
7173 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007174
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007175 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 }
7177
7178 return (len1 < len2) ? -1 : (len1 != len2);
7179}
7180
Marc-André Lemburge5034372000-08-08 08:04:29 +00007181#else
7182
7183static int
7184unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7185{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007186 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007187
7188 Py_UNICODE *s1 = str1->str;
7189 Py_UNICODE *s2 = str2->str;
7190
7191 len1 = str1->length;
7192 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007193
Marc-André Lemburge5034372000-08-08 08:04:29 +00007194 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007195 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007196
Fredrik Lundh45714e92001-06-26 16:39:36 +00007197 c1 = *s1++;
7198 c2 = *s2++;
7199
7200 if (c1 != c2)
7201 return (c1 < c2) ? -1 : 1;
7202
Marc-André Lemburge5034372000-08-08 08:04:29 +00007203 len1--; len2--;
7204 }
7205
7206 return (len1 < len2) ? -1 : (len1 != len2);
7207}
7208
7209#endif
7210
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007214 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7215 return unicode_compare((PyUnicodeObject *)left,
7216 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007217 PyErr_Format(PyExc_TypeError,
7218 "Can't compare %.100s and %.100s",
7219 left->ob_type->tp_name,
7220 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 return -1;
7222}
7223
Martin v. Löwis5b222132007-06-10 09:51:05 +00007224int
7225PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7226{
7227 int i;
7228 Py_UNICODE *id;
7229 assert(PyUnicode_Check(uni));
7230 id = PyUnicode_AS_UNICODE(uni);
7231 /* Compare Unicode string and source character set string */
7232 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 if (id[i] != str[i])
7234 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007235 /* This check keeps Python strings that end in '\0' from comparing equal
7236 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007237 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007239 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007241 return 0;
7242}
7243
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007244
Benjamin Peterson29060642009-01-31 22:14:21 +00007245#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007246 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007247
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007248PyObject *PyUnicode_RichCompare(PyObject *left,
7249 PyObject *right,
7250 int op)
7251{
7252 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007253
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007254 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7255 PyObject *v;
7256 if (((PyUnicodeObject *) left)->length !=
7257 ((PyUnicodeObject *) right)->length) {
7258 if (op == Py_EQ) {
7259 Py_INCREF(Py_False);
7260 return Py_False;
7261 }
7262 if (op == Py_NE) {
7263 Py_INCREF(Py_True);
7264 return Py_True;
7265 }
7266 }
7267 if (left == right)
7268 result = 0;
7269 else
7270 result = unicode_compare((PyUnicodeObject *)left,
7271 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007272
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007273 /* Convert the return value to a Boolean */
7274 switch (op) {
7275 case Py_EQ:
7276 v = TEST_COND(result == 0);
7277 break;
7278 case Py_NE:
7279 v = TEST_COND(result != 0);
7280 break;
7281 case Py_LE:
7282 v = TEST_COND(result <= 0);
7283 break;
7284 case Py_GE:
7285 v = TEST_COND(result >= 0);
7286 break;
7287 case Py_LT:
7288 v = TEST_COND(result == -1);
7289 break;
7290 case Py_GT:
7291 v = TEST_COND(result == 1);
7292 break;
7293 default:
7294 PyErr_BadArgument();
7295 return NULL;
7296 }
7297 Py_INCREF(v);
7298 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007299 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007300
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007301 Py_INCREF(Py_NotImplemented);
7302 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007303}
7304
Guido van Rossum403d68b2000-03-13 15:55:09 +00007305int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007307{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007308 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007309 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007310
7311 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007312 sub = PyUnicode_FromObject(element);
7313 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 PyErr_Format(PyExc_TypeError,
7315 "'in <string>' requires string as left operand, not %s",
7316 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007317 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007318 }
7319
Thomas Wouters477c8d52006-05-27 19:21:47 +00007320 str = PyUnicode_FromObject(container);
7321 if (!str) {
7322 Py_DECREF(sub);
7323 return -1;
7324 }
7325
7326 result = stringlib_contains_obj(str, sub);
7327
7328 Py_DECREF(str);
7329 Py_DECREF(sub);
7330
Guido van Rossum403d68b2000-03-13 15:55:09 +00007331 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007332}
7333
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334/* Concat to string or Unicode object giving a new Unicode object. */
7335
7336PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338{
7339 PyUnicodeObject *u = NULL, *v = NULL, *w;
7340
7341 /* Coerce the two arguments */
7342 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7343 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7346 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348
7349 /* Shortcuts */
7350 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 Py_DECREF(v);
7352 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 }
7354 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 Py_DECREF(u);
7356 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 }
7358
7359 /* Concat the two Unicode strings */
7360 w = _PyUnicode_New(u->length + v->length);
7361 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 Py_UNICODE_COPY(w->str, u->str, u->length);
7364 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7365
7366 Py_DECREF(u);
7367 Py_DECREF(v);
7368 return (PyObject *)w;
7369
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 Py_XDECREF(u);
7372 Py_XDECREF(v);
7373 return NULL;
7374}
7375
Walter Dörwald1ab83302007-05-18 17:15:44 +00007376void
7377PyUnicode_Append(PyObject **pleft, PyObject *right)
7378{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007379 PyObject *new;
7380 if (*pleft == NULL)
7381 return;
7382 if (right == NULL || !PyUnicode_Check(*pleft)) {
7383 Py_DECREF(*pleft);
7384 *pleft = NULL;
7385 return;
7386 }
7387 new = PyUnicode_Concat(*pleft, right);
7388 Py_DECREF(*pleft);
7389 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007390}
7391
7392void
7393PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7394{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007395 PyUnicode_Append(pleft, right);
7396 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007397}
7398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007399PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007402Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007403string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007404interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
7406static PyObject *
7407unicode_count(PyUnicodeObject *self, PyObject *args)
7408{
7409 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007410 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007411 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412 PyObject *result;
7413
Guido van Rossumb8872e62000-05-09 14:14:27 +00007414 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416 return NULL;
7417
7418 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007419 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007422
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007423 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007424 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007425 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007426 substring->str, substring->length,
7427 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007428 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
7430 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007431
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 return result;
7433}
7434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007435PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007436 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007438Encode S using the codec registered for encoding. Default encoding\n\
7439is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007440handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007441a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7442'xmlcharrefreplace' as well as any other name registered with\n\
7443codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444
7445static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007446unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007448 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 char *encoding = NULL;
7450 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007451
Benjamin Peterson308d6372009-09-18 21:42:35 +00007452 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7453 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007455 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007456}
7457
Georg Brandl02524622010-12-02 18:06:51 +00007458PyDoc_STRVAR(transform__doc__,
7459 "S.transform(encoding, errors='strict') -> str\n\
7460\n\
7461Transform S using the codec registered for encoding. errors may be given\n\
7462to set a different error handling scheme.");
7463
7464static PyObject *
7465unicode_transform(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7466{
7467 static char *kwlist[] = {"encoding", "errors", 0};
7468 char *encoding = NULL;
7469 char *errors = NULL;
7470
7471 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:transform",
7472 kwlist, &encoding, &errors))
7473 return NULL;
7474 return PyUnicode_AsEncodedUnicode((PyObject *)self, encoding, errors);
7475}
7476
7477PyDoc_STRVAR(untransform__doc__,
7478 "S.untransform(encoding, errors='strict') -> str\n\
7479\n\
7480Reverse-transform S using the codec registered for encoding. errors may be\n\
7481given to set a different error handling scheme.");
7482
7483static PyObject *
7484unicode_untransform(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7485{
7486 static char *kwlist[] = {"encoding", "errors", 0};
7487 char *encoding = NULL;
7488 char *errors = NULL;
7489
7490 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|s:untransform",
7491 kwlist, &encoding, &errors))
7492 return NULL;
7493 return PyUnicode_AsDecodedUnicode((PyObject *)self, encoding, errors);
7494}
7495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007496PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498\n\
7499Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007500If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501
7502static PyObject*
7503unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7504{
7505 Py_UNICODE *e;
7506 Py_UNICODE *p;
7507 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007508 Py_UNICODE *qe;
7509 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 PyUnicodeObject *u;
7511 int tabsize = 8;
7512
7513 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515
Thomas Wouters7e474022000-07-16 12:04:32 +00007516 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007517 i = 0; /* chars up to and including most recent \n or \r */
7518 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7519 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 for (p = self->str; p < e; p++)
7521 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 if (tabsize > 0) {
7523 incr = tabsize - (j % tabsize); /* cannot overflow */
7524 if (j > PY_SSIZE_T_MAX - incr)
7525 goto overflow1;
7526 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 if (j > PY_SSIZE_T_MAX - 1)
7531 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 j++;
7533 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 if (i > PY_SSIZE_T_MAX - j)
7535 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007537 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 }
7539 }
7540
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007541 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007543
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 /* Second pass: create output string and fill it */
7545 u = _PyUnicode_New(i + j);
7546 if (!u)
7547 return NULL;
7548
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007549 j = 0; /* same as in first pass */
7550 q = u->str; /* next output char */
7551 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552
7553 for (p = self->str; p < e; p++)
7554 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 if (tabsize > 0) {
7556 i = tabsize - (j % tabsize);
7557 j += i;
7558 while (i--) {
7559 if (q >= qe)
7560 goto overflow2;
7561 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007562 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007564 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 else {
7566 if (q >= qe)
7567 goto overflow2;
7568 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007569 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570 if (*p == '\n' || *p == '\r')
7571 j = 0;
7572 }
7573
7574 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007575
7576 overflow2:
7577 Py_DECREF(u);
7578 overflow1:
7579 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581}
7582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585\n\
7586Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007587such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588arguments start and end are interpreted as in slice notation.\n\
7589\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007590Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591
7592static PyObject *
7593unicode_find(PyUnicodeObject *self, PyObject *args)
7594{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007595 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007596 Py_ssize_t start;
7597 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007598 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599
Christian Heimes9cd17752007-11-18 19:35:23 +00007600 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
Thomas Wouters477c8d52006-05-27 19:21:47 +00007603 result = stringlib_find_slice(
7604 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7605 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7606 start, end
7607 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608
7609 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007610
Christian Heimes217cfd12007-12-02 14:31:20 +00007611 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612}
7613
7614static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007615unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616{
7617 if (index < 0 || index >= self->length) {
7618 PyErr_SetString(PyExc_IndexError, "string index out of range");
7619 return NULL;
7620 }
7621
7622 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7623}
7624
Guido van Rossumc2504932007-09-18 19:42:40 +00007625/* Believe it or not, this produces the same value for ASCII strings
7626 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007627static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007628unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629{
Guido van Rossumc2504932007-09-18 19:42:40 +00007630 Py_ssize_t len;
7631 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007632 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007633
7634 if (self->hash != -1)
7635 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007636 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007637 p = self->str;
7638 x = *p << 7;
7639 while (--len >= 0)
7640 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007641 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007642 if (x == -1)
7643 x = -2;
7644 self->hash = x;
7645 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646}
7647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007648PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007651Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652
7653static PyObject *
7654unicode_index(PyUnicodeObject *self, PyObject *args)
7655{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007656 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007657 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007658 Py_ssize_t start;
7659 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
Christian Heimes9cd17752007-11-18 19:35:23 +00007661 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663
Thomas Wouters477c8d52006-05-27 19:21:47 +00007664 result = stringlib_find_slice(
7665 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7666 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7667 start, end
7668 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
7670 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007671
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672 if (result < 0) {
7673 PyErr_SetString(PyExc_ValueError, "substring not found");
7674 return NULL;
7675 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007676
Christian Heimes217cfd12007-12-02 14:31:20 +00007677 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678}
7679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007680PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007683Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007684at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685
7686static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007687unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688{
7689 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7690 register const Py_UNICODE *e;
7691 int cased;
7692
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 /* Shortcut for single character strings */
7694 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007697 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007698 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007700
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 e = p + PyUnicode_GET_SIZE(self);
7702 cased = 0;
7703 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007705
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7707 return PyBool_FromLong(0);
7708 else if (!cased && Py_UNICODE_ISLOWER(ch))
7709 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007711 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712}
7713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007714PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007717Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007718at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
7720static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007721unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722{
7723 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7724 register const Py_UNICODE *e;
7725 int cased;
7726
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 /* Shortcut for single character strings */
7728 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007731 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007732 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007734
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 e = p + PyUnicode_GET_SIZE(self);
7736 cased = 0;
7737 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007739
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7741 return PyBool_FromLong(0);
7742 else if (!cased && Py_UNICODE_ISUPPER(ch))
7743 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007745 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746}
7747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007751Return True if S is a titlecased string and there is at least one\n\
7752character in S, i.e. upper- and titlecase characters may only\n\
7753follow uncased characters and lowercase characters only cased ones.\n\
7754Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
7756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007757unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758{
7759 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7760 register const Py_UNICODE *e;
7761 int cased, previous_is_cased;
7762
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 /* Shortcut for single character strings */
7764 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7766 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007768 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007769 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007771
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 e = p + PyUnicode_GET_SIZE(self);
7773 cased = 0;
7774 previous_is_cased = 0;
7775 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007777
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7779 if (previous_is_cased)
7780 return PyBool_FromLong(0);
7781 previous_is_cased = 1;
7782 cased = 1;
7783 }
7784 else if (Py_UNICODE_ISLOWER(ch)) {
7785 if (!previous_is_cased)
7786 return PyBool_FromLong(0);
7787 previous_is_cased = 1;
7788 cased = 1;
7789 }
7790 else
7791 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007793 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794}
7795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007796PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007799Return True if all characters in S are whitespace\n\
7800and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801
7802static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007803unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804{
7805 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7806 register const Py_UNICODE *e;
7807
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 /* Shortcut for single character strings */
7809 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 Py_UNICODE_ISSPACE(*p))
7811 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007813 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007814 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007816
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817 e = p + PyUnicode_GET_SIZE(self);
7818 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 if (!Py_UNICODE_ISSPACE(*p))
7820 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007822 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823}
7824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007825PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007827\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007828Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007829and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007830
7831static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007832unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007833{
7834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7835 register const Py_UNICODE *e;
7836
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007837 /* Shortcut for single character strings */
7838 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 Py_UNICODE_ISALPHA(*p))
7840 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007841
7842 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007843 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007845
7846 e = p + PyUnicode_GET_SIZE(self);
7847 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 if (!Py_UNICODE_ISALPHA(*p))
7849 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007850 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007851 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007852}
7853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007854PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007856\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007857Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007858and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007859
7860static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007861unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007862{
7863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7864 register const Py_UNICODE *e;
7865
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007866 /* Shortcut for single character strings */
7867 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 Py_UNICODE_ISALNUM(*p))
7869 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007870
7871 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007872 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007874
7875 e = p + PyUnicode_GET_SIZE(self);
7876 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 if (!Py_UNICODE_ISALNUM(*p))
7878 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007879 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007880 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007881}
7882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007883PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007886Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007887False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888
7889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007890unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891{
7892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7893 register const Py_UNICODE *e;
7894
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895 /* Shortcut for single character strings */
7896 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 Py_UNICODE_ISDECIMAL(*p))
7898 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007900 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007901 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007903
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904 e = p + PyUnicode_GET_SIZE(self);
7905 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 if (!Py_UNICODE_ISDECIMAL(*p))
7907 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007909 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910}
7911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007912PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007915Return True if all characters in S are digits\n\
7916and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917
7918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007919unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920{
7921 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7922 register const Py_UNICODE *e;
7923
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924 /* Shortcut for single character strings */
7925 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 Py_UNICODE_ISDIGIT(*p))
7927 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007929 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007930 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007932
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 e = p + PyUnicode_GET_SIZE(self);
7934 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 if (!Py_UNICODE_ISDIGIT(*p))
7936 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007938 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939}
7940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007941PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007944Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007945False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946
7947static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007948unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949{
7950 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7951 register const Py_UNICODE *e;
7952
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 /* Shortcut for single character strings */
7954 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 Py_UNICODE_ISNUMERIC(*p))
7956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007958 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007959 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007961
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 e = p + PyUnicode_GET_SIZE(self);
7963 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 if (!Py_UNICODE_ISNUMERIC(*p))
7965 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007967 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968}
7969
Martin v. Löwis47383402007-08-15 07:32:56 +00007970int
7971PyUnicode_IsIdentifier(PyObject *self)
7972{
7973 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7974 register const Py_UNICODE *e;
7975
7976 /* Special case for empty strings */
7977 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007979
7980 /* PEP 3131 says that the first character must be in
7981 XID_Start and subsequent characters in XID_Continue,
7982 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007983 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007984 letters, digits, underscore). However, given the current
7985 definition of XID_Start and XID_Continue, it is sufficient
7986 to check just for these, except that _ must be allowed
7987 as starting an identifier. */
7988 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7989 return 0;
7990
7991 e = p + PyUnicode_GET_SIZE(self);
7992 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 if (!_PyUnicode_IsXidContinue(*p))
7994 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007995 }
7996 return 1;
7997}
7998
7999PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008001\n\
8002Return True if S is a valid identifier according\n\
8003to the language definition.");
8004
8005static PyObject*
8006unicode_isidentifier(PyObject *self)
8007{
8008 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8009}
8010
Georg Brandl559e5d72008-06-11 18:37:52 +00008011PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008013\n\
8014Return True if all characters in S are considered\n\
8015printable in repr() or S is empty, False otherwise.");
8016
8017static PyObject*
8018unicode_isprintable(PyObject *self)
8019{
8020 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8021 register const Py_UNICODE *e;
8022
8023 /* Shortcut for single character strings */
8024 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8025 Py_RETURN_TRUE;
8026 }
8027
8028 e = p + PyUnicode_GET_SIZE(self);
8029 for (; p < e; p++) {
8030 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8031 Py_RETURN_FALSE;
8032 }
8033 }
8034 Py_RETURN_TRUE;
8035}
8036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008037PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008038 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039\n\
8040Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008041iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042
8043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008044unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008046 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047}
8048
Martin v. Löwis18e16552006-02-15 17:27:45 +00008049static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050unicode_length(PyUnicodeObject *self)
8051{
8052 return self->length;
8053}
8054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008055PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008058Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008059done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060
8061static PyObject *
8062unicode_ljust(PyUnicodeObject *self, PyObject *args)
8063{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008064 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008065 Py_UNICODE fillchar = ' ';
8066
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008067 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068 return NULL;
8069
Tim Peters7a29bd52001-09-12 03:03:31 +00008070 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 Py_INCREF(self);
8072 return (PyObject*) self;
8073 }
8074
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008075 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076}
8077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008078PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008081Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082
8083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008084unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086 return fixup(self, fixlower);
8087}
8088
/* Selects which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings for the strip family, indexed by the
   *STRIP constants above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string with its three
   leading characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008098/* externally visible for str.strip(unicode) */
8099PyObject *
8100_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8101{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008102 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8103 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8104 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8105 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8106 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008107
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008109
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110 i = 0;
8111 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8113 i++;
8114 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008115 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008116
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 j = len;
8118 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 do {
8120 j--;
8121 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8122 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008123 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008124
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 Py_INCREF(self);
8127 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128 }
8129 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008131}
8132
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133
8134static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008135do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008137 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8138 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008139
Benjamin Peterson14339b62009-01-31 16:36:08 +00008140 i = 0;
8141 if (striptype != RIGHTSTRIP) {
8142 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8143 i++;
8144 }
8145 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008146
Benjamin Peterson14339b62009-01-31 16:36:08 +00008147 j = len;
8148 if (striptype != LEFTSTRIP) {
8149 do {
8150 j--;
8151 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8152 j++;
8153 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008154
Benjamin Peterson14339b62009-01-31 16:36:08 +00008155 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8156 Py_INCREF(self);
8157 return (PyObject*)self;
8158 }
8159 else
8160 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161}
8162
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008163
8164static PyObject *
8165do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8166{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008167 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008168
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8170 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008171
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 if (sep != NULL && sep != Py_None) {
8173 if (PyUnicode_Check(sep))
8174 return _PyUnicode_XStrip(self, striptype, sep);
8175 else {
8176 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 "%s arg must be None or str",
8178 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 return NULL;
8180 }
8181 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008182
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008184}
8185
8186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008187PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008189\n\
8190Return a copy of the string S with leading and trailing\n\
8191whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008192If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008193
8194static PyObject *
8195unicode_strip(PyUnicodeObject *self, PyObject *args)
8196{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008197 if (PyTuple_GET_SIZE(args) == 0)
8198 return do_strip(self, BOTHSTRIP); /* Common case */
8199 else
8200 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008201}
8202
8203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008204PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008206\n\
8207Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008208If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008209
8210static PyObject *
8211unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8212{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008213 if (PyTuple_GET_SIZE(args) == 0)
8214 return do_strip(self, LEFTSTRIP); /* Common case */
8215 else
8216 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008217}
8218
8219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008220PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008222\n\
8223Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008224If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008225
8226static PyObject *
8227unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8228{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 if (PyTuple_GET_SIZE(args) == 0)
8230 return do_strip(self, RIGHTSTRIP); /* Common case */
8231 else
8232 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008233}
8234
8235
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008237unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238{
8239 PyUnicodeObject *u;
8240 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008241 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008242 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243
Georg Brandl222de0f2009-04-12 12:01:50 +00008244 if (len < 1) {
8245 Py_INCREF(unicode_empty);
8246 return (PyObject *)unicode_empty;
8247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248
Tim Peters7a29bd52001-09-12 03:03:31 +00008249 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 /* no repeat, return original string */
8251 Py_INCREF(str);
8252 return (PyObject*) str;
8253 }
Tim Peters8f422462000-09-09 06:13:41 +00008254
8255 /* ensure # of chars needed doesn't overflow int and # of bytes
8256 * needed doesn't overflow size_t
8257 */
8258 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008259 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008260 PyErr_SetString(PyExc_OverflowError,
8261 "repeated string is too long");
8262 return NULL;
8263 }
8264 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8265 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8266 PyErr_SetString(PyExc_OverflowError,
8267 "repeated string is too long");
8268 return NULL;
8269 }
8270 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 if (!u)
8272 return NULL;
8273
8274 p = u->str;
8275
Georg Brandl222de0f2009-04-12 12:01:50 +00008276 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008277 Py_UNICODE_FILL(p, str->str[0], len);
8278 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008279 Py_ssize_t done = str->length; /* number of characters copied this far */
8280 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008282 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008283 Py_UNICODE_COPY(p+done, p, n);
8284 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 }
8287
8288 return (PyObject*) u;
8289}
8290
8291PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 PyObject *subobj,
8293 PyObject *replobj,
8294 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295{
8296 PyObject *self;
8297 PyObject *str1;
8298 PyObject *str2;
8299 PyObject *result;
8300
8301 self = PyUnicode_FromObject(obj);
8302 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 str1 = PyUnicode_FromObject(subobj);
8305 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 Py_DECREF(self);
8307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 }
8309 str2 = PyUnicode_FromObject(replobj);
8310 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 Py_DECREF(self);
8312 Py_DECREF(str1);
8313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 }
Tim Petersced69f82003-09-16 20:30:58 +00008315 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 (PyUnicodeObject *)str1,
8317 (PyUnicodeObject *)str2,
8318 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 Py_DECREF(self);
8320 Py_DECREF(str1);
8321 Py_DECREF(str2);
8322 return result;
8323}
8324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008325PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008326 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327\n\
8328Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008329old replaced by new. If the optional argument count is\n\
8330given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331
8332static PyObject*
8333unicode_replace(PyUnicodeObject *self, PyObject *args)
8334{
8335 PyUnicodeObject *str1;
8336 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008337 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 PyObject *result;
8339
Martin v. Löwis18e16552006-02-15 17:27:45 +00008340 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 return NULL;
8342 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8343 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008346 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 Py_DECREF(str1);
8348 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350
8351 result = replace(self, str1, str2, maxcount);
8352
8353 Py_DECREF(str1);
8354 Py_DECREF(str2);
8355 return result;
8356}
8357
8358static
8359PyObject *unicode_repr(PyObject *unicode)
8360{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008361 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008362 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008363 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8364 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8365
8366 /* XXX(nnorwitz): rather than over-allocating, it would be
8367 better to choose a different scheme. Perhaps scan the
8368 first N-chars of the string and allocate based on that size.
8369 */
8370 /* Initial allocation is based on the longest-possible unichr
8371 escape.
8372
8373 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8374 unichr, so in this case it's the longest unichr escape. In
8375 narrow (UTF-16) builds this is five chars per source unichr
8376 since there are two unichrs in the surrogate pair, so in narrow
8377 (UTF-16) builds it's not the longest unichr escape.
8378
8379 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8380 so in the narrow (UTF-16) build case it's the longest unichr
8381 escape.
8382 */
8383
Walter Dörwald1ab83302007-05-18 17:15:44 +00008384 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008386#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008388#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008390#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008392 if (repr == NULL)
8393 return NULL;
8394
Walter Dörwald1ab83302007-05-18 17:15:44 +00008395 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008396
8397 /* Add quote */
8398 *p++ = (findchar(s, size, '\'') &&
8399 !findchar(s, size, '"')) ? '"' : '\'';
8400 while (size-- > 0) {
8401 Py_UNICODE ch = *s++;
8402
8403 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008404 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008405 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008406 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008407 continue;
8408 }
8409
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008411 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008412 *p++ = '\\';
8413 *p++ = 't';
8414 }
8415 else if (ch == '\n') {
8416 *p++ = '\\';
8417 *p++ = 'n';
8418 }
8419 else if (ch == '\r') {
8420 *p++ = '\\';
8421 *p++ = 'r';
8422 }
8423
8424 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008425 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008426 *p++ = '\\';
8427 *p++ = 'x';
8428 *p++ = hexdigits[(ch >> 4) & 0x000F];
8429 *p++ = hexdigits[ch & 0x000F];
8430 }
8431
Georg Brandl559e5d72008-06-11 18:37:52 +00008432 /* Copy ASCII characters as-is */
8433 else if (ch < 0x7F) {
8434 *p++ = ch;
8435 }
8436
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008438 else {
8439 Py_UCS4 ucs = ch;
8440
8441#ifndef Py_UNICODE_WIDE
8442 Py_UNICODE ch2 = 0;
8443 /* Get code point from surrogate pair */
8444 if (size > 0) {
8445 ch2 = *s;
8446 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008450 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008451 size--;
8452 }
8453 }
8454#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008456 (categories Z* and C* except ASCII space)
8457 */
8458 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8459 /* Map 8-bit characters to '\xhh' */
8460 if (ucs <= 0xff) {
8461 *p++ = '\\';
8462 *p++ = 'x';
8463 *p++ = hexdigits[(ch >> 4) & 0x000F];
8464 *p++ = hexdigits[ch & 0x000F];
8465 }
8466 /* Map 21-bit characters to '\U00xxxxxx' */
8467 else if (ucs >= 0x10000) {
8468 *p++ = '\\';
8469 *p++ = 'U';
8470 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8471 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8472 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8473 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8474 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8475 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8476 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8477 *p++ = hexdigits[ucs & 0x0000000F];
8478 }
8479 /* Map 16-bit characters to '\uxxxx' */
8480 else {
8481 *p++ = '\\';
8482 *p++ = 'u';
8483 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8484 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8485 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8486 *p++ = hexdigits[ucs & 0x000F];
8487 }
8488 }
8489 /* Copy characters as-is */
8490 else {
8491 *p++ = ch;
8492#ifndef Py_UNICODE_WIDE
8493 if (ucs >= 0x10000)
8494 *p++ = ch2;
8495#endif
8496 }
8497 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008498 }
8499 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008500 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008501
8502 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008503 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008504 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505}
8506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008507PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509\n\
8510Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008511such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512arguments start and end are interpreted as in slice notation.\n\
8513\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008514Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515
8516static PyObject *
8517unicode_rfind(PyUnicodeObject *self, PyObject *args)
8518{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008519 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008520 Py_ssize_t start;
8521 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008522 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523
Christian Heimes9cd17752007-11-18 19:35:23 +00008524 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526
Thomas Wouters477c8d52006-05-27 19:21:47 +00008527 result = stringlib_rfind_slice(
8528 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8529 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8530 start, end
8531 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532
8533 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008534
Christian Heimes217cfd12007-12-02 14:31:20 +00008535 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536}
8537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008538PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008541Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
8543static PyObject *
8544unicode_rindex(PyUnicodeObject *self, PyObject *args)
8545{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008546 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008547 Py_ssize_t start;
8548 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008549 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550
Christian Heimes9cd17752007-11-18 19:35:23 +00008551 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008552 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553
Thomas Wouters477c8d52006-05-27 19:21:47 +00008554 result = stringlib_rfind_slice(
8555 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8556 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8557 start, end
8558 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559
8560 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008561
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 if (result < 0) {
8563 PyErr_SetString(PyExc_ValueError, "substring not found");
8564 return NULL;
8565 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008566 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567}
8568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008569PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008572Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008573done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574
8575static PyObject *
8576unicode_rjust(PyUnicodeObject *self, PyObject *args)
8577{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008578 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008579 Py_UNICODE fillchar = ' ';
8580
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008581 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 return NULL;
8583
Tim Peters7a29bd52001-09-12 03:03:31 +00008584 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 Py_INCREF(self);
8586 return (PyObject*) self;
8587 }
8588
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008589 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590}
8591
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 PyObject *sep,
8594 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595{
8596 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008597
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 s = PyUnicode_FromObject(s);
8599 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008600 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 if (sep != NULL) {
8602 sep = PyUnicode_FromObject(sep);
8603 if (sep == NULL) {
8604 Py_DECREF(s);
8605 return NULL;
8606 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607 }
8608
8609 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8610
8611 Py_DECREF(s);
8612 Py_XDECREF(sep);
8613 return result;
8614}
8615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008616PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618\n\
8619Return a list of the words in S, using sep as the\n\
8620delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008621splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008622whitespace string is a separator and empty strings are\n\
8623removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624
8625static PyObject*
8626unicode_split(PyUnicodeObject *self, PyObject *args)
8627{
8628 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008629 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630
Martin v. Löwis18e16552006-02-15 17:27:45 +00008631 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 return NULL;
8633
8634 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640}
8641
Thomas Wouters477c8d52006-05-27 19:21:47 +00008642PyObject *
8643PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8644{
8645 PyObject* str_obj;
8646 PyObject* sep_obj;
8647 PyObject* out;
8648
8649 str_obj = PyUnicode_FromObject(str_in);
8650 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008652 sep_obj = PyUnicode_FromObject(sep_in);
8653 if (!sep_obj) {
8654 Py_DECREF(str_obj);
8655 return NULL;
8656 }
8657
8658 out = stringlib_partition(
8659 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8660 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8661 );
8662
8663 Py_DECREF(sep_obj);
8664 Py_DECREF(str_obj);
8665
8666 return out;
8667}
8668
8669
8670PyObject *
8671PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8672{
8673 PyObject* str_obj;
8674 PyObject* sep_obj;
8675 PyObject* out;
8676
8677 str_obj = PyUnicode_FromObject(str_in);
8678 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008680 sep_obj = PyUnicode_FromObject(sep_in);
8681 if (!sep_obj) {
8682 Py_DECREF(str_obj);
8683 return NULL;
8684 }
8685
8686 out = stringlib_rpartition(
8687 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8688 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8689 );
8690
8691 Py_DECREF(sep_obj);
8692 Py_DECREF(str_obj);
8693
8694 return out;
8695}
8696
8697PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008699\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008700Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008701the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008702found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008703
8704static PyObject*
8705unicode_partition(PyUnicodeObject *self, PyObject *separator)
8706{
8707 return PyUnicode_Partition((PyObject *)self, separator);
8708}
8709
8710PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008711 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008712\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008713Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008714the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008715separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008716
8717static PyObject*
8718unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8719{
8720 return PyUnicode_RPartition((PyObject *)self, separator);
8721}
8722
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008723PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 PyObject *sep,
8725 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008726{
8727 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008728
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008729 s = PyUnicode_FromObject(s);
8730 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008731 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 if (sep != NULL) {
8733 sep = PyUnicode_FromObject(sep);
8734 if (sep == NULL) {
8735 Py_DECREF(s);
8736 return NULL;
8737 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008738 }
8739
8740 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8741
8742 Py_DECREF(s);
8743 Py_XDECREF(sep);
8744 return result;
8745}
8746
8747PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008749\n\
8750Return a list of the words in S, using sep as the\n\
8751delimiter string, starting at the end of the string and\n\
8752working to the front. If maxsplit is given, at most maxsplit\n\
8753splits are done. If sep is not specified, any whitespace string\n\
8754is a separator.");
8755
8756static PyObject*
8757unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8758{
8759 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008760 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008761
Martin v. Löwis18e16552006-02-15 17:27:45 +00008762 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008763 return NULL;
8764
8765 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008767 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008769 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008771}
8772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008773PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775\n\
8776Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008777Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008778is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779
8780static PyObject*
8781unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8782{
Guido van Rossum86662912000-04-11 15:38:46 +00008783 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784
Guido van Rossum86662912000-04-11 15:38:46 +00008785 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 return NULL;
8787
Guido van Rossum86662912000-04-11 15:38:46 +00008788 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789}
8790
8791static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008792PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793{
Walter Dörwald346737f2007-05-31 10:44:43 +00008794 if (PyUnicode_CheckExact(self)) {
8795 Py_INCREF(self);
8796 return self;
8797 } else
8798 /* Subtype -- return genuine unicode string with the same value. */
8799 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8800 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801}
8802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008803PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805\n\
8806Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008807and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808
8809static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008810unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812 return fixup(self, fixswapcase);
8813}
8814
Georg Brandlceee0772007-11-27 23:48:05 +00008815PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008817\n\
8818Return a translation table usable for str.translate().\n\
8819If there is only one argument, it must be a dictionary mapping Unicode\n\
8820ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008821Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008822If there are two arguments, they must be strings of equal length, and\n\
8823in the resulting dictionary, each character in x will be mapped to the\n\
8824character at the same position in y. If there is a third argument, it\n\
8825must be a string, whose characters will be mapped to None in the result.");
8826
8827static PyObject*
8828unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8829{
8830 PyObject *x, *y = NULL, *z = NULL;
8831 PyObject *new = NULL, *key, *value;
8832 Py_ssize_t i = 0;
8833 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008834
Georg Brandlceee0772007-11-27 23:48:05 +00008835 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8836 return NULL;
8837 new = PyDict_New();
8838 if (!new)
8839 return NULL;
8840 if (y != NULL) {
8841 /* x must be a string too, of equal length */
8842 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8843 if (!PyUnicode_Check(x)) {
8844 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8845 "be a string if there is a second argument");
8846 goto err;
8847 }
8848 if (PyUnicode_GET_SIZE(x) != ylen) {
8849 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8850 "arguments must have equal length");
8851 goto err;
8852 }
8853 /* create entries for translating chars in x to those in y */
8854 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008855 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8856 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008857 if (!key || !value)
8858 goto err;
8859 res = PyDict_SetItem(new, key, value);
8860 Py_DECREF(key);
8861 Py_DECREF(value);
8862 if (res < 0)
8863 goto err;
8864 }
8865 /* create entries for deleting chars in z */
8866 if (z != NULL) {
8867 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008868 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008869 if (!key)
8870 goto err;
8871 res = PyDict_SetItem(new, key, Py_None);
8872 Py_DECREF(key);
8873 if (res < 0)
8874 goto err;
8875 }
8876 }
8877 } else {
8878 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008879 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008880 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8881 "to maketrans it must be a dict");
8882 goto err;
8883 }
8884 /* copy entries into the new dict, converting string keys to int keys */
8885 while (PyDict_Next(x, &i, &key, &value)) {
8886 if (PyUnicode_Check(key)) {
8887 /* convert string keys to integer keys */
8888 PyObject *newkey;
8889 if (PyUnicode_GET_SIZE(key) != 1) {
8890 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8891 "table must be of length 1");
8892 goto err;
8893 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008894 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008895 if (!newkey)
8896 goto err;
8897 res = PyDict_SetItem(new, newkey, value);
8898 Py_DECREF(newkey);
8899 if (res < 0)
8900 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008901 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008902 /* just keep integer keys */
8903 if (PyDict_SetItem(new, key, value) < 0)
8904 goto err;
8905 } else {
8906 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8907 "be strings or integers");
8908 goto err;
8909 }
8910 }
8911 }
8912 return new;
8913 err:
8914 Py_DECREF(new);
8915 return NULL;
8916}
8917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008918PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920\n\
8921Return a copy of the string S, where all characters have been mapped\n\
8922through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008923Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008924Unmapped characters are left untouched. Characters mapped to None\n\
8925are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926
8927static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008928unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929{
Georg Brandlceee0772007-11-27 23:48:05 +00008930 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931}
8932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008933PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008936Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937
8938static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008939unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 return fixup(self, fixupper);
8942}
8943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008944PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008947Pad a numeric string S with zeros on the left, to fill a field\n\
8948of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949
8950static PyObject *
8951unicode_zfill(PyUnicodeObject *self, PyObject *args)
8952{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008953 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954 PyUnicodeObject *u;
8955
Martin v. Löwis18e16552006-02-15 17:27:45 +00008956 Py_ssize_t width;
8957 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 return NULL;
8959
8960 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008961 if (PyUnicode_CheckExact(self)) {
8962 Py_INCREF(self);
8963 return (PyObject*) self;
8964 }
8965 else
8966 return PyUnicode_FromUnicode(
8967 PyUnicode_AS_UNICODE(self),
8968 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 }
8971
8972 fill = width - self->length;
8973
8974 u = pad(self, fill, 0, '0');
8975
Walter Dörwald068325e2002-04-15 13:36:47 +00008976 if (u == NULL)
8977 return NULL;
8978
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 if (u->str[fill] == '+' || u->str[fill] == '-') {
8980 /* move sign to beginning of string */
8981 u->str[0] = u->str[fill];
8982 u->str[fill] = '0';
8983 }
8984
8985 return (PyObject*) u;
8986}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987
8988#if 0
8989static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008990unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991{
Christian Heimes2202f872008-02-06 14:31:34 +00008992 return PyLong_FromLong(numfree);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993}
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008994
8995static PyObject *
8996unicode__decimal2ascii(PyObject *self)
8997{
8998 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
8999 PyUnicode_GET_SIZE(self));
9000}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001#endif
9002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009003PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009006Return True if S starts with the specified prefix, False otherwise.\n\
9007With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009008With optional end, stop comparing S at that position.\n\
9009prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010
9011static PyObject *
9012unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009015 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009017 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009018 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009019 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009021 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9023 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009024 if (PyTuple_Check(subobj)) {
9025 Py_ssize_t i;
9026 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9027 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009029 if (substring == NULL)
9030 return NULL;
9031 result = tailmatch(self, substring, start, end, -1);
9032 Py_DECREF(substring);
9033 if (result) {
9034 Py_RETURN_TRUE;
9035 }
9036 }
9037 /* nothing matched */
9038 Py_RETURN_FALSE;
9039 }
9040 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009043 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009045 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046}
9047
9048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009049PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009052Return True if S ends with the specified suffix, False otherwise.\n\
9053With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009054With optional end, stop comparing S at that position.\n\
9055suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056
9057static PyObject *
9058unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009059 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009061 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009063 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009064 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009065 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009067 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9069 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009070 if (PyTuple_Check(subobj)) {
9071 Py_ssize_t i;
9072 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9073 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009075 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009076 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009077 result = tailmatch(self, substring, start, end, +1);
9078 Py_DECREF(substring);
9079 if (result) {
9080 Py_RETURN_TRUE;
9081 }
9082 }
9083 Py_RETURN_FALSE;
9084 }
9085 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009089 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009091 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092}
9093
Eric Smith8c663262007-08-25 02:26:07 +00009094#include "stringlib/string_format.h"
9095
9096PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009098\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009099Return a formatted version of S, using substitutions from args and kwargs.\n\
9100The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009101
Eric Smith27bbca62010-11-04 17:06:58 +00009102PyDoc_STRVAR(format_map__doc__,
9103 "S.format_map(mapping) -> str\n\
9104\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009105Return a formatted version of S, using substitutions from mapping.\n\
9106The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009107
Eric Smith4a7d76d2008-05-30 18:10:19 +00009108static PyObject *
9109unicode__format__(PyObject* self, PyObject* args)
9110{
9111 PyObject *format_spec;
9112
9113 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9114 return NULL;
9115
9116 return _PyUnicode_FormatAdvanced(self,
9117 PyUnicode_AS_UNICODE(format_spec),
9118 PyUnicode_GET_SIZE(format_spec));
9119}
9120
Eric Smith8c663262007-08-25 02:26:07 +00009121PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009123\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009124Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009125
9126static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009127unicode__sizeof__(PyUnicodeObject *v)
9128{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009129 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9130 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009131}
9132
9133PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009135
9136static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009137unicode_getnewargs(PyUnicodeObject *v)
9138{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009139 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009140}
9141
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142static PyMethodDef unicode_methods[] = {
9143
9144 /* Order is according to common usage: often used methods should
9145 appear first, since lookup is done sequentially. */
9146
Georg Brandl02524622010-12-02 18:06:51 +00009147 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS,
9148 encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009149 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9150 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009151 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009152 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9153 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9154 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9155 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9156 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9157 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9158 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009159 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009160 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9161 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9162 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009163 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009164 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9165 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9166 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009167 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009168 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009169 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009170 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009171 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9172 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9173 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9174 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9175 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9176 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9177 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9178 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9179 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9180 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9181 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9182 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9183 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9184 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009185 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009186 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009187 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009188 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009189 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009190 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009191 {"maketrans", (PyCFunction) unicode_maketrans,
9192 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandl02524622010-12-02 18:06:51 +00009193 {"transform", (PyCFunction) unicode_transform, METH_VARARGS | METH_KEYWORDS,
9194 transform__doc__},
9195 {"untransform", (PyCFunction) unicode_untransform, METH_VARARGS | METH_KEYWORDS,
9196 untransform__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009197 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009198#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009199 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200#endif
9201
9202#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009203 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009204 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009205 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009206#endif
9207
Benjamin Peterson14339b62009-01-31 16:36:08 +00009208 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209 {NULL, NULL}
9210};
9211
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009212static PyObject *
9213unicode_mod(PyObject *v, PyObject *w)
9214{
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 if (!PyUnicode_Check(v)) {
9216 Py_INCREF(Py_NotImplemented);
9217 return Py_NotImplemented;
9218 }
9219 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009220}
9221
9222static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009223 0, /*nb_add*/
9224 0, /*nb_subtract*/
9225 0, /*nb_multiply*/
9226 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009227};
9228
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009230 (lenfunc) unicode_length, /* sq_length */
9231 PyUnicode_Concat, /* sq_concat */
9232 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9233 (ssizeargfunc) unicode_getitem, /* sq_item */
9234 0, /* sq_slice */
9235 0, /* sq_ass_item */
9236 0, /* sq_ass_slice */
9237 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238};
9239
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009240static PyObject*
9241unicode_subscript(PyUnicodeObject* self, PyObject* item)
9242{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009243 if (PyIndex_Check(item)) {
9244 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009245 if (i == -1 && PyErr_Occurred())
9246 return NULL;
9247 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009248 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009249 return unicode_getitem(self, i);
9250 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009251 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009252 Py_UNICODE* source_buf;
9253 Py_UNICODE* result_buf;
9254 PyObject* result;
9255
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009256 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009257 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009258 return NULL;
9259 }
9260
9261 if (slicelength <= 0) {
9262 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009263 } else if (start == 0 && step == 1 && slicelength == self->length &&
9264 PyUnicode_CheckExact(self)) {
9265 Py_INCREF(self);
9266 return (PyObject *)self;
9267 } else if (step == 1) {
9268 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009269 } else {
9270 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009271 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9272 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009273
Benjamin Peterson29060642009-01-31 22:14:21 +00009274 if (result_buf == NULL)
9275 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009276
9277 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9278 result_buf[i] = source_buf[cur];
9279 }
Tim Petersced69f82003-09-16 20:30:58 +00009280
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009281 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009282 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009283 return result;
9284 }
9285 } else {
9286 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9287 return NULL;
9288 }
9289}
9290
9291static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009292 (lenfunc)unicode_length, /* mp_length */
9293 (binaryfunc)unicode_subscript, /* mp_subscript */
9294 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009295};
9296
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298/* Helpers for PyUnicode_Format() */
9299
9300static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009301getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009303 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 (*p_argidx)++;
9306 if (arglen < 0)
9307 return args;
9308 else
9309 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310 }
9311 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009312 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009313 return NULL;
9314}
9315
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009316/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009318static PyObject *
9319formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009321 char *p;
9322 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009324
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325 x = PyFloat_AsDouble(v);
9326 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009327 return NULL;
9328
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009330 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009331
Eric Smith0923d1d2009-04-16 20:16:10 +00009332 p = PyOS_double_to_string(x, type, prec,
9333 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009334 if (p == NULL)
9335 return NULL;
9336 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009337 PyMem_Free(p);
9338 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339}
9340
Tim Peters38fd5b62000-09-21 05:43:11 +00009341static PyObject*
9342formatlong(PyObject *val, int flags, int prec, int type)
9343{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009344 char *buf;
9345 int len;
9346 PyObject *str; /* temporary string object. */
9347 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009348
Benjamin Peterson14339b62009-01-31 16:36:08 +00009349 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9350 if (!str)
9351 return NULL;
9352 result = PyUnicode_FromStringAndSize(buf, len);
9353 Py_DECREF(str);
9354 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009355}
9356
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357static int
9358formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009359 size_t buflen,
9360 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009362 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009363 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 if (PyUnicode_GET_SIZE(v) == 1) {
9365 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9366 buf[1] = '\0';
9367 return 1;
9368 }
9369#ifndef Py_UNICODE_WIDE
9370 if (PyUnicode_GET_SIZE(v) == 2) {
9371 /* Decode a valid surrogate pair */
9372 int c0 = PyUnicode_AS_UNICODE(v)[0];
9373 int c1 = PyUnicode_AS_UNICODE(v)[1];
9374 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9375 0xDC00 <= c1 && c1 <= 0xDFFF) {
9376 buf[0] = c0;
9377 buf[1] = c1;
9378 buf[2] = '\0';
9379 return 2;
9380 }
9381 }
9382#endif
9383 goto onError;
9384 }
9385 else {
9386 /* Integer input truncated to a character */
9387 long x;
9388 x = PyLong_AsLong(v);
9389 if (x == -1 && PyErr_Occurred())
9390 goto onError;
9391
9392 if (x < 0 || x > 0x10ffff) {
9393 PyErr_SetString(PyExc_OverflowError,
9394 "%c arg not in range(0x110000)");
9395 return -1;
9396 }
9397
9398#ifndef Py_UNICODE_WIDE
9399 if (x > 0xffff) {
9400 x -= 0x10000;
9401 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9402 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9403 return 2;
9404 }
9405#endif
9406 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009407 buf[1] = '\0';
9408 return 1;
9409 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009410
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009412 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009413 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009414 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415}
9416
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009417/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009418 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009419*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009420#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009421
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009423 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424{
9425 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009426 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427 int args_owned = 0;
9428 PyUnicodeObject *result = NULL;
9429 PyObject *dict = NULL;
9430 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009431
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009433 PyErr_BadInternalCall();
9434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 }
9436 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009437 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 fmt = PyUnicode_AS_UNICODE(uformat);
9440 fmtcnt = PyUnicode_GET_SIZE(uformat);
9441
9442 reslen = rescnt = fmtcnt + 100;
9443 result = _PyUnicode_New(reslen);
9444 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009445 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446 res = PyUnicode_AS_UNICODE(result);
9447
9448 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009449 arglen = PyTuple_Size(args);
9450 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451 }
9452 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009453 arglen = -1;
9454 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009456 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009457 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459
9460 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009461 if (*fmt != '%') {
9462 if (--rescnt < 0) {
9463 rescnt = fmtcnt + 100;
9464 reslen += rescnt;
9465 if (_PyUnicode_Resize(&result, reslen) < 0)
9466 goto onError;
9467 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9468 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009469 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009471 }
9472 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 /* Got a format specifier */
9474 int flags = 0;
9475 Py_ssize_t width = -1;
9476 int prec = -1;
9477 Py_UNICODE c = '\0';
9478 Py_UNICODE fill;
9479 int isnumok;
9480 PyObject *v = NULL;
9481 PyObject *temp = NULL;
9482 Py_UNICODE *pbuf;
9483 Py_UNICODE sign;
9484 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009485 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 fmt++;
9488 if (*fmt == '(') {
9489 Py_UNICODE *keystart;
9490 Py_ssize_t keylen;
9491 PyObject *key;
9492 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009493
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 if (dict == NULL) {
9495 PyErr_SetString(PyExc_TypeError,
9496 "format requires a mapping");
9497 goto onError;
9498 }
9499 ++fmt;
9500 --fmtcnt;
9501 keystart = fmt;
9502 /* Skip over balanced parentheses */
9503 while (pcount > 0 && --fmtcnt >= 0) {
9504 if (*fmt == ')')
9505 --pcount;
9506 else if (*fmt == '(')
9507 ++pcount;
9508 fmt++;
9509 }
9510 keylen = fmt - keystart - 1;
9511 if (fmtcnt < 0 || pcount > 0) {
9512 PyErr_SetString(PyExc_ValueError,
9513 "incomplete format key");
9514 goto onError;
9515 }
9516#if 0
9517 /* keys are converted to strings using UTF-8 and
9518 then looked up since Python uses strings to hold
9519 variables names etc. in its namespaces and we
9520 wouldn't want to break common idioms. */
9521 key = PyUnicode_EncodeUTF8(keystart,
9522 keylen,
9523 NULL);
9524#else
9525 key = PyUnicode_FromUnicode(keystart, keylen);
9526#endif
9527 if (key == NULL)
9528 goto onError;
9529 if (args_owned) {
9530 Py_DECREF(args);
9531 args_owned = 0;
9532 }
9533 args = PyObject_GetItem(dict, key);
9534 Py_DECREF(key);
9535 if (args == NULL) {
9536 goto onError;
9537 }
9538 args_owned = 1;
9539 arglen = -1;
9540 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009541 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 while (--fmtcnt >= 0) {
9543 switch (c = *fmt++) {
9544 case '-': flags |= F_LJUST; continue;
9545 case '+': flags |= F_SIGN; continue;
9546 case ' ': flags |= F_BLANK; continue;
9547 case '#': flags |= F_ALT; continue;
9548 case '0': flags |= F_ZERO; continue;
9549 }
9550 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009551 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009552 if (c == '*') {
9553 v = getnextarg(args, arglen, &argidx);
9554 if (v == NULL)
9555 goto onError;
9556 if (!PyLong_Check(v)) {
9557 PyErr_SetString(PyExc_TypeError,
9558 "* wants int");
9559 goto onError;
9560 }
9561 width = PyLong_AsLong(v);
9562 if (width == -1 && PyErr_Occurred())
9563 goto onError;
9564 if (width < 0) {
9565 flags |= F_LJUST;
9566 width = -width;
9567 }
9568 if (--fmtcnt >= 0)
9569 c = *fmt++;
9570 }
9571 else if (c >= '0' && c <= '9') {
9572 width = c - '0';
9573 while (--fmtcnt >= 0) {
9574 c = *fmt++;
9575 if (c < '0' || c > '9')
9576 break;
9577 if ((width*10) / 10 != width) {
9578 PyErr_SetString(PyExc_ValueError,
9579 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009580 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009581 }
9582 width = width*10 + (c - '0');
9583 }
9584 }
9585 if (c == '.') {
9586 prec = 0;
9587 if (--fmtcnt >= 0)
9588 c = *fmt++;
9589 if (c == '*') {
9590 v = getnextarg(args, arglen, &argidx);
9591 if (v == NULL)
9592 goto onError;
9593 if (!PyLong_Check(v)) {
9594 PyErr_SetString(PyExc_TypeError,
9595 "* wants int");
9596 goto onError;
9597 }
9598 prec = PyLong_AsLong(v);
9599 if (prec == -1 && PyErr_Occurred())
9600 goto onError;
9601 if (prec < 0)
9602 prec = 0;
9603 if (--fmtcnt >= 0)
9604 c = *fmt++;
9605 }
9606 else if (c >= '0' && c <= '9') {
9607 prec = c - '0';
9608 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009609 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 if (c < '0' || c > '9')
9611 break;
9612 if ((prec*10) / 10 != prec) {
9613 PyErr_SetString(PyExc_ValueError,
9614 "prec too big");
9615 goto onError;
9616 }
9617 prec = prec*10 + (c - '0');
9618 }
9619 }
9620 } /* prec */
9621 if (fmtcnt >= 0) {
9622 if (c == 'h' || c == 'l' || c == 'L') {
9623 if (--fmtcnt >= 0)
9624 c = *fmt++;
9625 }
9626 }
9627 if (fmtcnt < 0) {
9628 PyErr_SetString(PyExc_ValueError,
9629 "incomplete format");
9630 goto onError;
9631 }
9632 if (c != '%') {
9633 v = getnextarg(args, arglen, &argidx);
9634 if (v == NULL)
9635 goto onError;
9636 }
9637 sign = 0;
9638 fill = ' ';
9639 switch (c) {
9640
9641 case '%':
9642 pbuf = formatbuf;
9643 /* presume that buffer length is at least 1 */
9644 pbuf[0] = '%';
9645 len = 1;
9646 break;
9647
9648 case 's':
9649 case 'r':
9650 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009651 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 temp = v;
9653 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009654 }
9655 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009656 if (c == 's')
9657 temp = PyObject_Str(v);
9658 else if (c == 'r')
9659 temp = PyObject_Repr(v);
9660 else
9661 temp = PyObject_ASCII(v);
9662 if (temp == NULL)
9663 goto onError;
9664 if (PyUnicode_Check(temp))
9665 /* nothing to do */;
9666 else {
9667 Py_DECREF(temp);
9668 PyErr_SetString(PyExc_TypeError,
9669 "%s argument has non-string str()");
9670 goto onError;
9671 }
9672 }
9673 pbuf = PyUnicode_AS_UNICODE(temp);
9674 len = PyUnicode_GET_SIZE(temp);
9675 if (prec >= 0 && len > prec)
9676 len = prec;
9677 break;
9678
9679 case 'i':
9680 case 'd':
9681 case 'u':
9682 case 'o':
9683 case 'x':
9684 case 'X':
9685 if (c == 'i')
9686 c = 'd';
9687 isnumok = 0;
9688 if (PyNumber_Check(v)) {
9689 PyObject *iobj=NULL;
9690
9691 if (PyLong_Check(v)) {
9692 iobj = v;
9693 Py_INCREF(iobj);
9694 }
9695 else {
9696 iobj = PyNumber_Long(v);
9697 }
9698 if (iobj!=NULL) {
9699 if (PyLong_Check(iobj)) {
9700 isnumok = 1;
9701 temp = formatlong(iobj, flags, prec, c);
9702 Py_DECREF(iobj);
9703 if (!temp)
9704 goto onError;
9705 pbuf = PyUnicode_AS_UNICODE(temp);
9706 len = PyUnicode_GET_SIZE(temp);
9707 sign = 1;
9708 }
9709 else {
9710 Py_DECREF(iobj);
9711 }
9712 }
9713 }
9714 if (!isnumok) {
9715 PyErr_Format(PyExc_TypeError,
9716 "%%%c format: a number is required, "
9717 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9718 goto onError;
9719 }
9720 if (flags & F_ZERO)
9721 fill = '0';
9722 break;
9723
9724 case 'e':
9725 case 'E':
9726 case 'f':
9727 case 'F':
9728 case 'g':
9729 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009730 temp = formatfloat(v, flags, prec, c);
9731 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009733 pbuf = PyUnicode_AS_UNICODE(temp);
9734 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009735 sign = 1;
9736 if (flags & F_ZERO)
9737 fill = '0';
9738 break;
9739
9740 case 'c':
9741 pbuf = formatbuf;
9742 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9743 if (len < 0)
9744 goto onError;
9745 break;
9746
9747 default:
9748 PyErr_Format(PyExc_ValueError,
9749 "unsupported format character '%c' (0x%x) "
9750 "at index %zd",
9751 (31<=c && c<=126) ? (char)c : '?',
9752 (int)c,
9753 (Py_ssize_t)(fmt - 1 -
9754 PyUnicode_AS_UNICODE(uformat)));
9755 goto onError;
9756 }
9757 if (sign) {
9758 if (*pbuf == '-' || *pbuf == '+') {
9759 sign = *pbuf++;
9760 len--;
9761 }
9762 else if (flags & F_SIGN)
9763 sign = '+';
9764 else if (flags & F_BLANK)
9765 sign = ' ';
9766 else
9767 sign = 0;
9768 }
9769 if (width < len)
9770 width = len;
9771 if (rescnt - (sign != 0) < width) {
9772 reslen -= rescnt;
9773 rescnt = width + fmtcnt + 100;
9774 reslen += rescnt;
9775 if (reslen < 0) {
9776 Py_XDECREF(temp);
9777 PyErr_NoMemory();
9778 goto onError;
9779 }
9780 if (_PyUnicode_Resize(&result, reslen) < 0) {
9781 Py_XDECREF(temp);
9782 goto onError;
9783 }
9784 res = PyUnicode_AS_UNICODE(result)
9785 + reslen - rescnt;
9786 }
9787 if (sign) {
9788 if (fill != ' ')
9789 *res++ = sign;
9790 rescnt--;
9791 if (width > len)
9792 width--;
9793 }
9794 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9795 assert(pbuf[0] == '0');
9796 assert(pbuf[1] == c);
9797 if (fill != ' ') {
9798 *res++ = *pbuf++;
9799 *res++ = *pbuf++;
9800 }
9801 rescnt -= 2;
9802 width -= 2;
9803 if (width < 0)
9804 width = 0;
9805 len -= 2;
9806 }
9807 if (width > len && !(flags & F_LJUST)) {
9808 do {
9809 --rescnt;
9810 *res++ = fill;
9811 } while (--width > len);
9812 }
9813 if (fill == ' ') {
9814 if (sign)
9815 *res++ = sign;
9816 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9817 assert(pbuf[0] == '0');
9818 assert(pbuf[1] == c);
9819 *res++ = *pbuf++;
9820 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009821 }
9822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009823 Py_UNICODE_COPY(res, pbuf, len);
9824 res += len;
9825 rescnt -= len;
9826 while (--width >= len) {
9827 --rescnt;
9828 *res++ = ' ';
9829 }
9830 if (dict && (argidx < arglen) && c != '%') {
9831 PyErr_SetString(PyExc_TypeError,
9832 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009833 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009834 goto onError;
9835 }
9836 Py_XDECREF(temp);
9837 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838 } /* until end */
9839 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009840 PyErr_SetString(PyExc_TypeError,
9841 "not all arguments converted during string formatting");
9842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843 }
9844
Thomas Woutersa96affe2006-03-12 00:29:36 +00009845 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009846 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009848 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849 }
9850 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851 return (PyObject *)result;
9852
Benjamin Peterson29060642009-01-31 22:14:21 +00009853 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854 Py_XDECREF(result);
9855 Py_DECREF(uformat);
9856 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009857 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858 }
9859 return NULL;
9860}
9861
Jeremy Hylton938ace62002-07-17 16:30:39 +00009862static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009863unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9864
Tim Peters6d6c1a32001-08-02 04:15:00 +00009865static PyObject *
9866unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9867{
Benjamin Peterson29060642009-01-31 22:14:21 +00009868 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009869 static char *kwlist[] = {"object", "encoding", "errors", 0};
9870 char *encoding = NULL;
9871 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009872
Benjamin Peterson14339b62009-01-31 16:36:08 +00009873 if (type != &PyUnicode_Type)
9874 return unicode_subtype_new(type, args, kwds);
9875 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009876 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009877 return NULL;
9878 if (x == NULL)
9879 return (PyObject *)_PyUnicode_New(0);
9880 if (encoding == NULL && errors == NULL)
9881 return PyObject_Str(x);
9882 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009883 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009884}
9885
Guido van Rossume023fe02001-08-30 03:12:59 +00009886static PyObject *
9887unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9888{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009889 PyUnicodeObject *tmp, *pnew;
9890 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009891
Benjamin Peterson14339b62009-01-31 16:36:08 +00009892 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9893 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9894 if (tmp == NULL)
9895 return NULL;
9896 assert(PyUnicode_Check(tmp));
9897 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9898 if (pnew == NULL) {
9899 Py_DECREF(tmp);
9900 return NULL;
9901 }
9902 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9903 if (pnew->str == NULL) {
9904 _Py_ForgetReference((PyObject *)pnew);
9905 PyObject_Del(pnew);
9906 Py_DECREF(tmp);
9907 return PyErr_NoMemory();
9908 }
9909 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9910 pnew->length = n;
9911 pnew->hash = tmp->hash;
9912 Py_DECREF(tmp);
9913 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009914}
9915
/* Docstring of the str type; installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009922
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Slots that are 0 are left unset in this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseObject_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
9968
9969/* Initialize the Unicode implementation */
9970
Thomas Wouters78890102000-07-22 19:25:51 +00009971void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009973 int i;
9974
Thomas Wouters477c8d52006-05-27 19:21:47 +00009975 /* XXX - move this array to unicodectype.c ? */
9976 Py_UNICODE linebreak[] = {
9977 0x000A, /* LINE FEED */
9978 0x000D, /* CARRIAGE RETURN */
9979 0x001C, /* FILE SEPARATOR */
9980 0x001D, /* GROUP SEPARATOR */
9981 0x001E, /* RECORD SEPARATOR */
9982 0x0085, /* NEXT LINE */
9983 0x2028, /* LINE SEPARATOR */
9984 0x2029, /* PARAGRAPH SEPARATOR */
9985 };
9986
Fred Drakee4315f52000-05-09 19:53:39 +00009987 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009988 free_list = NULL;
9989 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009991 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009992 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009993
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009994 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009995 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009996 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009998
9999 /* initialize the linebreak bloom filter */
10000 bloom_linebreak = make_bloom_mask(
10001 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10002 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010003
10004 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005}
10006
10007/* Finalize the Unicode implementation */
10008
Christian Heimesa156e092008-02-16 07:38:31 +000010009int
10010PyUnicode_ClearFreeList(void)
10011{
10012 int freelist_size = numfree;
10013 PyUnicodeObject *u;
10014
10015 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010016 PyUnicodeObject *v = u;
10017 u = *(PyUnicodeObject **)u;
10018 if (v->str)
10019 PyObject_DEL(v->str);
10020 Py_XDECREF(v->defenc);
10021 PyObject_Del(v);
10022 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010023 }
10024 free_list = NULL;
10025 assert(numfree == 0);
10026 return freelist_size;
10027}
10028
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029void
Thomas Wouters78890102000-07-22 19:25:51 +000010030_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010032 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010034 Py_XDECREF(unicode_empty);
10035 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010036
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010037 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010038 if (unicode_latin1[i]) {
10039 Py_DECREF(unicode_latin1[i]);
10040 unicode_latin1[i] = NULL;
10041 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010042 }
Christian Heimesa156e092008-02-16 07:38:31 +000010043 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010045
Walter Dörwald16807132007-05-25 13:52:07 +000010046void
10047PyUnicode_InternInPlace(PyObject **p)
10048{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010049 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10050 PyObject *t;
10051 if (s == NULL || !PyUnicode_Check(s))
10052 Py_FatalError(
10053 "PyUnicode_InternInPlace: unicode strings only please!");
10054 /* If it's a subclass, we don't really know what putting
10055 it in the interned dict might do. */
10056 if (!PyUnicode_CheckExact(s))
10057 return;
10058 if (PyUnicode_CHECK_INTERNED(s))
10059 return;
10060 if (interned == NULL) {
10061 interned = PyDict_New();
10062 if (interned == NULL) {
10063 PyErr_Clear(); /* Don't leave an exception */
10064 return;
10065 }
10066 }
10067 /* It might be that the GetItem call fails even
10068 though the key is present in the dictionary,
10069 namely when this happens during a stack overflow. */
10070 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010072 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010073
Benjamin Peterson29060642009-01-31 22:14:21 +000010074 if (t) {
10075 Py_INCREF(t);
10076 Py_DECREF(*p);
10077 *p = t;
10078 return;
10079 }
Walter Dörwald16807132007-05-25 13:52:07 +000010080
Benjamin Peterson14339b62009-01-31 16:36:08 +000010081 PyThreadState_GET()->recursion_critical = 1;
10082 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10083 PyErr_Clear();
10084 PyThreadState_GET()->recursion_critical = 0;
10085 return;
10086 }
10087 PyThreadState_GET()->recursion_critical = 0;
10088 /* The two references in interned are not counted by refcnt.
10089 The deallocator will take care of this */
10090 Py_REFCNT(s) -= 2;
10091 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010092}
10093
10094void
10095PyUnicode_InternImmortal(PyObject **p)
10096{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010097 PyUnicode_InternInPlace(p);
10098 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10099 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10100 Py_INCREF(*p);
10101 }
Walter Dörwald16807132007-05-25 13:52:07 +000010102}
10103
10104PyObject *
10105PyUnicode_InternFromString(const char *cp)
10106{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010107 PyObject *s = PyUnicode_FromString(cp);
10108 if (s == NULL)
10109 return NULL;
10110 PyUnicode_InternInPlace(&s);
10111 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010112}
10113
10114void _Py_ReleaseInternedUnicodeStrings(void)
10115{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010116 PyObject *keys;
10117 PyUnicodeObject *s;
10118 Py_ssize_t i, n;
10119 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010120
Benjamin Peterson14339b62009-01-31 16:36:08 +000010121 if (interned == NULL || !PyDict_Check(interned))
10122 return;
10123 keys = PyDict_Keys(interned);
10124 if (keys == NULL || !PyList_Check(keys)) {
10125 PyErr_Clear();
10126 return;
10127 }
Walter Dörwald16807132007-05-25 13:52:07 +000010128
Benjamin Peterson14339b62009-01-31 16:36:08 +000010129 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10130 detector, interned unicode strings are not forcibly deallocated;
10131 rather, we give them their stolen references back, and then clear
10132 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010133
Benjamin Peterson14339b62009-01-31 16:36:08 +000010134 n = PyList_GET_SIZE(keys);
10135 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010136 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010137 for (i = 0; i < n; i++) {
10138 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10139 switch (s->state) {
10140 case SSTATE_NOT_INTERNED:
10141 /* XXX Shouldn't happen */
10142 break;
10143 case SSTATE_INTERNED_IMMORTAL:
10144 Py_REFCNT(s) += 1;
10145 immortal_size += s->length;
10146 break;
10147 case SSTATE_INTERNED_MORTAL:
10148 Py_REFCNT(s) += 2;
10149 mortal_size += s->length;
10150 break;
10151 default:
10152 Py_FatalError("Inconsistent interned string state.");
10153 }
10154 s->state = SSTATE_NOT_INTERNED;
10155 }
10156 fprintf(stderr, "total size of all interned strings: "
10157 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10158 "mortal/immortal\n", mortal_size, immortal_size);
10159 Py_DECREF(keys);
10160 PyDict_Clear(interned);
10161 Py_DECREF(interned);
10162 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010163}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010164
10165
/********************* Unicode Iterator **************************/

/* State of an iterator over a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
10173
10174static void
10175unicodeiter_dealloc(unicodeiterobject *it)
10176{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010177 _PyObject_GC_UNTRACK(it);
10178 Py_XDECREF(it->it_seq);
10179 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010180}
10181
10182static int
10183unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10184{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010185 Py_VISIT(it->it_seq);
10186 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010187}
10188
10189static PyObject *
10190unicodeiter_next(unicodeiterobject *it)
10191{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010192 PyUnicodeObject *seq;
10193 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010194
Benjamin Peterson14339b62009-01-31 16:36:08 +000010195 assert(it != NULL);
10196 seq = it->it_seq;
10197 if (seq == NULL)
10198 return NULL;
10199 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010200
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10202 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010204 if (item != NULL)
10205 ++it->it_index;
10206 return item;
10207 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010208
Benjamin Peterson14339b62009-01-31 16:36:08 +000010209 Py_DECREF(seq);
10210 it->it_seq = NULL;
10211 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010212}
10213
10214static PyObject *
10215unicodeiter_len(unicodeiterobject *it)
10216{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010217 Py_ssize_t len = 0;
10218 if (it->it_seq)
10219 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10220 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010221}
10222
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
10230
/* Type object of the iterator returned by unicode_iter(). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
10263
10264static PyObject *
10265unicode_iter(PyObject *seq)
10266{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010267 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010268
Benjamin Peterson14339b62009-01-31 16:36:08 +000010269 if (!PyUnicode_Check(seq)) {
10270 PyErr_BadInternalCall();
10271 return NULL;
10272 }
10273 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10274 if (it == NULL)
10275 return NULL;
10276 it->it_index = 0;
10277 Py_INCREF(seq);
10278 it->it_seq = (PyUnicodeObject *)seq;
10279 _PyObject_GC_TRACK(it);
10280 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010281}
10282
Martin v. Löwis5b222132007-06-10 09:51:05 +000010283size_t
10284Py_UNICODE_strlen(const Py_UNICODE *u)
10285{
10286 int res = 0;
10287 while(*u++)
10288 res++;
10289 return res;
10290}
10291
10292Py_UNICODE*
10293Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10294{
10295 Py_UNICODE *u = s1;
10296 while ((*u++ = *s2++));
10297 return s1;
10298}
10299
10300Py_UNICODE*
10301Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10302{
10303 Py_UNICODE *u = s1;
10304 while ((*u++ = *s2++))
10305 if (n-- == 0)
10306 break;
10307 return s1;
10308}
10309
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010310Py_UNICODE*
10311Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10312{
10313 Py_UNICODE *u1 = s1;
10314 u1 += Py_UNICODE_strlen(u1);
10315 Py_UNICODE_strcpy(u1, s2);
10316 return s1;
10317}
10318
Martin v. Löwis5b222132007-06-10 09:51:05 +000010319int
10320Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10321{
10322 while (*s1 && *s2 && *s1 == *s2)
10323 s1++, s2++;
10324 if (*s1 && *s2)
10325 return (*s1 < *s2) ? -1 : +1;
10326 if (*s1)
10327 return 1;
10328 if (*s2)
10329 return -1;
10330 return 0;
10331}
10332
Victor Stinneref8d95c2010-08-16 22:03:11 +000010333int
10334Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10335{
10336 register Py_UNICODE u1, u2;
10337 for (; n != 0; n--) {
10338 u1 = *s1;
10339 u2 = *s2;
10340 if (u1 != u2)
10341 return (u1 < u2) ? -1 : +1;
10342 if (u1 == '\0')
10343 return 0;
10344 s1++;
10345 s2++;
10346 }
10347 return 0;
10348}
10349
Martin v. Löwis5b222132007-06-10 09:51:05 +000010350Py_UNICODE*
10351Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10352{
10353 const Py_UNICODE *p;
10354 for (p = s; *p; p++)
10355 if (*p == c)
10356 return (Py_UNICODE*)p;
10357 return NULL;
10358}
10359
Victor Stinner331ea922010-08-10 16:37:20 +000010360Py_UNICODE*
10361Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10362{
10363 const Py_UNICODE *p;
10364 p = s + Py_UNICODE_strlen(s);
10365 while (p != s) {
10366 p--;
10367 if (*p == c)
10368 return (Py_UNICODE*)p;
10369 }
10370 return NULL;
10371}
10372
Victor Stinner71133ff2010-09-01 23:43:53 +000010373Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010374PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010375{
10376 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10377 Py_UNICODE *copy;
10378 Py_ssize_t size;
10379
10380 /* Ensure we won't overflow the size. */
10381 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10382 PyErr_NoMemory();
10383 return NULL;
10384 }
10385 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10386 size *= sizeof(Py_UNICODE);
10387 copy = PyMem_Malloc(size);
10388 if (copy == NULL) {
10389 PyErr_NoMemory();
10390 return NULL;
10391 }
10392 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10393 return copy;
10394}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010395
Georg Brandl66c221e2010-10-14 07:04:07 +000010396/* A _string module, to export formatter_parser and formatter_field_name_split
10397 to the string.Formatter class implemented in Python. */
10398
10399static PyMethodDef _string_methods[] = {
10400 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10401 METH_O, PyDoc_STR("split the argument as a field name")},
10402 {"formatter_parser", (PyCFunction) formatter_parser,
10403 METH_O, PyDoc_STR("parse the argument as a format string")},
10404 {NULL, NULL}
10405};
10406
10407static struct PyModuleDef _string_module = {
10408 PyModuleDef_HEAD_INIT,
10409 "_string",
10410 PyDoc_STR("string helper module"),
10411 0,
10412 _string_methods,
10413 NULL,
10414 NULL,
10415 NULL,
10416 NULL
10417};
10418
10419PyMODINIT_FUNC
10420PyInit__string(void)
10421{
10422 return PyModule_Create(&_string_module);
10423}
10424
10425
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010426#ifdef __cplusplus
10427}
10428#endif